sys/kern/vfs_cache.c

   1 /*
   2  * Copyright (c) 2003,2004,2009 The DragonFly Project.  All rights reserved.
   3  *
   4  * This code is derived from software contributed to The DragonFly Project
   5  * by Matthew Dillon <dillon@backplane.com>
   6  *
   7  * Redistribution and use in source and binary forms, with or without
   8  * modification, are permitted provided that the following conditions
   9  * are met:
  10  *
  11  * 1. Redistributions of source code must retain the above copyright
  12  *    notice, this list of conditions and the following disclaimer.
  13  * 2. Redistributions in binary form must reproduce the above copyright
  14  *    notice, this list of conditions and the following disclaimer in
  15  *    the documentation and/or other materials provided with the
  16  *    distribution.
  17  * 3. Neither the name of The DragonFly Project nor the names of its
  18  *    contributors may be used to endorse or promote products derived
  19  *    from this software without specific, prior written permission.
  20  *
  21  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  22  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  23  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
  24  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
  25  * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
  26  * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
  27  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
  28  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
  29  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
  30  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
  31  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  32  * SUCH DAMAGE.
  33  *
  34  * Copyright (c) 1989, 1993, 1995
  35  *      The Regents of the University of California.  All rights reserved.
  36  *
  37  * This code is derived from software contributed to Berkeley by
  38  * Poul-Henning Kamp of the FreeBSD Project.
  39  *
  40  * Redistribution and use in source and binary forms, with or without
  41  * modification, are permitted provided that the following conditions
  42  * are met:
  43  * 1. Redistributions of source code must retain the above copyright
  44  *    notice, this list of conditions and the following disclaimer.
  45  * 2. Redistributions in binary form must reproduce the above copyright
  46  *    notice, this list of conditions and the following disclaimer in the
  47  *    documentation and/or other materials provided with the distribution.
  48  * 3. All advertising materials mentioning features or use of this software
  49  *    must display the following acknowledgement:
  50  *      This product includes software developed by the University of
  51  *      California, Berkeley and its contributors.
  52  * 4. Neither the name of the University nor the names of its contributors
  53  *    may be used to endorse or promote products derived from this software
  54  *    without specific prior written permission.
  55  *
  56  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  57  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  58  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  59  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  60  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  61  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  62  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  63  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  64  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  65  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  66  * SUCH DAMAGE.
  67  */
  68
  69 #include <sys/param.h>
  70 #include <sys/systm.h>
  71 #include <sys/kernel.h>
  72 #include <sys/sysctl.h>
  73 #include <sys/mount.h>
  74 #include <sys/vnode.h>
  75 #include <sys/malloc.h>
  76 #include <sys/sysproto.h>
  77 #include <sys/spinlock.h>
  78 #include <sys/proc.h>
  79 #include <sys/namei.h>
  80 #include <sys/nlookup.h>
  81 #include <sys/filedesc.h>
  82 #include <sys/fnv_hash.h>
  83 #include <sys/globaldata.h>
  84 #include <sys/kern_syscall.h>
  85 #include <sys/dirent.h>
  86 #include <ddb/ddb.h>
  87
  88 #include <sys/sysref2.h>
  89 #include <sys/spinlock2.h>
  90 #include <sys/mplock2.h>
  91
/*
 * Upper bound on recursion depth.
 *
 * NOTE(review): the code that enforces this limit is not visible in this
 * part of the file -- confirm which path consumes it before changing it.
 */
#define MAX_RECURSION_DEPTH     64
  93
  94 /*
  95  * Random lookups in the cache are accomplished with a hash table using
  96  * a hash key of (nc_src_vp, name).  Each hash chain has its own spin lock.
  97  *
 * Negative entries may exist and correspond to resolved namecache
 * structures where nc_vp is NULL.  In a negative entry, NCF_WHITEOUT
 * will be set if the entry corresponds to a whited-out directory entry
 * (versus simply not finding the entry at all).  ncneglist is locked
 * with a global spinlock (ncspin).
 103  *
 104  * MPSAFE RULES:
 105  *
 106  * (1) A ncp must be referenced before it can be locked.
 107  *
 108  * (2) A ncp must be locked in order to modify it.
 109  *
 110  * (3) ncp locks are always ordered child -> parent.  That may seem
 111  *     backwards but forward scans use the hash table and thus can hold
 112  *     the parent unlocked when traversing downward.
 113  *
 114  *     This allows insert/rename/delete/dot-dot and other operations
 115  *     to use ncp->nc_parent links.
 116  *
 117  *     This also prevents a locked up e.g. NFS node from creating a
 118  *     chain reaction all the way back to the root vnode / namecache.
 119  *
 120  * (4) parent linkages require both the parent and child to be locked.
 121  */
 122
/*
 * Structures and constants associated with name caching.
 *
 * NCHHASH() converts a hash value into a pointer to its hash chain head
 * by masking the value with nchash and indexing nchashtbl.
 *
 * NOTE(review): MINNEG and MINPOS are presumably floors used when
 * trimming negative / positive entries -- their users are not visible
 * here, confirm before relying on that.
 */
#define NCHHASH(hash)           (&nchashtbl[(hash) & nchash])
#define MINNEG                  1024
#define MINPOS                  1024
#define NCMOUNT_NUMCACHE        1009    /* prime number */
 130
/* Malloc type used for namecache structures and their name buffers. */
MALLOC_DEFINE(M_VFSCACHE, "vfscache", "VFS name cache entries");

/* List head type for a single hash chain of namecache entries. */
LIST_HEAD(nchash_list, namecache);

/*
 * One hash table bucket: the chain of entries that hash to this slot
 * plus the spinlock protecting that chain (each chain has its own).
 */
struct nchash_head {
       struct nchash_list list;         /* entries on this chain */
       struct spinlock  spin;           /* protects list */
};
 139
/*
 * One slot of the mount point cache (see the ncmount_cache[] array and
 * the ncmount_cache_* sysctls).  A slot pairs a namecache entry with a
 * mount.
 *
 * NOTE(review): the lookup/fill code is not visible in this part of the
 * file; the per-field notes below restate only what the declarations and
 * the existing isneg comment say.
 */
struct ncmount_cache {
        struct spinlock spin;   /* presumably protects this slot -- confirm */
        struct namecache *ncp;  /* cached namecache entry */
        struct mount *mp;       /* mount associated with ncp */
        int isneg;              /* if != 0 mp is originator and not target */
};
 146
/*
 * File-scope namecache state.
 *
 * nchashtbl     - hash table of chains, indexed through NCHHASH().
 * ncneglist     - list of negative entries, locked with ncspin.
 * ncspin        - global spinlock covering ncneglist.
 * ncmount_cache - fixed-size mount point cache (NCMOUNT_NUMCACHE slots).
 */
static struct nchash_head       *nchashtbl;
static struct namecache_list    ncneglist;
static struct spinlock          ncspin;
static struct ncmount_cache     ncmount_cache[NCMOUNT_NUMCACHE];
 151
/*
 * ncvp_debug - debug cache_fromvp().  This is used by the NFS server
 * to create the namecache infrastructure leading to a dangling vnode.
 *
 * 0    Only errors are reported
 * 1    Successes are reported
 * 2    Successes + the whole directory scan is reported
 * 3    Force the directory scan code run as if the parent vnode did not
 *      have a namecache record, even if it does have one.
 */
static int      ncvp_debug;
SYSCTL_INT(_debug, OID_AUTO, ncvp_debug, CTLFLAG_RW, &ncvp_debug, 0,
    "Namecache debug level (0-3)");

/*
 * NOTE(review): NCHHASH() applies this value as a mask ((hash) & nchash),
 * so confirm whether it holds the table size or size - 1.
 */
static u_long   nchash;                 /* size of hash table */
SYSCTL_ULONG(_debug, OID_AUTO, nchash, CTLFLAG_RD, &nchash, 0,
    "Size of namecache hash table");

static int      ncnegfactor = 16;       /* ratio of negative entries */
SYSCTL_INT(_debug, OID_AUTO, ncnegfactor, CTLFLAG_RW, &ncnegfactor, 0,
    "Ratio of namecache negative entries");

/* Passed as the tsleep() timeout by _cache_lock() while waiting on a lock. */
static int      nclockwarn;             /* warn on locked entries in ticks */
SYSCTL_INT(_debug, OID_AUTO, nclockwarn, CTLFLAG_RW, &nclockwarn, 0,
    "Warn on locked namecache entries in ticks");

/*
 * NOTE(review): the description below looks copy-pasted.  Going by the
 * name this presumably counts entries with a deferred zap pending
 * (see NCF_DEFEREDZAP) -- confirm against the code that updates it.
 */
static int      numdefered;             /* presumably deferred-zap count */
SYSCTL_INT(_debug, OID_AUTO, numdefered, CTLFLAG_RD, &numdefered, 0,
    "Number of cache entries allocated");

/*
 * NOTE(review): the description below also looks copy-pasted.  Going by
 * the name this is presumably a limit on positive cache entries --
 * confirm against the code that reads it.
 */
static int      ncposlimit;             /* presumably positive entry limit */
SYSCTL_INT(_debug, OID_AUTO, ncposlimit, CTLFLAG_RW, &ncposlimit, 0,
    "Number of cache entries allocated");

/* Read-only exports of structure sizes. */
SYSCTL_INT(_debug, OID_AUTO, vnsize, CTLFLAG_RD, 0, sizeof(struct vnode),
    "sizeof(struct vnode)");
SYSCTL_INT(_debug, OID_AUTO, ncsize, CTLFLAG_RD, 0, sizeof(struct namecache),
    "sizeof(struct namecache)");

/* Mount point cache enable knob and its hit/miss/overwrite counters. */
static int      ncmount_cache_enable = 1;
SYSCTL_INT(_debug, OID_AUTO, ncmount_cache_enable, CTLFLAG_RW,
           &ncmount_cache_enable, 0, "mount point cache");
static long     ncmount_cache_hit;
SYSCTL_LONG(_debug, OID_AUTO, ncmount_cache_hit, CTLFLAG_RW,
            &ncmount_cache_hit, 0, "mpcache hits");
static long     ncmount_cache_miss;
SYSCTL_LONG(_debug, OID_AUTO, ncmount_cache_miss, CTLFLAG_RW,
            &ncmount_cache_miss, 0, "mpcache misses");
static long     ncmount_cache_overwrite;
SYSCTL_LONG(_debug, OID_AUTO, ncmount_cache_overwrite, CTLFLAG_RW,
            &ncmount_cache_overwrite, 0, "mpcache entry overwrites");
 203
/*
 * Forward declarations for file-local helpers that are used before
 * their definitions.
 */
static int cache_resolve_mp(struct mount *mp);
static struct vnode *cache_dvpref(struct namecache *ncp);
static void _cache_lock(struct namecache *ncp);
static void _cache_setunresolved(struct namecache *ncp);
static void _cache_cleanneg(int count);
static void _cache_cleanpos(int count);
static void _cache_cleandefered(void);
static void _cache_unlink(struct namecache *ncp);
 212
/*
 * The new name cache statistics.  Everything below is exported read-only
 * under the vfs.cache sysctl node.
 */
SYSCTL_NODE(_vfs, OID_AUTO, cache, CTLFLAG_RW, 0, "Name cache statistics");
static int numneg;              /* negative namecache entries */
SYSCTL_INT(_vfs_cache, OID_AUTO, numneg, CTLFLAG_RD, &numneg, 0,
    "Number of negative namecache entries");
static int numcache;            /* namecache entries */
SYSCTL_INT(_vfs_cache, OID_AUTO, numcache, CTLFLAG_RD, &numcache, 0,
    "Number of namecaches entries");
static u_long numcalls;         /* namecache lookups */
SYSCTL_ULONG(_vfs_cache, OID_AUTO, numcalls, CTLFLAG_RD, &numcalls, 0,
    "Number of namecache lookups");
static u_long numchecks;        /* entries checked during lookups */
SYSCTL_ULONG(_vfs_cache, OID_AUTO, numchecks, CTLFLAG_RD, &numchecks, 0,
    "Number of checked entries in namecache lookups");

/* Statistics storage, one slot per possible cpu (SMP_MAXCPU entries). */
struct nchstats nchstats[SMP_MAXCPU];
 231 /*
 232  * Export VFS cache effectiveness statistics to user-land.
 233  *
 234  * The statistics are left for aggregation to user-land so
 235  * neat things can be achieved, like observing per-CPU cache
 236  * distribution.
 237  */
 238 static int
 239 sysctl_nchstats(SYSCTL_HANDLER_ARGS)
 240 {
 241         struct globaldata *gd;
 242         int i, error;
 243
 244         error = 0;
 245         for (i = 0; i < ncpus; ++i) {
 246                 gd = globaldata_find(i);
 247                 if ((error = SYSCTL_OUT(req, (void *)&(*gd->gd_nchstats),
 248                         sizeof(struct nchstats))))
 249                         break;
 250         }
 251
 252         return (error);
 253 }
 254 SYSCTL_PROC(_vfs_cache, OID_AUTO, nchstats, CTLTYPE_OPAQUE|CTLFLAG_RD,
 255   0, 0, sysctl_nchstats, "S,nchstats", "VFS cache effectiveness statistics");
 256
/*
 * Forward declaration.  _cache_drop() calls this with nonblock = 1 and
 * treats a non-NULL return value as another ncp that must be dropped.
 */
static struct namecache *cache_zap(struct namecache *ncp, int nonblock);
 258
/*
 * Namespace locking.  The caller must already hold a reference to the
 * namecache structure in order to lock/unlock it.  This function prevents
 * the namespace from being created or destroyed by accessors other than
 * the lock holder.
 *
 * Note that holding a locked namecache structure prevents other threads
 * from making namespace changes (e.g. deleting or creating), prevents
 * vnode association state changes by other threads, and prevents the
 * namecache entry from being resolved or unresolved by other threads.
 *
 * The lock owner has full authority to associate/disassociate vnodes
 * and resolve/unresolve the locked ncp.
 *
 * The primary lock field is nc_exlocks.  nc_locktd is set after the
 * fact (when locking) or cleared prior to unlocking.
 *
 * The lock is recursive: if the current thread already owns it the
 * count in nc_exlocks is simply bumped.  Otherwise the caller sleeps
 * on the ncp until the owner releases it, printing a one-time
 * diagnostic if the sleep returns EWOULDBLOCK.
 *
 * WARNING!  Holding a locked ncp will prevent a vnode from being destroyed
 *           or recycled, but it does NOT help you if the vnode had already
 *           initiated a recyclement.  If this is important, use cache_get()
 *           rather than cache_lock() (and deal with the differences in the
 *           way the refs counter is handled).  Or, alternatively, make an
 *           unconditional call to cache_validate() or cache_resolve()
 *           after cache_lock() returns.
 *
 * MPSAFE
 */
static
void
_cache_lock(struct namecache *ncp)
{
        thread_t td;
        int didwarn;            /* ticks at first warning, 0 if none yet */
        int error;
        u_int count;

        KKASSERT(ncp->nc_refs != 0);
        didwarn = 0;
        td = curthread;

        for (;;) {
                /* Snapshot the lock word; every update below is a cmpset. */
                count = ncp->nc_exlocks;

                /* Case 1: unlocked, try to become the owner (0 -> 1). */
                if (count == 0) {
                        if (atomic_cmpset_int(&ncp->nc_exlocks, 0, 1)) {
                                /*
                                 * The vp associated with a locked ncp must
                                 * be held to prevent it from being recycled.
                                 *
                                 * WARNING!  If VRECLAIMED is set the vnode
                                 * could already be in the middle of a recycle.
                                 * Callers must use cache_vref() or
                                 * cache_vget() on the locked ncp to
                                 * validate the vp or set the cache entry
                                 * to unresolved.
                                 *
                                 * NOTE! vhold() is allowed if we hold a
                                 *       lock on the ncp (which we do).
                                 */
                                ncp->nc_locktd = td;
                                if (ncp->nc_vp)
                                        vhold(ncp->nc_vp);      /* MPSAFE */
                                break;
                        }
                        /* cmpset failed */
                        continue;
                }

                /* Case 2: we already own it, take a recursive count. */
                if (ncp->nc_locktd == td) {
                        if (atomic_cmpset_int(&ncp->nc_exlocks, count,
                                              count + 1)) {
                                break;
                        }
                        /* cmpset failed */
                        continue;
                }

                /*
                 * Case 3: owned by another thread.  Interlock the sleep
                 * first, then flag NC_EXLOCK_REQ so that _cache_unlock()
                 * issues a wakeup() when the lock is fully released.
                 */
                tsleep_interlock(ncp, 0);
                if (atomic_cmpset_int(&ncp->nc_exlocks, count,
                                      count | NC_EXLOCK_REQ) == 0) {
                        /* cmpset failed */
                        continue;
                }
                /* nclockwarn is passed as the sleep timeout. */
                error = tsleep(ncp, PINTERLOCKED, "clock", nclockwarn);
                if (error == EWOULDBLOCK) {
                        /* Only complain once per call. */
                        if (didwarn == 0) {
                                didwarn = ticks;
                                kprintf("[diagnostic] cache_lock: blocked "
                                        "on %p",
                                        ncp);
                                kprintf(" \"%*.*s\"\n",
                                        ncp->nc_nlen, ncp->nc_nlen,
                                        ncp->nc_name);
                        }
                }
        }
        /* If we complained earlier, report how long we were blocked. */
        if (didwarn) {
                kprintf("[diagnostic] cache_lock: unblocked %*.*s after "
                        "%d secs\n",
                        ncp->nc_nlen, ncp->nc_nlen, ncp->nc_name,
                        (int)(ticks - didwarn) / hz);
        }
}
 360
 361 /*
 362  * NOTE: nc_refs may be zero if the ncp is interlocked by circumstance,
 363  *       such as the case where one of its children is locked.
 364  *
 365  * MPSAFE
 366  */
 367 static
 368 int
 369 _cache_lock_nonblock(struct namecache *ncp)
 370 {
 371         thread_t td;
 372         u_int count;
 373
 374         td = curthread;
 375
 376         for (;;) {
 377                 count = ncp->nc_exlocks;
 378
 379                 if (count == 0) {
 380                         if (atomic_cmpset_int(&ncp->nc_exlocks, 0, 1)) {
 381                                 /*
 382                                  * The vp associated with a locked ncp must
 383                                  * be held to prevent it from being recycled.
 384                                  *
 385                                  * WARNING!  If VRECLAIMED is set the vnode
 386                                  * could already be in the middle of a recycle.
 387                                  * Callers must use cache_vref() or
 388                                  * cache_vget() on the locked ncp to
 389                                  * validate the vp or set the cache entry
 390                                  * to unresolved.
 391                                  *
 392                                  * NOTE! vhold() is allowed if we hold a
 393                                  *       lock on the ncp (which we do).
 394                                  */
 395                                 ncp->nc_locktd = td;
 396                                 if (ncp->nc_vp)
 397                                         vhold(ncp->nc_vp);      /* MPSAFE */
 398                                 break;
 399                         }
 400                         /* cmpset failed */
 401                         continue;
 402                 }
 403                 if (ncp->nc_locktd == td) {
 404                         if (atomic_cmpset_int(&ncp->nc_exlocks, count,
 405                                               count + 1)) {
 406                                 break;
 407                         }
 408                         /* cmpset failed */
 409                         continue;
 410                 }
 411                 return(EWOULDBLOCK);
 412         }
 413         return(0);
 414 }
 415
/*
 * Helper function: release one count of the lock held on ncp by the
 * current thread.  When the last count is released the vnode hold taken
 * at lock time is dropped and any waiter that set NC_EXLOCK_REQ is
 * woken up.
 *
 * NOTE: nc_refs can be 0 (degenerate case during _cache_drop).
 *
 *       nc_locktd must be NULLed out prior to nc_exlocks getting cleared.
 *
 * MPSAFE
 */
static
void
_cache_unlock(struct namecache *ncp)
{
        thread_t td __debugvar = curthread;
        u_int count;

        KKASSERT(ncp->nc_refs >= 0);
        KKASSERT(ncp->nc_exlocks > 0);
        KKASSERT(ncp->nc_locktd == td);

        /*
         * Final release (count is 1 once the request bit is masked off):
         * clear ownership and drop the vnode hold before the lock word
         * itself is cleared below.
         */
        count = ncp->nc_exlocks;
        if ((count & ~NC_EXLOCK_REQ) == 1) {
                ncp->nc_locktd = NULL;
                if (ncp->nc_vp)
                        vdrop(ncp->nc_vp);
        }
        for (;;) {
                if ((count & ~NC_EXLOCK_REQ) == 1) {
                        /* Last count: clear the word, wake any waiter. */
                        if (atomic_cmpset_int(&ncp->nc_exlocks, count, 0)) {
                                if (count & NC_EXLOCK_REQ)
                                        wakeup(ncp);
                                break;
                        }
                } else {
                        /* Recursive count: just decrement. */
                        if (atomic_cmpset_int(&ncp->nc_exlocks, count,
                                              count - 1)) {
                                break;
                        }
                }
                /* cmpset lost a race (e.g. a waiter set the REQ bit), retry */
                count = ncp->nc_exlocks;
        }
}
 458
 459
 460 /*
 461  * cache_hold() and cache_drop() prevent the premature deletion of a
 462  * namecache entry but do not prevent operations (such as zapping) on
 463  * that namecache entry.
 464  *
 465  * This routine may only be called from outside this source module if
 466  * nc_refs is already at least 1.
 467  *
 468  * This is a rare case where callers are allowed to hold a spinlock,
 469  * so we can't ourselves.
 470  *
 471  * MPSAFE
 472  */
 473 static __inline
 474 struct namecache *
 475 _cache_hold(struct namecache *ncp)
 476 {
 477         atomic_add_int(&ncp->nc_refs, 1);
 478         return(ncp);
 479 }
 480
/*
 * Drop a cache entry, taking care to deal with races.
 *
 * For potential 1->0 transitions we must hold the ncp lock to safely
 * test its flags.  An unresolved entry with no children must be zapped
 * to avoid leaks.
 *
 * The call to cache_zap() itself will handle all remaining races and
 * will decrement the ncp's refs regardless.  If we are resolved or
 * have children nc_refs can safely be dropped to 0 without having to
 * zap the entry.
 *
 * NOTE: cache_zap() will re-check nc_refs and nc_list in a MPSAFE fashion.
 *
 * NOTE: cache_zap() may return a non-NULL referenced parent which must
 *       be dropped in a loop.
 *
 * MPSAFE
 */
static __inline
void
_cache_drop(struct namecache *ncp)
{
        int refs;

        /* ncp becomes NULL when cache_zap() has nothing further to drop. */
        while (ncp) {
                KKASSERT(ncp->nc_refs > 0);
                refs = ncp->nc_refs;

                if (refs == 1) {
                        /*
                         * Possible last reference.  Only proceed if the
                         * lock can be had without blocking; otherwise
                         * fall through, pause, and retry.
                         */
                        if (_cache_lock_nonblock(ncp) == 0) {
                                ncp->nc_flag &= ~NCF_DEFEREDZAP;
                                /* Unresolved leaf: hand it to cache_zap(). */
                                if ((ncp->nc_flag & NCF_UNRESOLVED) &&
                                    TAILQ_EMPTY(&ncp->nc_list)) {
                                        ncp = cache_zap(ncp, 1);
                                        continue;
                                }
                                /* Otherwise just drop the last ref (1 -> 0). */
                                if (atomic_cmpset_int(&ncp->nc_refs, 1, 0)) {
                                        _cache_unlock(ncp);
                                        break;
                                }
                                /* Someone gained a ref meanwhile, retry. */
                                _cache_unlock(ncp);
                        }
                } else {
                        /* Not the last reference: plain atomic decrement. */
                        if (atomic_cmpset_int(&ncp->nc_refs, refs, refs - 1))
                                break;
                }
                cpu_pause();
        }
}
 531
 532 /*
 533  * Link a new namecache entry to its parent and to the hash table.  Be
 534  * careful to avoid races if vhold() blocks in the future.
 535  *
 536  * Both ncp and par must be referenced and locked.
 537  *
 538  * NOTE: The hash table spinlock is likely held during this call, we
 539  *       can't do anything fancy.
 540  *
 541  * MPSAFE
 542  */
 543 static void
 544 _cache_link_parent(struct namecache *ncp, struct namecache *par,
 545                    struct nchash_head *nchpp)
 546 {
 547         KKASSERT(ncp->nc_parent == NULL);
 548         ncp->nc_parent = par;
 549         ncp->nc_head = nchpp;
 550
 551         /*
 552          * Set inheritance flags.  Note that the parent flags may be
 553          * stale due to getattr potentially not having been run yet
 554          * (it gets run during nlookup()'s).
 555          */
 556         ncp->nc_flag &= ~(NCF_SF_PNOCACHE | NCF_UF_PCACHE);
 557         if (par->nc_flag & (NCF_SF_NOCACHE | NCF_SF_PNOCACHE))
 558                 ncp->nc_flag |= NCF_SF_PNOCACHE;
 559         if (par->nc_flag & (NCF_UF_CACHE | NCF_UF_PCACHE))
 560                 ncp->nc_flag |= NCF_UF_PCACHE;
 561
 562         LIST_INSERT_HEAD(&nchpp->list, ncp, nc_hash);
 563
 564         if (TAILQ_EMPTY(&par->nc_list)) {
 565                 TAILQ_INSERT_HEAD(&par->nc_list, ncp, nc_entry);
 566                 /*
 567                  * Any vp associated with an ncp which has children must
 568                  * be held to prevent it from being recycled.
 569                  */
 570                 if (par->nc_vp)
 571                         vhold(par->nc_vp);
 572         } else {
 573                 TAILQ_INSERT_HEAD(&par->nc_list, ncp, nc_entry);
 574         }
 575 }
 576
/*
 * Remove the parent and hash associations from a namecache structure.
 * If this is the last child of the parent the cache_drop(par) will
 * attempt to recursively zap the parent.
 *
 * ncp must be locked.  This routine will acquire a temporary lock on
 * the parent as well as the appropriate hash chain.
 *
 * Nothing is done if ncp has no parent.
 *
 * MPSAFE
 */
static void
_cache_unlink_parent(struct namecache *ncp)
{
        struct namecache *par;
        struct vnode *dropvp;   /* parent's vp to vdrop(), if any */

        if ((par = ncp->nc_parent) != NULL) {
                KKASSERT(ncp->nc_parent == par);
                /* Ref then lock the parent (a ncp must be ref'd to lock it). */
                _cache_hold(par);
                _cache_lock(par);
                /* Hash chain and child list are edited under the chain spinlock. */
                spin_lock(&ncp->nc_head->spin);
                LIST_REMOVE(ncp, nc_hash);
                TAILQ_REMOVE(&par->nc_list, ncp, nc_entry);
                dropvp = NULL;
                /* Parent just lost its last child: its vp hold can go. */
                if (par->nc_vp && TAILQ_EMPTY(&par->nc_list))
                        dropvp = par->nc_vp;
                spin_unlock(&ncp->nc_head->spin);
                /* nc_head is only cleared after the spinlock is released. */
                ncp->nc_parent = NULL;
                ncp->nc_head = NULL;
                _cache_unlock(par);
                _cache_drop(par);

                /*
                 * We can only safely vdrop with no spinlocks held.
                 */
                if (dropvp)
                        vdrop(dropvp);
        }
}
 616
 617 /*
 618  * Allocate a new namecache structure.  Most of the code does not require
 619  * zero-termination of the string but it makes vop_compat_ncreate() easier.
 620  *
 621  * MPSAFE
 622  */
 623 static struct namecache *
 624 cache_alloc(int nlen)
 625 {
 626         struct namecache *ncp;
 627
 628         ncp = kmalloc(sizeof(*ncp), M_VFSCACHE, M_WAITOK|M_ZERO);
 629         if (nlen)
 630                 ncp->nc_name = kmalloc(nlen + 1, M_VFSCACHE, M_WAITOK);
 631         ncp->nc_nlen = nlen;
 632         ncp->nc_flag = NCF_UNRESOLVED;
 633         ncp->nc_error = ENOTCONN;       /* needs to be resolved */
 634         ncp->nc_refs = 1;
 635
 636         TAILQ_INIT(&ncp->nc_list);
 637         _cache_lock(ncp);
 638         return(ncp);
 639 }
 640
 641 /*
 642  * Can only be called for the case where the ncp has never been
 643  * associated with anything (so no spinlocks are needed).
 644  *
 645  * MPSAFE
 646  */
 647 static void
 648 _cache_free(struct namecache *ncp)
 649 {
 650         KKASSERT(ncp->nc_refs == 1 && ncp->nc_exlocks == 1);
 651         if (ncp->nc_name)
 652                 kfree(ncp->nc_name, M_VFSCACHE);
 653         kfree(ncp, M_VFSCACHE);
 654 }
 655
 656 /*
 657  * MPSAFE
 658  */
 659 void
 660 cache_zero(struct nchandle *nch)
 661 {
 662         nch->ncp = NULL;
 663         nch->mount = NULL;
 664 }
 665
 666 /*
 667  * Ref and deref a namecache structure.
 668  *
 669  * The caller must specify a stable ncp pointer, typically meaning the
 670  * ncp is already referenced but this can also occur indirectly through
 671  * e.g. holding a lock on a direct child.
 672  *
 673  * WARNING: Caller may hold an unrelated read spinlock, which means we can't
 674  *          use read spinlocks here.
 675  *
 676  * MPSAFE if nch is
 677  */
 678 struct nchandle *
 679 cache_hold(struct nchandle *nch)
 680 {
 681         _cache_hold(nch->ncp);
 682         atomic_add_int(&nch->mount->mnt_refs, 1);
 683         return(nch);
 684 }
 685
 686 /*
 687  * Create a copy of a namecache handle for an already-referenced
 688  * entry.
 689  *
 690  * MPSAFE if nch is
 691  */
 692 void
 693 cache_copy(struct nchandle *nch, struct nchandle *target)
 694 {
 695         *target = *nch;
 696         if (target->ncp)
 697                 _cache_hold(target->ncp);
 698         atomic_add_int(&nch->mount->mnt_refs, 1);
 699 }
 700
 701 /*
 702  * MPSAFE if nch is
 703  */
 704 void
 705 cache_changemount(struct nchandle *nch, struct mount *mp)
 706 {
 707         atomic_add_int(&nch->mount->mnt_refs, -1);
 708         nch->mount = mp;
 709         atomic_add_int(&nch->mount->mnt_refs, 1);
 710 }
 711
 712 /*
 713  * MPSAFE
 714  */
 715 void
 716 cache_drop(struct nchandle *nch)
 717 {
 718         atomic_add_int(&nch->mount->mnt_refs, -1);
 719         _cache_drop(nch->ncp);
 720         nch->ncp = NULL;
 721         nch->mount = NULL;
 722 }
 723
 724 /*
 725  * MPSAFE
 726  */
 727 void
 728 cache_lock(struct nchandle *nch)
 729 {
 730         _cache_lock(nch->ncp);
 731 }
 732
 733 /*
 734  * Relock nch1 given an unlocked nch1 and a locked nch2.  The caller
 735  * is responsible for checking both for validity on return as they
 736  * may have become invalid.
 737  *
 738  * We have to deal with potential deadlocks here, just ping pong
 739  * the lock until we get it (we will always block somewhere when
 740  * looping so this is not cpu-intensive).
 741  *
 742  * which = 0    nch1 not locked, nch2 is locked
 743  * which = 1    nch1 is locked, nch2 is not locked
 744  */
 745 void
 746 cache_relock(struct nchandle *nch1, struct ucred *cred1,
 747              struct nchandle *nch2, struct ucred *cred2)
 748 {
 749         int which;
 750
 751         which = 0;
 752
 753         for (;;) {
 754                 if (which == 0) {
 755                         if (cache_lock_nonblock(nch1) == 0) {
 756                                 cache_resolve(nch1, cred1);
 757                                 break;
 758                         }
 759                         cache_unlock(nch2);
 760                         cache_lock(nch1);
 761                         cache_resolve(nch1, cred1);
 762                         which = 1;
 763                 } else {
 764                         if (cache_lock_nonblock(nch2) == 0) {
 765                                 cache_resolve(nch2, cred2);
 766                                 break;
 767                         }
 768                         cache_unlock(nch1);
 769                         cache_lock(nch2);
 770                         cache_resolve(nch2, cred2);
 771                         which = 0;
 772                 }
 773         }
 774 }
 775
 776 /*
 777  * MPSAFE
 778  */
 779 int
 780 cache_lock_nonblock(struct nchandle *nch)
 781 {
 782         return(_cache_lock_nonblock(nch->ncp));
 783 }
 784
 785
 786 /*
 787  * MPSAFE
 788  */
 789 void
 790 cache_unlock(struct nchandle *nch)
 791 {
 792         _cache_unlock(nch->ncp);
 793 }
 794
 795 /*
 796  * ref-and-lock, unlock-and-deref functions.
 797  *
 798  * This function is primarily used by nlookup.  Even though cache_lock
 799  * holds the vnode, it is possible that the vnode may have already
 800  * initiated a recyclement.
 801  *
 802  * We want cache_get() to return a definitively usable vnode or a
 803  * definitively unresolved ncp.
 804  *
 805  * MPSAFE
 806  */
 807 static
 808 struct namecache *
 809 _cache_get(struct namecache *ncp)
 810 {
 811         _cache_hold(ncp);
 812         _cache_lock(ncp);
 813         if (ncp->nc_vp && (ncp->nc_vp->v_flag & VRECLAIMED))
 814                 _cache_setunresolved(ncp);
 815         return(ncp);
 816 }
 817
 818 /*
 819  * This is a special form of _cache_lock() which only succeeds if
 820  * it can get a pristine, non-recursive lock.  The caller must have
 821  * already ref'd the ncp.
 822  *
 823  * On success the ncp will be locked, on failure it will not.  The
 824  * ref count does not change either way.
 825  *
 826  * We want _cache_lock_special() (on success) to return a definitively
 827  * usable vnode or a definitively unresolved ncp.
 828  *
      * Returns 0 on success and EWOULDBLOCK on failure.
      *
 829  * MPSAFE
 830  */
 831 static int
 832 _cache_lock_special(struct namecache *ncp)
 833 {
 834         if (_cache_lock_nonblock(ncp) == 0) {
                     /*
                      * Accept only a first-level lock: nc_exlocks with the
                      * NC_EXLOCK_REQ bit masked off must be exactly 1.
                      */
 835                 if ((ncp->nc_exlocks & ~NC_EXLOCK_REQ) == 1) {
 836                         if (ncp->nc_vp && (ncp->nc_vp->v_flag & VRECLAIMED))
 837                                 _cache_setunresolved(ncp);
 838                         return(0);
 839                 }
                     /* lock was obtained but is not pristine, back it out */
 840                 _cache_unlock(ncp);
 841         }
 842         return(EWOULDBLOCK);
 843 }
 844
 845
 846 /*
      * Fill in *target as a referenced and locked handle for the entry
      * described by nch: the mount pointer is copied, the ncp is ref'd and
      * locked via _cache_get(), and the mount's mnt_refs count is bumped
      * atomically.  nch->ncp must already have a non-zero reference count
      * (asserted).
      *
 847  * NOTE: The same nchandle can be passed for both arguments.
 848  *
 849  * MPSAFE
 850  */
 851 void
 852 cache_get(struct nchandle *nch, struct nchandle *target)
 853 {
 854         KKASSERT(nch->ncp->nc_refs > 0);
 855         target->mount = nch->mount;
 856         target->ncp = _cache_get(nch->ncp);
 857         atomic_add_int(&target->mount->mnt_refs, 1);
 858 }
 859
 860 /*
      * Unlock ncp and then drop one reference on it, in that order.  This
      * undoes the hold + lock performed by _cache_get().
      *
 861  * MPSAFE
 862  */
 863 static __inline
 864 void
 865 _cache_put(struct namecache *ncp)
 866 {
 867         _cache_unlock(ncp);
 868         _cache_drop(ncp);
 869 }
 870
 871 /*
      * Release a locked, referenced handle: decrement the mount's mnt_refs
      * atomically, unlock and dereference the ncp via _cache_put(), then
      * set both fields of *nch to NULL.
      *
 872  * MPSAFE
 873  */
 874 void
 875 cache_put(struct nchandle *nch)
 876 {
 877         atomic_add_int(&nch->mount->mnt_refs, -1);
 878         _cache_put(nch->ncp);
 879         nch->ncp = NULL;
 880         nch->mount = NULL;
 881 }
 882
 883 /*
 884  * Resolve an unresolved ncp by associating a vnode with it.  If the
 885  * vnode is NULL, a negative cache entry is created.
 886  *
 887  * The ncp should be locked on entry and will remain locked on return.
 888  *
      * mp  - mount used for the "null" fstype v_pfsmp hack (vnode case) and
      *       for VFS_NCPGEN_SET() (negative case); may be NULL.
      * ncp - entry to resolve; must have NCF_UNRESOLVED set (asserted).
      * vp  - vnode to associate, or NULL to create a negative entry.
      *
      * On return NCF_UNRESOLVED and NCF_DEFEREDZAP are cleared and nc_error
      * is 0 (vnode attached) or ENOENT (negative entry).
      *
 889  * MPSAFE
 890  */
 891 static
 892 void
 893 _cache_setvp(struct mount *mp, struct namecache *ncp, struct vnode *vp)
 894 {
 895         KKASSERT(ncp->nc_flag & NCF_UNRESOLVED);
 896
 897         if (vp != NULL) {
 898                 /*
 899                  * Any vp associated with an ncp which has children must
 900                  * be held.  Any vp associated with a locked ncp must be held.
                      *
                      * These are two independent vhold()s; _cache_setunresolved()
                      * issues the matching vdrop()s under the same conditions.
 901                  */
 902                 if (!TAILQ_EMPTY(&ncp->nc_list))
 903                         vhold(vp);
                     /* the vnode's v_namecache list is protected by v_spin */
 904                 spin_lock(&vp->v_spin);
 905                 ncp->nc_vp = vp;
 906                 TAILQ_INSERT_HEAD(&vp->v_namecache, ncp, nc_vnode);
 907                 spin_unlock(&vp->v_spin);
 908                 if (ncp->nc_exlocks)
 909                         vhold(vp);
 910
 911                 /*
 912                  * Set auxiliary flags
 913                  */
 914                 switch(vp->v_type) {
 915                 case VDIR:
 916                         ncp->nc_flag |= NCF_ISDIR;
 917                         break;
 918                 case VLNK:
 919                         ncp->nc_flag |= NCF_ISSYMLINK;
 920                         /* XXX cache the contents of the symlink */
 921                         break;
 922                 default:
 923                         break;
 924                 }
 925                 atomic_add_int(&numcache, 1);
 926                 ncp->nc_error = 0;
 927                 /* XXX: this is a hack to work-around the lack of a real pfs vfs
 928                  * implementation.  The compare length of 5 covers the
                      * terminating NUL, so only the exact fstype "null" matches. */
 929                 if (mp != NULL)
 930                         if (strncmp(mp->mnt_stat.f_fstypename, "null", 5) == 0)
 931                                 vp->v_pfsmp = mp;
 932         } else {
 933                 /*
 934                  * When creating a negative cache hit we set the
 935                  * namecache_gen.  A later resolve will clean out the
 936                  * negative cache hit if the mount point's namecache_gen
 937                  * has changed.  Used by devfs, could also be used by
 938                  * other remote FSs.
                      *
                      * Negative entries are queued on the global ncneglist and
                      * counted in numneg, both under ncspin.
 939                  */
 940                 ncp->nc_vp = NULL;
 941                 spin_lock(&ncspin);
 942                 TAILQ_INSERT_TAIL(&ncneglist, ncp, nc_vnode);
 943                 ++numneg;
 944                 spin_unlock(&ncspin);
 945                 ncp->nc_error = ENOENT;
 946                 if (mp)
 947                         VFS_NCPGEN_SET(mp, ncp);
 948         }
 949         ncp->nc_flag &= ~(NCF_UNRESOLVED | NCF_DEFEREDZAP);
 950 }
 951
 952 /*
      * nchandle wrapper for _cache_setvp(): resolves nch->ncp to vp (or to
      * a negative entry when vp is NULL) using nch->mount as the mount.
      * The requirements of _cache_setvp() apply to nch->ncp.
      *
 953  * MPSAFE
 954  */
 955 void
 956 cache_setvp(struct nchandle *nch, struct vnode *vp)
 957 {
 958         _cache_setvp(nch->mount, nch->ncp, vp);
 959 }
 960
 961 /*
      * Set the expiry of nch->ncp to the current ticks value plus nticks.
      *
      * A stored nc_timeout of 0 means "no timeout" elsewhere in this file
      * (_cache_auto_unresolve() skips entries whose nc_timeout is 0 and
      * _cache_setunresolved() resets it to 0), so a sum that happens to
      * equal 0 is stored as 1 instead.
      *
 962  * MPSAFE
 963  */
 964 void
 965 cache_settimeout(struct nchandle *nch, int nticks)
 966 {
 967         struct namecache *ncp = nch->ncp;
 968
 969         if ((ncp->nc_timeout = ticks + nticks) == 0)
 970                 ncp->nc_timeout = 1;
 971 }
 972
 973 /*
 974  * Disassociate the vnode or negative-cache association and mark a
 975  * namecache entry as unresolved again.  Note that the ncp is still
 976  * left in the hash table and still linked to its parent.
 977  *
 978  * The ncp should be locked and refd on entry and will remain locked and refd
 979  * on return.
 980  *
 981  * This routine is normally never called on a directory containing children.
 982  * However, NFS often does just that in its rename() code as a cop-out to
 983  * avoid complex namespace operations.  This disconnects a directory vnode
 984  * from its namecache and can cause the OLDAPI and NEWAPI to get out of
 985  * sync.
 986  *
      * Nothing is done if the entry is already NCF_UNRESOLVED.  Otherwise
      * nc_timeout is reset to 0, nc_error is set to ENOTCONN, the entry is
      * removed from either the vnode's v_namecache list or the global
      * negative list, and the WHITEOUT/ISDIR/ISSYMLINK flags are cleared.
      *
 987  * MPSAFE
 988  */
 989 static
 990 void
 991 _cache_setunresolved(struct namecache *ncp)
 992 {
 993         struct vnode *vp;
 994
 995         if ((ncp->nc_flag & NCF_UNRESOLVED) == 0) {
 996                 ncp->nc_flag |= NCF_UNRESOLVED;
 997                 ncp->nc_timeout = 0;
 998                 ncp->nc_error = ENOTCONN;
 999                 if ((vp = ncp->nc_vp) != NULL) {
                             /* resolved to a vnode: unhook from v_namecache */
1000                         atomic_add_int(&numcache, -1);
1001                         spin_lock(&vp->v_spin);
1002                         ncp->nc_vp = NULL;
1003                         TAILQ_REMOVE(&vp->v_namecache, ncp, nc_vnode);
1004                         spin_unlock(&vp->v_spin);
1005
1006                         /*
1007                          * Any vp associated with an ncp with children is
1008                          * held by that ncp.  Any vp associated with a locked
1009                          * ncp is held by that ncp.  These conditions must be
1010                          * undone when the vp is cleared out from the ncp.
1011                          */
1012                         if (!TAILQ_EMPTY(&ncp->nc_list))
1013                                 vdrop(vp);
1014                         if (ncp->nc_exlocks)
1015                                 vdrop(vp);
1016                 } else {
                             /* negative entry: unhook from the global ncneglist */
1017                         spin_lock(&ncspin);
1018                         TAILQ_REMOVE(&ncneglist, ncp, nc_vnode);
1019                         --numneg;
1020                         spin_unlock(&ncspin);
1021                 }
1022                 ncp->nc_flag &= ~(NCF_WHITEOUT|NCF_ISDIR|NCF_ISSYMLINK);
1023         }
1024 }
1025
1026 /*
1027  * The cache_nresolve() code calls this function to automatically
1028  * set a resolved cache element to unresolved if it has timed out
1029  * or if it is a negative cache hit and the mount point namecache_gen
1030  * has changed.
1031  *
      * mp  - mount handed to VFS_NCPGEN_TEST() for the negative-entry check.
      * ncp - entry to examine; it is passed to _cache_setunresolved(), so
      *       that routine's locking requirements apply.
      *
1032  * MPSAFE
1033  */
1034 static __inline void
1035 _cache_auto_unresolve(struct mount *mp, struct namecache *ncp)
1036 {
1037         /*
1038          * Already in an unresolved state, nothing to do.
1039          */
1040         if (ncp->nc_flag & NCF_UNRESOLVED)
1041                 return;
1042
1043         /*
1044          * Try to zap entries that have timed out.  We have
1045          * to be careful here because locked leafs may depend
1046          * on the vnode remaining intact in a parent, so only
1047          * do this under very specific conditions.
              *
              * A zero nc_timeout means no timeout is set.  The expiry test
              * uses the signed difference against ticks (presumably so it
              * keeps working when the tick counter wraps - confirm), and
              * only entries without children are zapped.
1048          */
1049         if (ncp->nc_timeout && (int)(ncp->nc_timeout - ticks) < 0 &&
1050             TAILQ_EMPTY(&ncp->nc_list)) {
1051                 _cache_setunresolved(ncp);
1052                 return;
1053         }
1054
1055         /*
1056          * If a resolved negative cache hit is invalid due to
1057          * the mount's namecache generation being bumped, zap it.
1058          */
1059         if (ncp->nc_vp == NULL && VFS_NCPGEN_TEST(mp, ncp)) {
1060                 _cache_setunresolved(ncp);
1061                 return;
1062         }
1063 }
1064
1065 /*
      * nchandle wrapper for _cache_setunresolved(): puts nch->ncp back into
      * the unresolved state.  The locking/ref requirements documented for
      * _cache_setunresolved() apply to nch->ncp.
      *
1066  * MPSAFE
1067  */
1068 void
1069 cache_setunresolved(struct nchandle *nch)
1070 {
1071         _cache_setunresolved(nch->ncp);
1072 }
1073
1074 /*
1075  * Determine if we can clear NCF_ISMOUNTPT by scanning the mountlist
1076  * looking for matches.  This flag tells the lookup code when it must
1077  * check for a mount linkage and also prevents the directories in question
1078  * from being deleted or renamed.
1079  *
      * This is the per-mount callback handed to mountlist_scan() by
      * cache_clrmountpt(); data is the nchandle under test.  It returns 1
      * when mp's mnt_ncmounton or mnt_ncmountpt refers to the same ncp as
      * the handle, and 0 otherwise.
      *
1080  * MPSAFE
1081  */
1082 static
1083 int
1084 cache_clrmountpt_callback(struct mount *mp, void *data)
1085 {
1086         struct nchandle *nch = data;
1087
1088         if (mp->mnt_ncmounton.ncp == nch->ncp)
1089                 return(1);
1090         if (mp->mnt_ncmountpt.ncp == nch->ncp)
1091                 return(1);
1092         return(0);
1093 }
1094
1095 /*
      * Clear NCF_ISMOUNTPT on nch->ncp when a forward scan of the mount
      * list with cache_clrmountpt_callback() yields a count of 0, i.e. no
      * mount reported a match for this ncp.
      *
      * NOTE(review): this relies on mountlist_scan() returning the
      * accumulated callback results - confirm against its definition.
      *
1096  * MPSAFE
1097  */
1098 void
1099 cache_clrmountpt(struct nchandle *nch)
1100 {
1101         int count;
1102
1103         count = mountlist_scan(cache_clrmountpt_callback, nch,
1104                                MNTSCAN_FORWARD|MNTSCAN_NOBUSY);
1105         if (count == 0)
1106                 nch->ncp->nc_flag &= ~NCF_ISMOUNTPT;
1107 }
1108
1109 /*
1110  * Invalidate portions of the namecache topology given a starting entry.
1111  * The passed ncp is set to an unresolved state and:
1112  *
1113  * The passed ncp must be referenced and locked.  The routine may unlock
1114  * and relock ncp several times, and will recheck the children and loop
1115  * to catch races.  When done the passed ncp will be returned with the
1116  * reference and lock intact.
1117  *
1118  * CINV_DESTROY         - Set a flag in the passed ncp entry indicating
1119  *                        that the physical underlying nodes have been
1120  *                        destroyed... as in deleted.  For example, when
1121  *                        a directory is removed.  This will cause record
1122  *                        lookups on the name to no longer be able to find
1123  *                        the record and tells the resolver to return failure
1124  *                        rather than trying to resolve through the parent.
1125  *
1126  *                        The topology itself, including ncp->nc_name,
1127  *                        remains intact.
1128  *
1129  *                        This only applies to the passed ncp, if CINV_CHILDREN
1130  *                        is specified the children are not flagged.
1131  *
1132  * CINV_CHILDREN        - Set all children (recursively) to an unresolved
1133  *                        state as well.
1134  *
1135  *                        Note that this will also have the side effect of
1136  *                        cleaning out any unreferenced nodes in the topology
1137  *                        from the leaves up as the recursion backs out.
1138  *
1139  * Note that the topology for any referenced nodes remains intact, but
1140  * the nodes will be marked as having been destroyed and will be set
1141  * to an unresolved state.
1142  *
1143  * It is possible for cache_inval() to race a cache_resolve(), meaning that
1144  * the namecache entry may not actually be invalidated on return if it was
1145  * revalidated while recursing down into its children.  This code guarantees
1146  * that the node(s) will go through an invalidation cycle, but does not
1147  * guarantee that they will remain in an invalidated state.
1148  *
1149  * Returns non-zero if a revalidation was detected during the invalidation
1150  * recursion, zero otherwise.  Note that since only the original ncp is
1151  * locked the revalidation ultimately can only indicate that the original ncp
1152  * *MIGHT* no have been reresolved.
1153  *
1154  * DEEP RECURSION HANDLING - If a recursive invalidation recurses deeply we
1155  * have to avoid blowing out the kernel stack.  We do this by saving the
1156  * deep namecache node and aborting the recursion, then re-recursing at that
1157  * node using a depth-first algorithm in order to allow multiple deep
1158  * recursions to chain through each other, then we restart the invalidation
1159  * from scratch.
1160  *
1161  * MPSAFE
1162  */
1163
     /*
      * Recursion tracking state shared by _cache_inval() and
      * _cache_inval_internal().
      *
      * resume_ncp - node (with an extra hold) at which the recursion was
      *              cut short because depth exceeded MAX_RECURSION_DEPTH;
      *              NULL when no resume is pending.
      * depth      - current nesting depth of _cache_inval_internal().
      */
1164 struct cinvtrack {
1165         struct namecache *resume_ncp;
1166         int depth;
1167 };
1168
1169 static int _cache_inval_internal(struct namecache *, int, struct cinvtrack *);
1170
     /*
      * Driver for the invalidation recursion.  ncp must be locked on entry
      * (asserted in _cache_inval_internal()) and is locked again on return.
      *
      * Each pass runs _cache_inval_internal() on ncp.  If that pass recorded
      * a resume node because the recursion got too deep, ncp is unlocked,
      * every pending resume node is locked, invalidated with CINV_DESTROY
      * masked off and released with _cache_put() (which also drops the
      * hold taken when the node was recorded), then ncp is relocked and the
      * pass is repeated from scratch.
      *
      * Returns the result of the final _cache_inval_internal() pass on ncp.
      */
1171 static
1172 int
1173 _cache_inval(struct namecache *ncp, int flags)
1174 {
1175         struct cinvtrack track;
1176         struct namecache *ncp2;
1177         int r;
1178
1179         track.depth = 0;
1180         track.resume_ncp = NULL;
1181
1182         for (;;) {
1183                 r = _cache_inval_internal(ncp, flags, &track);
1184                 if (track.resume_ncp == NULL)
1185                         break;
1186                 kprintf("Warning: deep namecache recursion at %s\n",
1187                         ncp->nc_name);
1188                 _cache_unlock(ncp);
                     /*
                      * Processing a resume node may itself record a new, deeper
                      * resume node, so keep going until none is pending.
                      */
1189                 while ((ncp2 = track.resume_ncp) != NULL) {
1190                         track.resume_ncp = NULL;
1191                         _cache_lock(ncp2);
1192                         _cache_inval_internal(ncp2, flags & ~CINV_DESTROY,
1193                                              &track);
1194                         _cache_put(ncp2);
1195                 }
1196                 _cache_lock(ncp);
1197         }
1198         return(r);
1199 }
1200
     /*
      * nchandle wrapper for _cache_inval(): invalidates nch->ncp according
      * to flags (CINV_DESTROY, CINV_CHILDREN) and returns its result.
      */
1201 int
1202 cache_inval(struct nchandle *nch, int flags)
1203 {
1204         return(_cache_inval(nch->ncp, flags));
1205 }
1206
1207 /*
1208  * Helper for _cache_inval().  The passed ncp is refd and locked and
1209  * remains that way on return, but may be unlocked/relocked multiple
1210  * times by the routine.
      *
      * Returns a count that is bumped once when this call aborts the
      * recursion for being too deep, by whatever the recursive calls on
      * the children return, and once if ncp is found resolved again after
      * it has been relocked.
1211  */
1212 static int
1213 _cache_inval_internal(struct namecache *ncp, int flags, struct cinvtrack *track)
1214 {
1215         struct namecache *kid;
1216         struct namecache *nextkid;
1217         int rcnt = 0;
1218
1219         KKASSERT(ncp->nc_exlocks);
1220
1221         _cache_setunresolved(ncp);
1222         if (flags & CINV_DESTROY)
1223                 ncp->nc_flag |= NCF_DESTROYED;
1224         if ((flags & CINV_CHILDREN) &&
1225             (kid = TAILQ_FIRST(&ncp->nc_list)) != NULL
1226         ) {
1227                 _cache_hold(kid);
                     /*
                      * Too deep: record this node (with an extra hold) so that
                      * _cache_inval() can restart from it, and unwind.
                      */
1228                 if (++track->depth > MAX_RECURSION_DEPTH) {
1229                         track->resume_ncp = ncp;
1230                         _cache_hold(ncp);
1231                         ++rcnt;
1232                 }
1233                 _cache_unlock(ncp);
1234                 while (kid) {
                             /* a resume is pending, stop scanning children */
1235                         if (track->resume_ncp) {
1236                                 _cache_drop(kid);
1237                                 break;
1238                         }
                             /* hold the next sibling before kid is released */
1239                         if ((nextkid = TAILQ_NEXT(kid, nc_entry)) != NULL)
1240                                 _cache_hold(nextkid);
                             /* skip kids that are unresolved and have no children */
1241                         if ((kid->nc_flag & NCF_UNRESOLVED) == 0 ||
1242                             TAILQ_FIRST(&kid->nc_list)
1243                         ) {
1244                                 _cache_lock(kid);
1245                                 rcnt += _cache_inval_internal(kid, flags & ~CINV_DESTROY, track);
1246                                 _cache_unlock(kid);
1247                         }
1248                         _cache_drop(kid);
1249                         kid = nextkid;
1250                 }
1251                 --track->depth;
1252                 _cache_lock(ncp);
1253         }
1254
1255         /*
1256          * Someone could have gotten in there while ncp was unlocked,
1257          * retry if so.
1258          */
1259         if ((ncp->nc_flag & NCF_UNRESOLVED) == 0)
1260                 ++rcnt;
1261         return (rcnt);
1262 }
1263
1264 /*
1265  * Invalidate a vnode's namecache associations.  To avoid races against
1266  * the resolver we do not invalidate a node which we previously invalidated
1267  * but which was then re-resolved while we were in the invalidation loop.
1268  *
1269  * Returns non-zero if any namecache entries remain after the invalidation
1270  * loop completed.
1271  *
1272  * NOTE: Unlike the namecache topology which guarantees that ncp's will not
1273  *       be ripped out of the topology while held, the vnode's v_namecache
1274  *       list has no such restriction.  NCP's can be ripped out of the list
1275  *       at virtually any time if not locked, even if held.
1276  *
1277  *       In addition, the v_namecache list itself must be locked via
1278  *       the vnode's spinlock.
1279  *
      * flags is passed through to _cache_inval() for every entry.  Whenever
      * an entry turns out to no longer belong to vp (race-A for the entry
      * just locked, race-B for the pre-held next entry) the references are
      * released and the scan restarts from the head of the list.
      *
1280  * MPSAFE
1281  */
1282 int
1283 cache_inval_vp(struct vnode *vp, int flags)
1284 {
1285         struct namecache *ncp;
1286         struct namecache *next;
1287
1288 restart:
1289         spin_lock(&vp->v_spin);
1290         ncp = TAILQ_FIRST(&vp->v_namecache);
1291         if (ncp)
1292                 _cache_hold(ncp);
1293         while (ncp) {
1294                 /* loop entered with ncp held and vp spin-locked */
1295                 if ((next = TAILQ_NEXT(ncp, nc_vnode)) != NULL)
1296                         _cache_hold(next);
                     /* the spinlock is released before the blocking lock */
1297                 spin_unlock(&vp->v_spin);
1298                 _cache_lock(ncp);
1299                 if (ncp->nc_vp != vp) {
1300                         kprintf("Warning: cache_inval_vp: race-A detected on "
1301                                 "%s\n", ncp->nc_name);
1302                         _cache_put(ncp);
1303                         if (next)
1304                                 _cache_drop(next);
1305                         goto restart;
1306                 }
1307                 _cache_inval(ncp, flags);
1308                 _cache_put(ncp);                /* also releases reference */
1309                 ncp = next;
1310                 spin_lock(&vp->v_spin);
1311                 if (ncp && ncp->nc_vp != vp) {
1312                         spin_unlock(&vp->v_spin);
1313                         kprintf("Warning: cache_inval_vp: race-B detected on "
1314                                 "%s\n", ncp->nc_name);
1315                         _cache_drop(ncp);
1316                         goto restart;
1317                 }
1318         }
1319         spin_unlock(&vp->v_spin);
1320         return(TAILQ_FIRST(&vp->v_namecache) != NULL);
1321 }
1322
1323 /*
1324  * This routine is used instead of the normal cache_inval_vp() when we
1325  * are trying to recycle otherwise good vnodes.
1326  *
1327  * Return 0 on success, non-zero if not all namecache records could be
1328  * disassociated from the vnode (for various reasons).
1329  *
      * Unlike cache_inval_vp() this variant never blocks on an ncp lock and
      * never restarts: if an entry cannot be locked immediately, or a race
      * is detected, the held references are released and the scan stops.
      * Entries are invalidated with flags of 0.
      *
1330  * MPSAFE
1331  */
1332 int
1333 cache_inval_vp_nonblock(struct vnode *vp)
1334 {
1335         struct namecache *ncp;
1336         struct namecache *next;
1337
1338         spin_lock(&vp->v_spin);
1339         ncp = TAILQ_FIRST(&vp->v_namecache);
1340         if (ncp)
1341                 _cache_hold(ncp);
1342         while (ncp) {
1343                 /* loop entered with ncp held */
1344                 if ((next = TAILQ_NEXT(ncp, nc_vnode)) != NULL)
1345                         _cache_hold(next);
1346                 spin_unlock(&vp->v_spin);
                     /* lock is contended: give up rather than wait */
1347                 if (_cache_lock_nonblock(ncp)) {
1348                         _cache_drop(ncp);
1349                         if (next)
1350                                 _cache_drop(next);
1351                         goto done;
1352                 }
1353                 if (ncp->nc_vp != vp) {
1354                         kprintf("Warning: cache_inval_vp_nonblock: race-A "
1355                                 "detected on %s\n", ncp->nc_name);
1356                         _cache_put(ncp);
1357                         if (next)
1358                                 _cache_drop(next);
1359                         goto done;
1360                 }
1361                 _cache_inval(ncp, 0);
1362                 _cache_put(ncp);                /* also releases reference */
1363                 ncp = next;
1364                 spin_lock(&vp->v_spin);
1365                 if (ncp && ncp->nc_vp != vp) {
1366                         spin_unlock(&vp->v_spin);
1367                         kprintf("Warning: cache_inval_vp_nonblock: race-B "
1368                                 "detected on %s\n", ncp->nc_name);
1369                         _cache_drop(ncp);
1370                         goto done;
1371                 }
1372         }
1373         spin_unlock(&vp->v_spin);
1374 done:
1375         return(TAILQ_FIRST(&vp->v_namecache) != NULL);
1376 }
1377
1378 /*
1379  * The source ncp has been renamed to the target ncp.  Both fncp and tncp
1380  * must be locked.  The target ncp is destroyed (as a normal rename-over
1381  * would destroy the target file or directory).
1382  *
1383  * Because there may be references to the source ncp we cannot copy its
1384  * contents to the target.  Instead the source ncp is relinked as the target
1385  * and the target ncp is removed from the namecache topology.
1386  *
      * fnch - handle of the entry being renamed (source).
      * tnch - handle of the entry being renamed over (target); its name and
      *        parent become the source entry's new name and parent.
      *
1387  * MPSAFE
1388  */
1389 void
1390 cache_rename(struct nchandle *fnch, struct nchandle *tnch)
1391 {
1392         struct namecache *fncp = fnch->ncp;
1393         struct namecache *tncp = tnch->ncp;
1394         struct namecache *tncp_par;
1395         struct nchash_head *nchpp;
1396         u_int32_t hash;
1397         char *oname;
1398         char *nname;
1399
             /*
              * Make a private, NUL-terminated copy of the target's name for
              * fncp to own (NULL when the target name is empty).
              */
1400         if (tncp->nc_nlen) {
1401                 nname = kmalloc(tncp->nc_nlen + 1, M_VFSCACHE, M_WAITOK);
1402                 bcopy(tncp->nc_name, nname, tncp->nc_nlen);
1403                 nname[tncp->nc_nlen] = 0;
1404         } else {
1405                 nname = NULL;
1406         }
1407
1408         /*
1409          * Rename fncp (unlink)
1410          */
1411         _cache_unlink_parent(fncp);
1412         oname = fncp->nc_name;
1413         fncp->nc_name = nname;
1414         fncp->nc_nlen = tncp->nc_nlen;
1415         if (oname)
1416                 kfree(oname, M_VFSCACHE);
1417
             /* ref and lock the target's parent while fncp is linked under it */
1418         tncp_par = tncp->nc_parent;
1419         _cache_hold(tncp_par);
1420         _cache_lock(tncp_par);
1421
1422         /*
1423          * Rename fncp (relink)
              *
              * The hash covers the new name followed by the parent pointer
              * value; the link is made under the hash chain's spinlock.
1424          */
1425         hash = fnv_32_buf(fncp->nc_name, fncp->nc_nlen, FNV1_32_INIT);
1426         hash = fnv_32_buf(&tncp_par, sizeof(tncp_par), hash);
1427         nchpp = NCHHASH(hash);
1428
1429         spin_lock(&nchpp->spin);
1430         _cache_link_parent(fncp, tncp_par, nchpp);
1431         spin_unlock(&nchpp->spin);
1432
1433         _cache_put(tncp_par);
1434
1435         /*
1436          * Get rid of the overwritten tncp (unlink)
1437          */
1438         _cache_unlink(tncp);
1439 }
1440
1441 /*
1442  * Perform actions consistent with unlinking a file.  The passed-in ncp
1443  * must be locked.
1444  *
1445  * The ncp is marked DESTROYED so it no longer shows up in searches,
1446  * and will be physically deleted when the vnode goes away.
1447  *
1448  * If the related vnode has no refs then we cycle it through vget()/vput()
1449  * to (possibly if we don't have a ref race) trigger a deactivation,
1450  * allowing the VFS to trivially detect and recycle the deleted vnode
1451  * via VOP_INACTIVE().
1452  *
1453  * NOTE: cache_rename() will automatically call _cache_unlink() on the
1454  *       target ncp.
      *
      * cache_unlink() is the nchandle entry point; it forwards nch->ncp to
      * _cache_unlink(), which does the work.
1455  */
1456 void
1457 cache_unlink(struct nchandle *nch)
1458 {
1459         _cache_unlink(nch->ncp);
1460 }
1461
     /*
      * Worker for cache_unlink() and cache_rename().  Flags ncp as
      * NCF_DESTROYED and, when the entry is resolved to a vnode whose
      * sysref is not active, cycles that vnode through
      * vget(LK_SHARED)/vput().  A vget() failure is ignored.
      */
1462 static void
1463 _cache_unlink(struct namecache *ncp)
1464 {
1465         struct vnode *vp;
1466
1467         /*
1468          * Causes lookups to fail and allows another ncp with the same
1469          * name to be created under ncp->nc_parent.
1470          */
1471         ncp->nc_flag |= NCF_DESTROYED;
1472
1473         /*
1474          * Attempt to trigger a deactivation.
1475          */
1476         if ((ncp->nc_flag & NCF_UNRESOLVED) == 0 &&
1477             (vp = ncp->nc_vp) != NULL &&
1478             !sysref_isactive(&vp->v_sysref)) {
1479                 if (vget(vp, LK_SHARED) == 0)
1480                         vput(vp);
1481         }
1482 }
1483
1484 /*
1485  * vget the vnode associated with the namecache entry.  Resolve the namecache
1486  * entry if necessary.  The passed ncp must be referenced and locked.
1487  *
1488  * lk_type may be LK_SHARED, LK_EXCLUSIVE.  A ref'd, possibly locked
1489  * (depending on the passed lk_type) will be returned in *vpp with an error
1490  * of 0, or NULL will be returned in *vpp with a non-0 error code.  The
1491  * most typical error is ENOENT, meaning that the ncp represents a negative
1492  * cache hit and there is no vnode to retrieve, but other errors can occur
1493  * too.
1494  *
1495  * The vget() can race a reclaim.  If this occurs we re-resolve the
1496  * namecache entry.
1497  *
1498  * There are numerous places in the kernel where vget() is called on a
1499  * vnode while one or more of its namecache entries is locked.  Releasing
1500  * a vnode never deadlocks against locked namecache entries (the vnode
1501  * will not get recycled while referenced ncp's exist).  This means we
1502  * can safely acquire the vnode.  In fact, we MUST NOT release the ncp
1503  * lock when acquiring the vp lock or we might cause a deadlock.
1504  *
      * nch  - handle whose ncp must be locked by the calling thread
      *        (asserted via nc_locktd == curthread).
      * cred - credentials passed to cache_resolve() if resolution is needed.
      *
1505  * MPSAFE
1506  */
1507 int
1508 cache_vget(struct nchandle *nch, struct ucred *cred,
1509            int lk_type, struct vnode **vpp)
1510 {
1511         struct namecache *ncp;
1512         struct vnode *vp;
1513         int error;
1514
1515         ncp = nch->ncp;
1516         KKASSERT(ncp->nc_locktd == curthread);
1517 again:
1518         vp = NULL;
1519         if (ncp->nc_flag & NCF_UNRESOLVED)
1520                 error = cache_resolve(nch, cred);
1521         else
1522                 error = 0;
1523
1524         if (error == 0 && (vp = ncp->nc_vp) != NULL) {
1525                 error = vget(vp, lk_type);
1526                 if (error) {
1527                         /*
1528                          * VRECLAIM race
                              *
                              * Unresolve the entry and start over so that it
                              * gets re-resolved.
1529                          */
1530                         if (error == ENOENT) {
1531                                 kprintf("Warning: vnode reclaim race detected "
1532                                         "in cache_vget on %p (%s)\n",
1533                                         vp, ncp->nc_name);
1534                                 _cache_setunresolved(ncp);
1535                                 goto again;
1536                         }
1537
1538                         /*
1539                          * Not a reclaim race, some other error.
1540                          */
1541                         KKASSERT(ncp->nc_vp == vp);
1542                         vp = NULL;
1543                 } else {
1544                         KKASSERT(ncp->nc_vp == vp);
1545                         KKASSERT((vp->v_flag & VRECLAIMED) == 0);
1546                 }
1547         }
             /* resolved without error but no vnode: negative cache hit */
1548         if (error == 0 && vp == NULL)
1549                 error = ENOENT;
1550         *vpp = vp;
1551         return(error);
1552 }
1553
     /*
      * Like cache_vget(), but the vnode is returned referenced and
      * unlocked: it is acquired with vget(vp, LK_SHARED) and vn_unlock()ed
      * before returning.
      *
      * The ncp in nch must be locked by the calling thread (asserted); it
      * is resolved with cred first if it is unresolved.  On success 0 is
      * returned with the vnode in *vpp.  Otherwise *vpp is NULL and an
      * error is returned, ENOENT when the entry resolves to no vnode.
      *
      * A vget() failure of ENOENT is treated as a reclaim race: the entry
      * is set unresolved and the whole sequence is retried.
      */
1554 int
1555 cache_vref(struct nchandle *nch, struct ucred *cred, struct vnode **vpp)
1556 {
1557         struct namecache *ncp;
1558         struct vnode *vp;
1559         int error;
1560
1561         ncp = nch->ncp;
1562         KKASSERT(ncp->nc_locktd == curthread);
1563 again:
1564         vp = NULL;
1565         if (ncp->nc_flag & NCF_UNRESOLVED)
1566                 error = cache_resolve(nch, cred);
1567         else
1568                 error = 0;
1569
1570         if (error == 0 && (vp = ncp->nc_vp) != NULL) {
1571                 error = vget(vp, LK_SHARED);
1572                 if (error) {
1573                         /*
1574                          * VRECLAIM race
1575                          */
1576                         if (error == ENOENT) {
1577                                 kprintf("Warning: vnode reclaim race detected "
1578                                         "in cache_vref on %p (%s)\n",
1579                                         vp, ncp->nc_name);
1580                                 _cache_setunresolved(ncp);
1581                                 goto again;
1582                         }
1583
1584                         /*
1585                          * Not a reclaim race, some other error.
1586                          */
1587                         KKASSERT(ncp->nc_vp == vp);
1588                         vp = NULL;
1589                 } else {
1590                         KKASSERT(ncp->nc_vp == vp);
1591                         KKASSERT((vp->v_flag & VRECLAIMED) == 0);
1592                         /* caller does not want a lock */
1593                         vn_unlock(vp);
1594                 }
1595         }
1596         if (error == 0 && vp == NULL)
1597                 error = ENOENT;
1598         *vpp = vp;
1599         return(error);
1600 }
1601
1602 /*
1603  * Return a referenced vnode representing the parent directory of
1604  * ncp.
1605  *
1606  * Because the caller has locked the ncp it should not be possible for
1607  * the parent ncp to go away.  However, the parent can unresolve its
1608  * dvp at any time so we must be able to acquire a lock on the parent
1609  * to safely access nc_vp.
1610  *
1611  * We have to leave par unlocked when vget()ing dvp to avoid a deadlock,
1612  * so use vhold()/vdrop() while holding the lock to prevent dvp from
1613  * getting destroyed.
1614  *
      * Returns NULL when ncp has no parent, when the parent is unresolved
      * or has no vnode, or when the vget() of the parent's vnode fails.
      *
1615  * MPSAFE - Note vhold() is allowed when dvp has 0 refs if we hold a
1616  *          lock on the ncp in question..
1617  */
1618 static struct vnode *
1619 cache_dvpref(struct namecache *ncp)
1620 {
1621         struct namecache *par;
1622         struct vnode *dvp;
1623
1624         dvp = NULL;
1625         if ((par = ncp->nc_parent) != NULL) {
1626                 _cache_hold(par);
1627                 _cache_lock(par);
1628                 if ((par->nc_flag & NCF_UNRESOLVED) == 0) {
1629                         if ((dvp = par->nc_vp) != NULL)
1630                                 vhold(dvp);
1631                 }
1632                 _cache_unlock(par);
                     /* par is unlocked here; the vhold() above still pins dvp */
1633                 if (dvp) {
1634                         if (vget(dvp, LK_SHARED) == 0) {
1635                                 vn_unlock(dvp);
1636                                 vdrop(dvp);
1637                                 /* return refd, unlocked dvp */
1638                         } else {
1639                                 vdrop(dvp);
1640                                 dvp = NULL;
1641                         }
1642                 }
1643                 _cache_drop(par);
1644         }
1645         return(dvp);
1646 }
1647
1648 /*
1649  * Convert a directory vnode to a namecache record without any other
1650  * knowledge of the topology.  This ONLY works with directory vnodes and
1651  * is ONLY used by the NFS server.  dvp must be refd but unlocked, and the
1652  * returned ncp (if not NULL) will be held and unlocked.
1653  *
1654  * If 'makeit' is 0 and dvp has no existing namecache record, NULL is returned.
1655  * If 'makeit' is 1 we attempt to track-down and create the namecache topology
1656  * for dvp.  This will fail only if the directory has been deleted out from
1657  * under the caller.
1658  *
1659  * Callers must always check for a NULL return no matter the value of 'makeit'.
1660  *
1661  * To avoid underflowing the kernel stack each recursive call increments
1662  * the makeit variable.
      * Once makeit exceeds 20 the function stops recursing and uses the
      * looping cache_fromdvp_try() helper instead.
      *
      * The ncp is handed back through nch->ncp (nch->mount is loaded from
      * dvp->v_mount on entry).  The function itself returns 0 when nch->ncp
      * is non-NULL on exit and EINVAL when it is NULL.
1663  */
1664
1665 static int cache_inefficient_scan(struct nchandle *nch, struct ucred *cred,
1666                                   struct vnode *dvp, char *fakename);
1667 static int cache_fromdvp_try(struct vnode *dvp, struct ucred *cred,
1668                                   struct vnode **saved_dvp);
1669
1670 int
1671 cache_fromdvp(struct vnode *dvp, struct ucred *cred, int makeit,
1672               struct nchandle *nch)
1673 {
1674         struct vnode *saved_dvp;
1675         struct vnode *pvp;
1676         char *fakename;
1677         int error;
1678
1679         nch->ncp = NULL;
1680         nch->mount = dvp->v_mount;
1681         saved_dvp = NULL;
1682         fakename = NULL;
1683
1684         /*
1685          * Handle the makeit == 0 degenerate case: just pick up (and hold)
              * the first ncp already attached to dvp, if there is one.  The
              * loop below does not run in this case.
1686          */
1687         if (makeit == 0) {
1688                 spin_lock(&dvp->v_spin);
1689                 nch->ncp = TAILQ_FIRST(&dvp->v_namecache);
1690                 if (nch->ncp)
1691                         cache_hold(nch);
1692                 spin_unlock(&dvp->v_spin);
1693         }
1694
1695         /*
1696          * Loop until resolution, inside code will break out on error.
1697          */
1698         while (makeit) {
1699                 /*
1700                  * Break out if we successfully acquire a working ncp.
1701                  */
1702                 spin_lock(&dvp->v_spin);
1703                 nch->ncp = TAILQ_FIRST(&dvp->v_namecache);
1704                 if (nch->ncp) {
1705                         cache_hold(nch);
1706                         spin_unlock(&dvp->v_spin);
1707                         break;
1708                 }
1709                 spin_unlock(&dvp->v_spin);
1710
1711                 /*
1712                  * If dvp is the root of its filesystem it should already
1713                  * have a namecache pointer associated with it as a side
1714                  * effect of the mount, but it may have been disassociated.
1715                  */
1716                 if (dvp->v_flag & VROOT) {
                             /*
                              * nch->ncp is only borrowed for the resolve and
                              * is released again by _cache_put().  On success
                              * the top of the loop reloads it from dvp, on
                              * failure it is reset to NULL below.
                              */
1717                         nch->ncp = _cache_get(nch->mount->mnt_ncmountpt.ncp);
1718                         error = cache_resolve_mp(nch->mount);
1719                         _cache_put(nch->ncp);
1720                         if (ncvp_debug) {
1721                                 kprintf("cache_fromdvp: resolve root of mount %p error %d",
1722                                         dvp->v_mount, error);
1723                         }
1724                         if (error) {
1725                                 if (ncvp_debug)
1726                                         kprintf(" failed\n");
1727                                 nch->ncp = NULL;
1728                                 break;
1729                         }
1730                         if (ncvp_debug)
1731                                 kprintf(" succeeded\n");
1732                         continue;
1733                 }
1734
1735                 /*
1736                  * If we are recursed too deeply resort to an O(n^2)
1737                  * algorithm to resolve the namecache topology.  The
1738                  * resolved pvp is left referenced in saved_dvp to
1739                  * prevent the tree from being destroyed while we loop.
1740                  */
1741                 if (makeit > 20) {
1742                         error = cache_fromdvp_try(dvp, cred, &saved_dvp);
1743                         if (error) {
1744                                 kprintf("lookupdotdot(longpath) failed %d "
1745                                        "dvp %p\n", error, dvp);
1746                                 nch->ncp = NULL;
1747                                 break;
1748                         }
1749                         continue;
1750                 }
1751
1752                 /*
1753                  * Get the parent directory and resolve its ncp.
                      * Any fakename left over from a previous pass is freed
                      * before the lookup can hand back a new one.
1754                  */
1755                 if (fakename) {
1756                         kfree(fakename, M_TEMP);
1757                         fakename = NULL;
1758                 }
1759                 error = vop_nlookupdotdot(*dvp->v_ops, dvp, &pvp, cred,
1760                                           &fakename);
1761                 if (error) {
1762                         kprintf("lookupdotdot failed %d dvp %p\n", error, dvp);
                             /* nch->ncp is still NULL here, so EINVAL is returned */
1763                         break;
1764                 }
                     /*
                      * NOTE(review): pvp is presumably returned locked and
                      * referenced; it is unlocked here and the reference is
                      * released by the vrele() after the recursion - confirm.
                      */
1765                 vn_unlock(pvp);
1766
1767                 /*
1768                  * Reuse makeit as a recursion depth counter.  On success
1769                  * nch will be fully referenced.
1770                  */
1771                 cache_fromdvp(pvp, cred, makeit + 1, nch);
1772                 vrele(pvp);
1773                 if (nch->ncp == NULL)
1774                         break;
1775
1776                 /*
1777                  * Do an inefficient scan of pvp (embodied by ncp) to look
1778                  * for dvp.  This will create a namecache record for dvp on
1779                  * success.  We loop up to recheck on success.
1780                  *
1781                  * ncp and dvp are both held but not locked.
1782                  */
1783                 error = cache_inefficient_scan(nch, cred, dvp, fakename);
1784                 if (error) {
1785                         kprintf("cache_fromdvp: scan %p (%s) failed on dvp=%p\n",
1786                                 pvp, nch->ncp->nc_name, dvp);
1787                         cache_drop(nch);
1788                         /* nch was NULLed out, reload mount */
1789                         nch->mount = dvp->v_mount;
1790                         break;
1791                 }
1792                 if (ncvp_debug) {
1793                         kprintf("cache_fromdvp: scan %p (%s) succeeded\n",
1794                                 pvp, nch->ncp->nc_name);
1795                 }
                     /*
                      * Drop the parent's nch; the next pass picks up dvp's own
                      * ncp at the top of the loop.
                      */
1796                 cache_drop(nch);
1797                 /* nch was NULLed out, reload mount */
1798                 nch->mount = dvp->v_mount;
1799         }
1800
1801         /*
1802          * If nch->ncp is non-NULL it will have been held already.
              * saved_dvp is only ever set by cache_fromdvp_try() above.
1803          */
1804         if (fakename)
1805                 kfree(fakename, M_TEMP);
1806         if (saved_dvp)
1807                 vrele(saved_dvp);
1808         if (nch->ncp)
1809                 return (0);
1810         return (EINVAL);
1811 }
1812
1813 /*
1814  * Go up the chain of parent directories until we find something
1815  * we can resolve into the namecache.  This is very inefficient.
      *
      * The function takes its own reference on dvp.  When the walk finds a
      * usable parent ncp, the last directory visited is left referenced in
      * *saved_dvp (a vnode previously stored there is released first), even
      * if the following scan fails.
      *
      * Returns 0 or the error from vop_nlookupdotdot(), cache_resolve_mp()
      * or cache_inefficient_scan().
1816  */
1817 static
1818 int
1819 cache_fromdvp_try(struct vnode *dvp, struct ucred *cred,
1820                   struct vnode **saved_dvp)
1821 {
1822         struct nchandle nch;
1823         struct vnode *pvp;
1824         int error;
1825         static time_t last_fromdvp_report;
1826         char *fakename;
1827
1828         /*
1829          * Loop getting the parent directory vnode until we get something we
1830          * can resolve in the namecache.
              *
              * The reference taken on dvp here is released on error or when
              * stepping up to the parent, otherwise it ends up in *saved_dvp.
1831          */
1832         vref(dvp);
1833         nch.mount = dvp->v_mount;
1834         nch.ncp = NULL;
1835         fakename = NULL;
1836
1837         for (;;) {
1838                 if (fakename) {
1839                         kfree(fakename, M_TEMP);
1840                         fakename = NULL;
1841                 }
1842                 error = vop_nlookupdotdot(*dvp->v_ops, dvp, &pvp, cred,
1843                                           &fakename);
1844                 if (error) {
1845                         vrele(dvp);
1846                         break;
1847                 }
1848                 vn_unlock(pvp);
                     /*
                      * The parent already has a namecache record: hold it while
                      * the vnode spinlock is held and stop walking.
                      */
1849                 spin_lock(&pvp->v_spin);
1850                 if ((nch.ncp = TAILQ_FIRST(&pvp->v_namecache)) != NULL) {
1851                         _cache_hold(nch.ncp);
1852                         spin_unlock(&pvp->v_spin);
1853                         vrele(pvp);
1854                         break;
1855                 }
1856                 spin_unlock(&pvp->v_spin);
                     /*
                      * The parent is a filesystem root without a record: try to
                      * resolve the mount point.  On success nch.ncp is unlocked
                      * but not dropped, since the scan below is handed a held,
                      * unlocked ncp.
                      */
1857                 if (pvp->v_flag & VROOT) {
1858                         nch.ncp = _cache_get(pvp->v_mount->mnt_ncmountpt.ncp);
1859                         error = cache_resolve_mp(nch.mount);
1860                         _cache_unlock(nch.ncp);
1861                         vrele(pvp);
1862                         if (error) {
1863                                 _cache_drop(nch.ncp);
1864                                 nch.ncp = NULL;
1865                                 vrele(dvp);
1866                         }
1867                         break;
1868                 }
                     /*
                      * Nothing usable yet, step up one level.  pvp is not
                      * released on this path, it becomes the new dvp.
                      */
1869                 vrele(dvp);
1870                 dvp = pvp;
1871         }
1872         if (error == 0) {
                     /* the warning is printed only when time_second has changed */
1873                 if (last_fromdvp_report != time_second) {
1874                         last_fromdvp_report = time_second;
1875                         kprintf("Warning: extremely inefficient path "
1876                                 "resolution on %s\n",
1877                                 nch.ncp->nc_name);
1878                 }
1879                 error = cache_inefficient_scan(&nch, cred, dvp, fakename);
1880
1881                 /*
1882                  * Hopefully dvp now has a namecache record associated with
1883                  * it.  Leave it referenced to prevent the kernel from
1884                  * recycling the vnode.  Otherwise extremely long directory
1885                  * paths could result in endless recycling.
1886                  */
1887                 if (*saved_dvp)
1888                     vrele(*saved_dvp);
1889                 *saved_dvp = dvp;
1890                 _cache_drop(nch.ncp);
1891         }
1892         if (fakename)
1893                 kfree(fakename, M_TEMP);
1894         return (error);
1895 }
1896
1897 /*
1898  * Do an inefficient scan of the directory represented by ncp looking for
1899  * the directory vnode dvp.  ncp must be held but not locked on entry and
1900  * will be held on return.  dvp must be refd but not locked on entry and
1901  * will remain refd on return.
1902  *
1903  * Why do this at all?  Well, due to its stateless nature the NFS server
1904  * converts file handles directly to vnodes without necessarily going through
1905  * the namecache ops that would otherwise create the namecache topology
1906  * leading to the vnode.  We could either (1) Change the namecache algorithms
1907  * to allow disconnected namecache records that are re-merged opportunistically,
1908  * or (2) Make the NFS server backtrack and scan to recover a connected
1909  * namecache topology in order to then be able to issue new API lookups.
1910  *
1911  * It turns out that (1) is a huge mess.  It takes a nice clean set of
1912  * namecache algorithms and introduces a lot of complication in every subsystem
1913  * that calls into the namecache to deal with the re-merge case, especially
1914  * since we are using the namecache to placehold negative lookups and the
1915  * vnode might not be immediately assigned. (2) is certainly far less
1916  * efficient than (1), but since we are only talking about directories here
1917  * (which are likely to remain cached), the case does not actually run all
1918  * that often and has the supreme advantage of not polluting the namecache
1919  * algorithms.
1920  *
1921  * If a fakename is supplied just construct a namecache entry using the
1922  * fake name.
      *
      * Returns 0 on success.  Errors from VOP_GETATTR() and cache_vref() are
      * returned as-is.  ENOENT is returned when no matching directory entry
      * was found, which includes the case where VOP_READDIR() itself fails.
      * If the looked-up entry ends up without a vnode its nc_error is returned.
1923  */
1924 static int
1925 cache_inefficient_scan(struct nchandle *nch, struct ucred *cred,
1926                        struct vnode *dvp, char *fakename)
1927 {
1928         struct nlcomponent nlc;
1929         struct nchandle rncp;
1930         struct dirent *den;
1931         struct vnode *pvp;
1932         struct vattr vat;
1933         struct iovec iov;
1934         struct uio uio;
1935         int blksize;
1936         int eofflag;
1937         int bytes;
1938         char *rbuf;
1939         int error;
1940
             /*
              * va_blocksize is zeroed first so that the DEV_BSIZE fallback
              * further down applies if getattr does not fill it in
              * (presumably the reason for the pre-zero).  vat.va_fileid is
              * the inode number the directory entries are matched against.
              */
1941         vat.va_blocksize = 0;
1942         if ((error = VOP_GETATTR(dvp, &vat)) != 0)
1943                 return (error);
             /*
              * Get the vnode (pvp) of the directory represented by nch.  It
              * is released again with vrele() at 'done'.
              */
1944         cache_lock(nch);
1945         error = cache_vref(nch, cred, &pvp);
1946         cache_unlock(nch);
1947         if (error)
1948                 return (error);
1949         if (ncvp_debug) {
1950                 kprintf("inefficient_scan: directory iosize %ld "
1951                         "vattr fileid = %lld\n",
1952                         vat.va_blocksize,
1953                         (long long)vat.va_fileid);
1954         }
1955
1956         /*
1957          * Use the supplied fakename if not NULL.  Fake names are typically
1958          * not in the actual filesystem hierarchy.  This is used by HAMMER
1959          * to glue @@timestamp recursions together.
              *
              * No read buffer is allocated on this path; the jump to 'done'
              * lands after the kfree() of rbuf.
1960          */
1961         if (fakename) {
1962                 nlc.nlc_nameptr = fakename;
1963                 nlc.nlc_namelen = strlen(fakename);
1964                 rncp = cache_nlookup(nch, &nlc);
1965                 goto done;
1966         }
1967
1968         if ((blksize = vat.va_blocksize) == 0)
1969                 blksize = DEV_BSIZE;
1970         rbuf = kmalloc(blksize, M_TEMP, M_WAITOK);
1971         rncp.ncp = NULL;
1972
1973         eofflag = 0;
1974         uio.uio_offset = 0;
1975 again:
             /*
              * The iovec and uio are set up again for every readdir call,
              * only uio_offset carries over from the previous call.
              */
1976         iov.iov_base = rbuf;
1977         iov.iov_len = blksize;
1978         uio.uio_iov = &iov;
1979         uio.uio_iovcnt = 1;
1980         uio.uio_resid = blksize;
1981         uio.uio_segflg = UIO_SYSSPACE;
1982         uio.uio_rw = UIO_READ;
1983         uio.uio_td = curthread;
1984
1985         if (ncvp_debug >= 2)
1986                 kprintf("cache_inefficient_scan: readdir @ %08x\n", (int)uio.uio_offset);
1987         error = VOP_READDIR(pvp, &uio, cred, &eofflag, NULL, NULL);
1988         if (error == 0) {
1989                 den = (struct dirent *)rbuf;
                     /* number of bytes of directory entries placed in rbuf */
1990                 bytes = blksize - uio.uio_resid;
1991
1992                 while (bytes > 0) {
1993                         if (ncvp_debug >= 2) {
1994                                 kprintf("cache_inefficient_scan: %*.*s\n",
1995                                         den->d_namlen, den->d_namlen,
1996                                         den->d_name);
1997                         }
                             /*
                              * A non-whiteout entry whose inode number equals
                              * dvp's fileid is taken as the name of dvp.
                              */
1998                         if (den->d_type != DT_WHT &&
1999                             den->d_ino == vat.va_fileid) {
2000                                 if (ncvp_debug) {
2001                                         kprintf("cache_inefficient_scan: "
2002                                                "MATCHED inode %lld path %s/%*.*s\n",
2003                                                (long long)vat.va_fileid,
2004                                                nch->ncp->nc_name,
2005                                                den->d_namlen, den->d_namlen,
2006                                                den->d_name);
2007                                 }
2008                                 nlc.nlc_nameptr = den->d_name;
2009                                 nlc.nlc_namelen = den->d_namlen;
2010                                 rncp = cache_nlookup(nch, &nlc);
2011                                 KKASSERT(rncp.ncp != NULL);
2012                                 break;
2013                         }
2014                         bytes -= _DIRENT_DIRSIZ(den);
2015                         den = _DIRENT_NEXT(den);
2016                 }
                     /*
                      * Read the next block only if nothing matched yet, EOF was
                      * not flagged and this call returned at least some data.
                      */
2017                 if (rncp.ncp == NULL && eofflag == 0 && uio.uio_resid != blksize)
2018                         goto again;
2019         }
2020         kfree(rbuf, M_TEMP);
2021 done:
2022         vrele(pvp);
2023         if (rncp.ncp) {
                     /* dvp is attached only if the entry is still unresolved */
2024                 if (rncp.ncp->nc_flag & NCF_UNRESOLVED) {
2025                         _cache_setvp(rncp.mount, rncp.ncp, dvp);
2026                         if (ncvp_debug >= 2) {
2027                                 kprintf("cache_inefficient_scan: setvp %s/%s = %p\n",
2028                                         nch->ncp->nc_name, rncp.ncp->nc_name, dvp);
2029                         }
2030                 } else {
2031                         if (ncvp_debug >= 2) {
2032                                 kprintf("cache_inefficient_scan: setvp %s/%s already set %p/%p\n",
2033                                         nch->ncp->nc_name, rncp.ncp->nc_name, dvp,
2034                                         rncp.ncp->nc_vp);
2035                         }
2036                 }
2037                 if (rncp.ncp->nc_vp == NULL)
2038                         error = rncp.ncp->nc_error;
2039                 /*
2040                  * Release rncp after a successful nlookup.  rncp was fully
2041                  * referenced.
2042                  */
2043                 cache_put(&rncp);
2044         } else {
                     /* any VOP_READDIR() error is replaced by ENOENT here */
2045                 kprintf("cache_inefficient_scan: dvp %p NOT FOUND in %s\n",
2046                         dvp, nch->ncp->nc_name);
2047                 error = ENOENT;
2048         }
2049         return (error);
2050 }
2051
2052 /*
2053  * Zap a namecache entry.  The ncp is unconditionally set to an unresolved
2054  * state, which disassociates it from its vnode or ncneglist.
2055  *
2056  * Then, if there are no additional references to the ncp and no children,
2057  * the ncp is removed from the topology and destroyed.
2058  *
2059  * References and/or children may exist if the ncp is in the middle of the
2060  * topology, preventing the ncp from being destroyed.
2061  *
2062  * This function must be called with the ncp held and locked and will unlock
2063  * and drop it during zapping.
2064  *
2065  * If nonblock is non-zero and the parent ncp cannot be locked we give up.
2066  * This case can occur in the cache_drop() path.
2067  *
2068  * This function may return a held (but NOT locked) parent node which the
2069  * caller must drop.  We do this so _cache_drop() can loop, to avoid
2070  * blowing out the kernel stack.
2071  *
      * Returns NULL when the ncp was not destroyed (only the caller's ref was
      * dropped) or when the destroyed ncp had no parent, otherwise the held
      * parent.
      *
2072  * WARNING!  For MPSAFE operation this routine must acquire up to three
2073  *           spin locks to be able to safely test nc_refs.  Lock order is
2074  *           very important.
2075  *
2076  *           hash spinlock if on hash list
2077  *           parent spinlock if child of parent
2078  *           (the ncp is unresolved so there is no vnode association)
2079  */
2080 static struct namecache *
2081 cache_zap(struct namecache *ncp, int nonblock)
2082 {
2083         struct namecache *par;
2084         struct vnode *dropvp;
2085         int refs;
2086
2087         /*
2088          * Disassociate the vnode or negative cache ref and set NCF_UNRESOLVED.
2089          */
2090         _cache_setunresolved(ncp);
2091
2092         /*
2093          * Try to scrap the entry and possibly tail-recurse on its parent.
2094          * We only scrap unref'd (other than our ref) unresolved entries,
2095          * we do not scrap 'live' entries.
2096          *
2097          * Note that once the spinlocks are acquired if nc_refs == 1 no
2098          * other references are possible.  If it isn't, however, we have
2099          * to decrement but also be sure to avoid a 1->0 transition.
2100          */
2101         KKASSERT(ncp->nc_flag & NCF_UNRESOLVED);
2102         KKASSERT(ncp->nc_refs > 0);
2103
2104         /*
2105          * Acquire locks.  Note that the parent can't go away while we hold
2106          * a child locked.
2107          */
2108         if ((par = ncp->nc_parent) != NULL) {
2109                 if (nonblock) {
2110                         for (;;) {
2111                                 if (_cache_lock_nonblock(par) == 0)
2112                                         break;
                                     /*
                                      * Parent lock not available: flag the
                                      * ncp as a deferred zap and drop only
                                      * our ref.  If the cmpset loses a race
                                      * on nc_refs, try the lock again.
                                      */
2113                                 refs = ncp->nc_refs;
2114                                 ncp->nc_flag |= NCF_DEFEREDZAP;
2115                                 ++numdefered;   /* MP race ok */
2116                                 if (atomic_cmpset_int(&ncp->nc_refs,
2117                                                       refs, refs - 1)) {
2118                                         _cache_unlock(ncp);
2119                                         return(NULL);
2120                                 }
2121                                 cpu_pause();
2122                         }
2123                         _cache_hold(par);
2124                 } else {
2125                         _cache_hold(par);
2126                         _cache_lock(par);
2127                 }
                     /* spinlock of the hash chain the ncp is linked on */
2128                 spin_lock(&ncp->nc_head->spin);
2129         }
2130
2131         /*
2132          * If someone other than us has a ref or we have children
2133          * we cannot zap the entry.  The 1->0 transition and any
2134          * further list operation is protected by the spinlocks
2135          * we have acquired but other transitions are not.
              *
              * In that case only our ref is dropped (via cmpset, retried if
              * nc_refs changed underneath us), everything acquired above is
              * released and NULL is returned.
2136          */
2137         for (;;) {
2138                 refs = ncp->nc_refs;
2139                 if (refs == 1 && TAILQ_EMPTY(&ncp->nc_list))
2140                         break;
2141                 if (atomic_cmpset_int(&ncp->nc_refs, refs, refs - 1)) {
2142                         if (par) {
2143                                 spin_unlock(&ncp->nc_head->spin);
                                     /* presumably undoes both the lock and the hold on par */
2144                                 _cache_put(par);
2145                         }
2146                         _cache_unlock(ncp);
2147                         return(NULL);
2148                 }
2149                 cpu_pause();
2150         }
2151
2152         /*
2153          * We are the only ref and with the spinlocks held no further
2154          * refs can be acquired by others.
2155          *
2156          * Remove us from the hash list and parent list.  We have to
2157          * drop a ref on the parent's vp if the parent's list becomes
2158          * empty.
              *
              * par is unlocked here but stays held; that hold is what the
              * caller receives together with the returned pointer.
2159          */
2160         dropvp = NULL;
2161         if (par) {
2162                 struct nchash_head *nchpp = ncp->nc_head;
2163
2164                 KKASSERT(nchpp != NULL);
2165                 LIST_REMOVE(ncp, nc_hash);
2166                 TAILQ_REMOVE(&par->nc_list, ncp, nc_entry);
2167                 if (par->nc_vp && TAILQ_EMPTY(&par->nc_list))
2168                         dropvp = par->nc_vp;
2169                 ncp->nc_head = NULL;
2170                 ncp->nc_parent = NULL;
2171                 spin_unlock(&nchpp->spin);
2172                 _cache_unlock(par);
2173         } else {
2174                 KKASSERT(ncp->nc_head == NULL);
2175         }
2176
2177         /*
2178          * ncp should not have picked up any refs.  Physically
2179          * destroy the ncp.
2180          */
2181         KKASSERT(ncp->nc_refs == 1);
2182         /* _cache_unlock(ncp) not required */
2183         ncp->nc_refs = -1;      /* safety */
2184         if (ncp->nc_name)
2185                 kfree(ncp->nc_name, M_VFSCACHE);
2186         kfree(ncp, M_VFSCACHE);
2187
2188         /*
2189          * Delayed drop (we had to release our spinlocks)
2190          *
2191          * The refed parent (if not NULL) must be dropped.  The
2192          * caller is responsible for looping.
2193          */
2194         if (dropvp)
2195                 vdrop(dropvp);
2196         return(par);
2197 }
2198
2199 /*
2200  * Clean up dangling negative cache and defered-drop entries in the
2201  * namecache.
2202  */
2203 typedef enum { CHI_LOW, CHI_HIGH } cache_hs_t;
2204
2205 static cache_hs_t neg_cache_hysteresis_state = CHI_LOW;
2206 static cache_hs_t pos_cache_hysteresis_state = CHI_LOW;
2207
2208 void
2209 cache_hysteresis(void)
2210 {
2211         int poslimit;
2212
2213         /*
2214          * Don't cache too many negative hits.  We use hysteresis to reduce
2215          * the impact on the critical path.
2216          */
2217         switch(neg_cache_hysteresis_state) {
2218         case CHI_LOW:
2219                 if (numneg > MINNEG && numneg * ncnegfactor > numcache) {
2220                         _cache_cleanneg(10);
2221                         neg_cache_hysteresis_state = CHI_HIGH;
2222                 }
2223                 break;
2224         case CHI_HIGH:
2225                 if (numneg > MINNEG * 9 / 10 &&
2226                     numneg * ncnegfactor * 9 / 10 > numcache
2227                 ) {
2228                         _cache_cleanneg(10);
2229                 } else {
2230                         neg_cache_hysteresis_state = CHI_LOW;
2231                 }
2232                 break;
2233         }
2234
2235         /*
2236          * Don't cache too many positive hits.  We use hysteresis to reduce
2237          * the impact on the critical path.
2238          *
2239          * Excessive positive hits can accumulate due to large numbers of
2240          * hardlinks (the vnode cache will not prevent hl ncps from growing
2241          * into infinity).
2242          */
2243         if ((poslimit = ncposlimit) == 0)
2244                 poslimit = desiredvnodes * 2;
2245
2246         switch(pos_cache_hysteresis_state) {
2247         case CHI_LOW:
2248                 if (numcache > poslimit && numcache > MINPOS) {
2249                         _cache_cleanpos(10);
2250                         pos_cache_hysteresis_state = CHI_HIGH;
2251                 }
2252                 break;
2253         case CHI_HIGH:
2254                 if (numcache > poslimit * 5 / 6 && numcache > MINPOS) {
2255                         _cache_cleanpos(10);
2256                 } else {
2257                         pos_cache_hysteresis_state = CHI_LOW;
2258                 }
2259                 break;
2260         }
2261
2262         /*
2263          * Clean out dangling defered-zap ncps which could not
2264          * be cleanly dropped if too many build up.  Note
2265          * that numdefered is not an exact number as such ncps
2266          * can be reused and the counter is not handled in a MP
2267          * safe manner by design.
2268          */
2269         if (numdefered * ncnegfactor > numcache) {
2270                 _cache_cleandefered();
2271         }
2272 }
2273
2274 /*
2275  * NEW NAMECACHE LOOKUP API
2276  *
2277  * Lookup an entry in the namecache.  The passed par_nch must be referenced
2278  * and unlocked.  A referenced and locked nchandle with a non-NULL nch.ncp
2279  * is ALWAYS returned, even if the supplied component is illegal.
2280  *
2281  * The resulting namecache entry should be returned to the system with
2282  * cache_put() or cache_unlock() + cache_drop().
2283  *
2284  * namecache locks are recursive but care must be taken to avoid lock order
2285  * reversals (hence why the passed par_nch must be unlocked).  Locking
2286  * rules are to order for parent traversals, not for child traversals.
2287  *
2288  * Nobody else will be able to manipulate the associated namespace (e.g.
2289  * create, delete, rename, rename-target) until the caller unlocks the
2290  * entry.
2291  *
2292  * The returned entry will be in one of three states:  positive hit (non-null
2293  * vnode), negative hit (null vnode), or unresolved (NCF_UNRESOLVED is set).
2294  * Unresolved entries must be resolved through the filesystem to associate the
2295  * vnode and/or determine whether a positive or negative hit has occurred.
2296  *
2297  * It is not necessary to lock a directory in order to lock namespace under
2298  * that directory.  In fact, it is explicitly not allowed to do that.  A
2299  * directory is typically only locked when being created, renamed, or
2300  * destroyed.
2301  *
2302  * The directory (par) may be unresolved, in which case any returned child
2303  * will likely also be marked unresolved.  Likely but not guaranteed.  Since
2304  * the filesystem lookup requires a resolved directory vnode the caller is
2305  * responsible for resolving the namecache chain top-down.  This API
2306  * specifically allows whole chains to be created in an unresolved state.
      *
      * The returned nchandle carries par_nch's mount, on which an additional
      * mnt_refs reference is taken.
2307  */
2308 struct nchandle
2309 cache_nlookup(struct nchandle *par_nch, struct nlcomponent *nlc)
2310 {
2311         struct nchandle nch;
2312         struct namecache *ncp;
2313         struct namecache *new_ncp;
2314         struct nchash_head *nchpp;
2315         struct mount *mp;
2316         u_int32_t hash;
2317         globaldata_t gd;
2318         int par_locked;
2319
2320         numcalls++;
2321         gd = mycpu;
2322         mp = par_nch->mount;
2323         par_locked = 0;
2324
2325         /*
2326          * This is a good time to call it, no ncp's are locked by
2327          * the caller or us.
2328          */
2329         cache_hysteresis();
2330
2331         /*
2332          * Try to locate an existing entry
              *
              * The hash covers the component name and then the parent ncp
              * pointer, and selects the hash chain to search.
2333          */
2334         hash = fnv_32_buf(nlc->nlc_nameptr, nlc->nlc_namelen, FNV1_32_INIT);
2335         hash = fnv_32_buf(&par_nch->ncp, sizeof(par_nch->ncp), hash);
2336         new_ncp = NULL;
2337         nchpp = NCHHASH(hash);
2338 restart:
             /*
              * The chain is searched again after every step that dropped the
              * chain spinlock: after allocating new_ncp, after locking the
              * parent, and after failing to lock a matching entry.
              */
2339         spin_lock(&nchpp->spin);
2340         LIST_FOREACH(ncp, &nchpp->list, nc_hash) {
2341                 numchecks++;
2342
2343                 /*
2344                  * Break out if we find a matching entry.  Note that
2345                  * UNRESOLVED entries may match, but DESTROYED entries
2346                  * do not.
2347                  */
2348                 if (ncp->nc_parent == par_nch->ncp &&
2349                     ncp->nc_nlen == nlc->nlc_namelen &&
2350                     bcmp(ncp->nc_name, nlc->nlc_nameptr, ncp->nc_nlen) == 0 &&
2351                     (ncp->nc_flag & NCF_DESTROYED) == 0
2352                 ) {
                             /*
                              * The entry is held before the chain spinlock is
                              * released.  The parent lock is only needed for
                              * linking a new entry, so it is released here.
                              */
2353                         _cache_hold(ncp);
2354                         spin_unlock(&nchpp->spin);
2355                         if (par_locked) {
2356                                 _cache_unlock(par_nch->ncp);
2357                                 par_locked = 0;
2358                         }
                             /*
                              * Lock obtained: use the existing entry and free
                              * the preallocated one, if any.
                              */
2359                         if (_cache_lock_special(ncp) == 0) {
2360                                 _cache_auto_unresolve(mp, ncp);
2361                                 if (new_ncp)
2362                                         _cache_free(new_ncp);
2363                                 goto found;
2364                         }
                             /*
                              * NOTE(review): the get/put pair presumably waits
                              * for the ncp lock to become available before
                              * the search is retried - confirm against
                              * _cache_lock_special().
                              */
2365                         _cache_get(ncp);
2366                         _cache_put(ncp);
2367                         _cache_drop(ncp);
2368                         goto restart;
2369                 }
2370         }
2371
2372         /*
2373          * We failed to locate an entry, create a new entry and add it to
2374          * the cache.  The parent ncp must also be locked so we
2375          * can link into it.
2376          *
2377          * We have to relookup after possibly blocking in kmalloc or
2378          * when locking par_nch.
2379          *
2380          * NOTE: nlc_namelen can be 0 and nlc_nameptr NULL as a special
2381          *       mount case, in which case nc_name will be NULL.
2382          */
2383         if (new_ncp == NULL) {
2384                 spin_unlock(&nchpp->spin);
2385                 new_ncp = cache_alloc(nlc->nlc_namelen);
2386                 if (nlc->nlc_namelen) {
2387                         bcopy(nlc->nlc_nameptr, new_ncp->nc_name,
2388                               nlc->nlc_namelen);
2389                         new_ncp->nc_name[nlc->nlc_namelen] = 0;
2390                 }
2391                 goto restart;
2392         }
2393         if (par_locked == 0) {
2394                 spin_unlock(&nchpp->spin);
2395                 _cache_lock(par_nch->ncp);
2396                 par_locked = 1;
2397                 goto restart;
2398         }
2399
2400         /*
2401          * WARNING!  We still hold the spinlock.  We have to set the hash
2402          *           table entry atomically.
2403          */
2404         ncp = new_ncp;
2405         _cache_link_parent(ncp, par_nch->ncp, nchpp);
2406         spin_unlock(&nchpp->spin);
2407         _cache_unlock(par_nch->ncp);
2408         /* par_locked = 0 - not used */
2409 found:
2410         /*
2411          * stats and namecache size management
              *
              * The counters of the current cpu's gd_nchstats are updated.
2412          */
2413         if (ncp->nc_flag & NCF_UNRESOLVED)
2414                 ++gd->gd_nchstats->ncs_miss;
2415         else if (ncp->nc_vp)
2416                 ++gd->gd_nchstats->ncs_goodhits;
2417         else
2418                 ++gd->gd_nchstats->ncs_neghits;
2419         nch.mount = mp;
2420         nch.ncp = ncp;
             /* the returned nchandle takes its own reference on the mount */
2421         atomic_add_int(&nch.mount->mnt_refs, 1);
2422         return(nch);
2423 }
2424
/*
 * This is a non-blocking version of cache_nlookup() used by
 * nfs_readdirplusrpc_uio().  It can fail for any reason and
 * will return nch.ncp == NULL in that case.
 *
 * The lookup hashes (name, parent ncp) and scans the hash chain under
 * the chain spinlock.  Where cache_nlookup() retries, this variant gives
 * up: a non-zero return from _cache_lock_special() on a matching entry,
 * or from _cache_lock_nonblock() on the parent, jumps to failed:.
 *
 * On success the returned handle holds the located or newly linked ncp
 * and a mnt_refs reference on par_nch->mount.  An existing entry is
 * returned with the hold taken by _cache_hold() and the lock obtained by
 * _cache_lock_special() still in place.
 *
 * On failure both nch.ncp and nch.mount are NULL, no mount reference is
 * taken, and any ncp preallocated for insertion has been freed.
 */
struct nchandle
cache_nlookup_nonblock(struct nchandle *par_nch, struct nlcomponent *nlc)
{
        struct nchandle nch;
        struct namecache *ncp;
        struct namecache *new_ncp;
        struct nchash_head *nchpp;
        struct mount *mp;
        u_int32_t hash;
        globaldata_t gd;
        int par_locked;

        numcalls++;
        gd = mycpu;
        mp = par_nch->mount;
        par_locked = 0;

        /*
         * Try to locate an existing entry.  The hash covers the name
         * bytes and then the parent ncp pointer value.
         */
        hash = fnv_32_buf(nlc->nlc_nameptr, nlc->nlc_namelen, FNV1_32_INIT);
        hash = fnv_32_buf(&par_nch->ncp, sizeof(par_nch->ncp), hash);
        new_ncp = NULL;
        nchpp = NCHHASH(hash);
restart:
        spin_lock(&nchpp->spin);
        LIST_FOREACH(ncp, &nchpp->list, nc_hash) {
                numchecks++;

                /*
                 * Break out if we find a matching entry.  Note that
                 * UNRESOLVED entries may match, but DESTROYED entries
                 * do not.
                 */
                if (ncp->nc_parent == par_nch->ncp &&
                    ncp->nc_nlen == nlc->nlc_namelen &&
                    bcmp(ncp->nc_name, nlc->nlc_nameptr, ncp->nc_nlen) == 0 &&
                    (ncp->nc_flag & NCF_DESTROYED) == 0
                ) {
                        /*
                         * Hold the entry before dropping the spinlock so
                         * the pointer stays usable, and release the parent
                         * lock if an earlier pass acquired it; it is only
                         * needed when linking a new entry.
                         */
                        _cache_hold(ncp);
                        spin_unlock(&nchpp->spin);
                        if (par_locked) {
                                _cache_unlock(par_nch->ncp);
                                par_locked = 0;
                        }
                        if (_cache_lock_special(ncp) == 0) {
                                _cache_auto_unresolve(mp, ncp);
                                if (new_ncp) {
                                        _cache_free(new_ncp);
                                        new_ncp = NULL;
                                }
                                goto found;
                        }
                        /*
                         * Could not get the lock.  Unlike cache_nlookup()
                         * we do not retry; drop our hold and fail.
                         */
                        _cache_drop(ncp);
                        goto failed;
                }
        }

        /*
         * We failed to locate an entry, create a new entry and add it to
         * the cache.  The parent ncp must also be locked so we
         * can link into it.
         *
         * We have to relookup after possibly blocking in kmalloc or
         * when locking par_nch.
         *
         * NOTE: nlc_namelen can be 0 and nlc_nameptr NULL as a special
         *       mount case, in which case nc_name will be NULL.
         */
        if (new_ncp == NULL) {
                spin_unlock(&nchpp->spin);
                new_ncp = cache_alloc(nlc->nlc_namelen);
                if (nlc->nlc_namelen) {
                        bcopy(nlc->nlc_nameptr, new_ncp->nc_name,
                              nlc->nlc_namelen);
                        new_ncp->nc_name[nlc->nlc_namelen] = 0;
                }
                goto restart;
        }
        if (par_locked == 0) {
                spin_unlock(&nchpp->spin);
                /*
                 * Only a non-blocking attempt is made on the parent; a
                 * non-zero return fails the whole lookup.
                 */
                if (_cache_lock_nonblock(par_nch->ncp) == 0) {
                        par_locked = 1;
                        goto restart;
                }
                goto failed;
        }

        /*
         * WARNING!  We still hold the spinlock.  We have to set the hash
         *           table entry atomically.
         */
        ncp = new_ncp;
        _cache_link_parent(ncp, par_nch->ncp, nchpp);
        spin_unlock(&nchpp->spin);
        _cache_unlock(par_nch->ncp);
        /* par_locked = 0 - not used */
found:
        /*
         * stats and namecache size management
         */
        if (ncp->nc_flag & NCF_UNRESOLVED)
                ++gd->gd_nchstats->ncs_miss;
        else if (ncp->nc_vp)
                ++gd->gd_nchstats->ncs_goodhits;
        else
                ++gd->gd_nchstats->ncs_neghits;
        nch.mount = mp;
        nch.ncp = ncp;
        atomic_add_int(&nch.mount->mnt_refs, 1);
        return(nch);
failed:
        /*
         * Every jump here has already released the hash spinlock and the
         * parent lock; only the preallocated entry may remain to free.
         */
        if (new_ncp) {
                _cache_free(new_ncp);
                new_ncp = NULL;
        }
        nch.mount = NULL;
        nch.ncp = NULL;
        return(nch);
}
2550
2551 /*
2552  * The namecache entry is marked as being used as a mount point.
2553  * Locate the mount if it is visible to the caller.  The DragonFly
2554  * mount system allows arbitrary loops in the topology and disentangles
2555  * those loops by matching against (mp, ncp) rather than just (ncp).
2556  * This means any given ncp can dive any number of mounts, depending
2557  * on the relative mount (e.g. nullfs) the caller is at in the topology.
2558  *
2559  * We use a very simple frontend cache to reduce SMP conflicts,
2560  * which we have to do because the mountlist scan needs an exclusive
2561  * lock around its ripout info list.  Not to mention that there might
2562  * be a lot of mounts.
2563  */
/*
 * Argument/result block handed to cache_findmount_callback() through
 * mountlist_scan().
 */
struct findmount_info {
        struct mount *result;           /* out: matching mount, or NULL */
        struct mount *nch_mount;        /* in: mount half of the nchandle */
        struct namecache *nch_ncp;      /* in: ncp half of the nchandle */
};
2569
2570 static
2571 struct ncmount_cache *
2572 ncmount_cache_lookup(struct mount *mp, struct namecache *ncp)
2573 {
2574         int hash;
2575
2576         hash = ((int)(intptr_t)mp / sizeof(*mp)) ^
2577                ((int)(intptr_t)ncp / sizeof(*ncp));
2578         hash = (hash & 0x7FFFFFFF) % NCMOUNT_NUMCACHE;
2579         return (&ncmount_cache[hash]);
2580 }
2581
2582 static
2583 int
2584 cache_findmount_callback(struct mount *mp, void *data)
2585 {
2586         struct findmount_info *info = data;
2587
2588         /*
2589          * Check the mount's mounted-on point against the passed nch.
2590          */
2591         if (mp->mnt_ncmounton.mount == info->nch_mount &&
2592             mp->mnt_ncmounton.ncp == info->nch_ncp
2593         ) {
2594             info->result = mp;
2595             atomic_add_int(&mp->mnt_refs, 1);
2596             return(-1);
2597         }
2598         return(0);
2599 }
2600
2601 struct mount *
2602 cache_findmount(struct nchandle *nch)
2603 {
2604         struct findmount_info info;
2605         struct ncmount_cache *ncc;
2606         struct mount *mp;
2607
2608         /*
2609          * Fast
2610          */
2611         if (ncmount_cache_enable == 0) {
2612                 ncc = NULL;
2613                 goto skip;
2614         }
2615         ncc = ncmount_cache_lookup(nch->mount, nch->ncp);
2616         if (ncc->ncp == nch->ncp) {
2617                 spin_lock_shared(&ncc->spin);
2618                 if (ncc->isneg == 0 &&
2619                     ncc->ncp == nch->ncp && (mp = ncc->mp) != NULL) {
2620                         if (mp->mnt_ncmounton.mount == nch->mount &&
2621                             mp->mnt_ncmounton.ncp == nch->ncp) {
2622                                 /*
2623                                  * Cache hit (positive)
2624                                  */
2625                                 atomic_add_int(&mp->mnt_refs, 1);
2626                                 spin_unlock_shared(&ncc->spin);
2627                                 ++ncmount_cache_hit;
2628                                 return(mp);
2629                         }
2630                         /* else cache miss */
2631                 }
2632                 if (ncc->isneg &&
2633                     ncc->ncp == nch->ncp && ncc->mp == nch->mount) {
2634                         /*
2635                          * Cache hit (negative)
2636                          */
2637                         spin_unlock_shared(&ncc->spin);
2638                         ++ncmount_cache_hit;
2639                         return(NULL);
2640                 }
2641                 spin_unlock_shared(&ncc->spin);
2642         }
2643 skip:
2644
2645         /*
2646          * Slow
2647          */
2648         info.result = NULL;
2649         info.nch_mount = nch->mount;
2650         info.nch_ncp = nch->ncp;
2651         mountlist_scan(cache_findmount_callback, &info,
2652                                MNTSCAN_FORWARD|MNTSCAN_NOBUSY);
2653
2654         /*
2655          * Cache the result.
2656          *
2657          * Negative lookups: We cache the originating {ncp,mp}. (mp) is
2658          *                   only used for pointer comparisons and is not
2659          *                   referenced (otherwise there would be dangling
2660          *                   refs).
2661          *
2662          * Positive lookups: We cache the originating {ncp} and the target
2663          *                   (mp).  (mp) is referenced.
2664          *
2665          * Indeterminant:    If the match is undergoing an unmount we do
2666          *                   not cache it to avoid racing cache_unmounting(),
2667          *                   but still return the match.
2668          */
2669         if (ncc) {
2670                 spin_lock(&ncc->spin);
2671                 if (info.result == NULL) {
2672                         if (ncc->isneg == 0 && ncc->mp)
2673                                 atomic_add_int(&ncc->mp->mnt_refs, -1);
2674                         ncc->ncp = nch->ncp;
2675                         ncc->mp = nch->mount;
2676                         ncc->isneg = 1;
2677                         spin_unlock(&ncc->spin);
2678                         ++ncmount_cache_overwrite;
2679                 } else if ((info.result->mnt_kern_flag & MNTK_UNMOUNT) == 0) {
2680                         if (ncc->isneg == 0 && ncc->mp)
2681                                 atomic_add_int(&ncc->mp->mnt_refs, -1);
2682                         atomic_add_int(&info.result->mnt_refs, 1);
2683                         ncc->ncp = nch->ncp;
2684                         ncc->mp = info.result;
2685                         ncc->isneg = 0;
2686                         spin_unlock(&ncc->spin);
2687                         ++ncmount_cache_overwrite;
2688                 } else {
2689                         spin_unlock(&ncc->spin);
2690                 }
2691                 ++ncmount_cache_miss;
2692         }
2693         return(info.result);
2694 }
2695
2696 void
2697 cache_dropmount(struct mount *mp)
2698 {
2699         atomic_add_int(&mp->mnt_refs, -1);
2700 }
2701
2702 void
2703 cache_ismounting(struct mount *mp)
2704 {
2705         struct nchandle *nch = &mp->mnt_ncmounton;
2706         struct ncmount_cache *ncc;
2707
2708         ncc = ncmount_cache_lookup(nch->mount, nch->ncp);
2709         if (ncc->isneg &&
2710             ncc->ncp == nch->ncp && ncc->mp == nch->mount) {
2711                 spin_lock(&ncc->spin);
2712                 if (ncc->isneg &&
2713                     ncc->ncp == nch->ncp && ncc->mp == nch->mount) {
2714                         ncc->ncp = NULL;
2715                         ncc->mp = NULL;
2716                 }
2717                 spin_unlock(&ncc->spin);
2718         }
2719 }
2720
2721 void
2722 cache_unmounting(struct mount *mp)
2723 {
2724         struct nchandle *nch = &mp->mnt_ncmounton;
2725         struct ncmount_cache *ncc;
2726
2727         ncc = ncmount_cache_lookup(nch->mount, nch->ncp);
2728         if (ncc->isneg == 0 &&
2729             ncc->ncp == nch->ncp && ncc->mp == mp) {
2730                 spin_lock(&ncc->spin);
2731                 if (ncc->isneg == 0 &&
2732                     ncc->ncp == nch->ncp && ncc->mp == mp) {
2733                         atomic_add_int(&mp->mnt_refs, -1);
2734                         ncc->ncp = NULL;
2735                         ncc->mp = NULL;
2736                 }
2737                 spin_unlock(&ncc->spin);
2738         }
2739 }
2740
/*
 * Resolve an unresolved namecache entry, generally by looking it up.
 * The passed ncp must be locked and refd.
 *
 * Theoretically since a vnode cannot be recycled while held, and since
 * the nc_parent chain holds its vnode as long as children exist, the
 * direct parent of the cache entry we are trying to resolve should
 * have a valid vnode.  If not then generate an error that we can
 * determine is related to a resolver bug.
 *
 * However, if a vnode was in the middle of a recyclement when the NCP
 * got locked, ncp->nc_vp might point to a vnode that is about to become
 * invalid.  cache_resolve() handles this case by unresolving the entry
 * and then re-resolving it.
 *
 * Note that successful resolution does not necessarily return an error
 * code of 0.  If the ncp resolves to a negative cache hit then ENOENT
 * will be returned.
 *
 * Return values visible in this function:
 *
 *      ncp->nc_error   - the normal case, including an already-resolved
 *                        entry and the result of VOP_NRESOLVE().
 *      EINVAL          - the ncp is marked NCF_DESTROYED.
 *      ENOENT          - the ncp's parent is marked NCF_DESTROYED.
 *      EXDEV           - the parent chain is broken (nc_parent == NULL).
 *      (other)         - a non-EAGAIN error from resolving an ancestor.
 *
 * MPSAFE
 */
int
cache_resolve(struct nchandle *nch, struct ucred *cred)
{
        struct namecache *par_tmp;
        struct namecache *par;
        struct namecache *ncp;
        struct nchandle nctmp;
        struct mount *mp;
        struct vnode *dvp;
        int error;

        ncp = nch->ncp;
        mp = nch->mount;
restart:
        /*
         * If the ncp is already resolved we have nothing to do.  However,
         * we do want to guarantee that a usable vnode is returned when
         * a vnode is present, so make sure it hasn't been reclaimed.
         */
        if ((ncp->nc_flag & NCF_UNRESOLVED) == 0) {
                if (ncp->nc_vp && (ncp->nc_vp->v_flag & VRECLAIMED))
                        _cache_setunresolved(ncp);
                if ((ncp->nc_flag & NCF_UNRESOLVED) == 0)
                        return (ncp->nc_error);
        }

        /*
         * If the ncp was destroyed it will never resolve again.  This
         * can basically only happen when someone is chdir'd into an
         * empty directory which is then rmdir'd.  We want to catch this
         * here and not dive the VFS because the VFS might actually
         * have a way to re-resolve the disconnected ncp, which will
         * result in inconsistencies in the cdir/nch for proc->p_fd.
         */
        if (ncp->nc_flag & NCF_DESTROYED) {
                kprintf("Warning: cache_resolve: ncp '%s' was unlinked\n",
                        ncp->nc_name);
                return(EINVAL);
        }

        /*
         * Mount points need special handling because the parent does not
         * belong to the same filesystem as the ncp.
         */
        if (ncp == mp->mnt_ncmountpt.ncp)
                return (cache_resolve_mp(mp));

        /*
         * We expect an unbroken chain of ncps to at least the mount point,
         * and even all the way to root (but this code doesn't have to go
         * past the mount point).
         */
        if (ncp->nc_parent == NULL) {
                kprintf("EXDEV case 1 %p %*.*s\n", ncp,
                        ncp->nc_nlen, ncp->nc_nlen, ncp->nc_name);
                ncp->nc_error = EXDEV;
                return(ncp->nc_error);
        }

        /*
         * The vp's of the parent directories in the chain are held via vhold()
         * due to the existence of the child, and should not disappear.
         * However, there are cases where they can disappear:
         *
         *      - due to filesystem I/O errors.
         *      - due to NFS being stupid about tracking the namespace and
         *        destroys the namespace for entire directories quite often.
         *      - due to forced unmounts.
         *      - due to an rmdir (parent will be marked DESTROYED)
         *
         * When this occurs we have to track the chain backwards and resolve
         * it, looping until the resolver catches up to the current node.  We
         * could recurse here but we might run ourselves out of kernel stack
         * so we do it in a more painful manner.  This situation really should
         * not occur all that often, or if it does not have to go back too
         * many nodes to resolve the ncp.
         *
         * Each pass of the loop resolves the topmost ancestor that lacks a
         * vnode and then re-tests cache_dvpref(ncp).  The loop has no break;
         * it is left only by a return or once dvp is non-NULL.
         */
        while ((dvp = cache_dvpref(ncp)) == NULL) {
                /*
                 * This case can occur if a process is CD'd into a
                 * directory which is then rmdir'd.  If the parent is marked
                 * destroyed there is no point trying to resolve it.
                 */
                if (ncp->nc_parent->nc_flag & NCF_DESTROYED)
                        return(ENOENT);
                /*
                 * Walk up the chain hand-over-hand: the next ancestor is
                 * held and locked before the current one is released.
                 * Stop at the first ancestor whose own parent has a vnode
                 * (or at the top of the chain).
                 */
                par = ncp->nc_parent;
                _cache_hold(par);
                _cache_lock(par);
                while ((par_tmp = par->nc_parent) != NULL &&
                       par_tmp->nc_vp == NULL) {
                        _cache_hold(par_tmp);
                        _cache_lock(par_tmp);
                        _cache_put(par);
                        par = par_tmp;
                }
                if (par->nc_parent == NULL) {
                        kprintf("EXDEV case 2 %*.*s\n",
                                par->nc_nlen, par->nc_nlen, par->nc_name);
                        _cache_put(par);
                        return (EXDEV);
                }
                kprintf("[diagnostic] cache_resolve: had to recurse on %*.*s\n",
                        par->nc_nlen, par->nc_nlen, par->nc_name);
                /*
                 * The parent is not set in stone, ref and lock it to prevent
                 * it from disappearing.  Also note that due to renames it
                 * is possible for our ncp to move and for par to no longer
                 * be one of its parents.  We resolve it anyway, the loop
                 * will handle any moves.
                 */
                _cache_get(par);        /* additional hold/lock */
                _cache_put(par);        /* from earlier hold/lock */
                if (par == nch->mount->mnt_ncmountpt.ncp) {
                        cache_resolve_mp(nch->mount);
                } else if ((dvp = cache_dvpref(par)) == NULL) {
                        kprintf("[diagnostic] cache_resolve: raced on %*.*s\n", par->nc_nlen, par->nc_nlen, par->nc_name);
                        _cache_put(par);
                        continue;
                } else {
                        if (par->nc_flag & NCF_UNRESOLVED) {
                                nctmp.mount = mp;
                                nctmp.ncp = par;
                                par->nc_error = VOP_NRESOLVE(&nctmp, dvp, cred);
                        }
                        vrele(dvp);
                }
                /*
                 * A hard error on the ancestor aborts the resolve.  EAGAIN
                 * is only logged and the outer loop tries again.
                 */
                if ((error = par->nc_error) != 0) {
                        if (par->nc_error != EAGAIN) {
                                kprintf("EXDEV case 3 %*.*s error %d\n",
                                    par->nc_nlen, par->nc_nlen, par->nc_name,
                                    par->nc_error);
                                _cache_put(par);
                                return(error);
                        }
                        kprintf("[diagnostic] cache_resolve: EAGAIN par %p %*.*s\n",
                                par, par->nc_nlen, par->nc_nlen, par->nc_name);
                }
                _cache_put(par);
                /* loop */
        }

        /*
         * Call VOP_NRESOLVE() to get the vp, then scan for any disconnected
         * ncp's and reattach them.  If this occurs the original ncp is marked
         * EAGAIN to force a relookup.
         *
         * NOTE: in order to call VOP_NRESOLVE(), the parent of the passed
         * ncp must already be resolved.
         *
         * NOTE: dvp is always non-NULL here because the loop above only
         * terminates with a valid dvp, so the EPERM arm is purely
         * defensive.
         */
        if (dvp) {
                nctmp.mount = mp;
                nctmp.ncp = ncp;
                ncp->nc_error = VOP_NRESOLVE(&nctmp, dvp, cred);
                vrele(dvp);
        } else {
                ncp->nc_error = EPERM;
        }
        if (ncp->nc_error == EAGAIN) {
                kprintf("[diagnostic] cache_resolve: EAGAIN ncp %p %*.*s\n",
                        ncp, ncp->nc_nlen, ncp->nc_nlen, ncp->nc_name);
                goto restart;
        }
        return(ncp->nc_error);
}
2926
2927 /*
2928  * Resolve the ncp associated with a mount point.  Such ncp's almost always
2929  * remain resolved and this routine is rarely called.  NFS MPs tends to force
2930  * re-resolution more often due to its mac-truck-smash-the-namecache
2931  * method of tracking namespace changes.
2932  *
2933  * The semantics for this call is that the passed ncp must be locked on
2934  * entry and will be locked on return.  However, if we actually have to
2935  * resolve the mount point we temporarily unlock the entry in order to
2936  * avoid race-to-root deadlocks due to e.g. dead NFS mounts.  Because of
2937  * the unlock we have to recheck the flags after we relock.
2938  */
2939 static int
2940 cache_resolve_mp(struct mount *mp)
2941 {
2942         struct namecache *ncp = mp->mnt_ncmountpt.ncp;
2943         struct vnode *vp;
2944         int error;
2945
2946         KKASSERT(mp != NULL);
2947
2948         /*
2949          * If the ncp is already resolved we have nothing to do.  However,
2950          * we do want to guarentee that a usable vnode is returned when
2951          * a vnode is present, so make sure it hasn't been reclaimed.
2952          */
2953         if ((ncp->nc_flag & NCF_UNRESOLVED) == 0) {
2954                 if (ncp->nc_vp && (ncp->nc_vp->v_flag & VRECLAIMED))
2955                         _cache_setunresolved(ncp);
2956         }
2957
2958         if (ncp->nc_flag & NCF_UNRESOLVED) {
2959                 _cache_unlock(ncp);
2960                 while (vfs_busy(mp, 0))
2961                         ;
2962                 error = VFS_ROOT(mp, &vp);
2963                 _cache_lock(ncp);
2964
2965                 /*
2966                  * recheck the ncp state after relocking.
2967                  */
2968                 if (ncp->nc_flag & NCF_UNRESOLVED) {
2969                         ncp->nc_error = error;
2970                         if (error == 0) {
2971                                 _cache_setvp(mp, ncp, vp);
2972                                 vput(vp);
2973                         } else {
2974                                 kprintf("[diagnostic] cache_resolve_mp: failed"
2975                                         " to resolve mount %p err=%d ncp=%p\n",
2976                                         mp, error, ncp);
2977                                 _cache_setvp(mp, ncp, NULL);
2978                         }
2979                 } else if (error == 0) {
2980                         vput(vp);
2981                 }
2982                 vfs_unbusy(mp);
2983         }
2984         return(ncp->nc_error);
2985 }
2986
2987 /*
2988  * Clean out negative cache entries when too many have accumulated.
2989  *
2990  * MPSAFE
2991  */
2992 static void
2993 _cache_cleanneg(int count)
2994 {
2995         struct namecache *ncp;
2996
2997         /*
2998          * Attempt to clean out the specified number of negative cache
2999          * entries.
3000          */
3001         while (count) {
3002                 spin_lock(&ncspin);
3003                 ncp = TAILQ_FIRST(&ncneglist);
3004                 if (ncp == NULL) {
3005                         spin_unlock(&ncspin);
3006                         break;
3007                 }
3008                 TAILQ_REMOVE(&ncneglist, ncp, nc_vnode);
3009                 TAILQ_INSERT_TAIL(&ncneglist, ncp, nc_vnode);
3010                 _cache_hold(ncp);
3011                 spin_unlock(&ncspin);
3012
3013                 /*
3014                  * This can race, so we must re-check that the ncp
3015                  * is on the ncneglist after successfully locking it.
3016                  */
3017                 if (_cache_lock_special(ncp) == 0) {
3018                         if (ncp->nc_vp == NULL &&
3019                             (ncp->nc_flag & NCF_UNRESOLVED) == 0) {
3020                                 ncp = cache_zap(ncp, 1);
3021                                 if (ncp)
3022                                         _cache_drop(ncp);
3023                         } else {
3024                                 kprintf("cache_cleanneg: race avoided\n");
3025                                 _cache_unlock(ncp);
3026                         }
3027                 } else {
3028                         _cache_drop(ncp);
3029                 }
3030                 --count;
3031         }
3032 }
3033
3034 /*
3035  * Clean out positive cache entries when too many have accumulated.
3036  *
3037  * MPSAFE
3038  */
3039 static void
3040 _cache_cleanpos(int count)
3041 {
3042         static volatile int rover;
3043         struct nchash_head *nchpp;
3044         struct namecache *ncp;
3045         int rover_copy;
3046
3047         /*
3048          * Attempt to clean out the specified number of negative cache
3049          * entries.
3050          */
3051         while (count) {
3052                 rover_copy = ++rover;   /* MPSAFEENOUGH */
3053                 cpu_ccfence();
3054                 nchpp = NCHHASH(rover_copy);
3055
3056                 spin_lock(&nchpp->spin);
3057                 ncp = LIST_FIRST(&nchpp->list);
3058                 if (ncp)
3059                         _cache_hold(ncp);
3060                 spin_unlock(&nchpp->spin);
3061
3062                 if (ncp) {
3063                         if (_cache_lock_special(ncp) == 0) {
3064                                 ncp = cache_zap(ncp, 1);
3065                                 if (ncp)
3066                                         _cache_drop(ncp);
3067                         } else {
3068                                 _cache_drop(ncp);
3069                         }
3070                 }
3071                 --count;
3072         }
3073 }
3074
3075 /*
3076  * This is a kitchen sink function to clean out ncps which we
3077  * tried to zap from cache_drop() but failed because we were
3078  * unable to acquire the parent lock.
3079  *
3080  * Such entries can also be removed via cache_inval_vp(), such
3081  * as when unmounting.
3082  *
3083  * MPSAFE
3084  */
3085 static void
3086 _cache_cleandefered(void)
3087 {
3088         struct nchash_head *nchpp;
3089         struct namecache *ncp;
3090         struct namecache dummy;
3091         int i;
3092
3093         numdefered = 0;
3094         bzero(&dummy, sizeof(dummy));
3095         dummy.nc_flag = NCF_DESTROYED;
3096
3097         for (i = 0; i <= nchash; ++i) {
3098                 nchpp = &nchashtbl[i];
3099
3100                 spin_lock(&nchpp->spin);
3101                 LIST_INSERT_HEAD(&nchpp->list, &dummy, nc_hash);
3102                 ncp = &dummy;
3103                 while ((ncp = LIST_NEXT(ncp, nc_hash)) != NULL) {
3104                         if ((ncp->nc_flag & NCF_DEFEREDZAP) == 0)
3105                                 continue;
3106                         LIST_REMOVE(&dummy, nc_hash);
3107                         LIST_INSERT_AFTER(ncp, &dummy, nc_hash);
3108                         _cache_hold(ncp);
3109                         spin_unlock(&nchpp->spin);
3110                         if (_cache_lock_nonblock(ncp) == 0) {
3111                                 ncp->nc_flag &= ~NCF_DEFEREDZAP;
3112                                 _cache_unlock(ncp);
3113                         }
3114                         _cache_drop(ncp);
3115                         spin_lock(&nchpp->spin);
3116                         ncp = &dummy;
3117                 }
3118                 LIST_REMOVE(&dummy, nc_hash);
3119                 spin_unlock(&nchpp->spin);
3120         }
3121 }
3122
3123 /*
3124  * Name cache initialization, from vfsinit() when we are booting
3125  */
3126 void
3127 nchinit(void)
3128 {
3129         int i;
3130         globaldata_t gd;
3131
3132         /* initialise per-cpu namecache effectiveness statistics. */
3133         for (i = 0; i < ncpus; ++i) {
3134                 gd = globaldata_find(i);
3135                 gd->gd_nchstats = &nchstats[i];
3136         }
3137         TAILQ_INIT(&ncneglist);
3138         spin_init(&ncspin);
3139         nchashtbl = hashinit_ext(desiredvnodes / 2,
3140                                  sizeof(struct nchash_head),
3141                                  M_VFSCACHE, &nchash);
3142         for (i = 0; i <= (int)nchash; ++i) {
3143                 LIST_INIT(&nchashtbl[i].list);
3144                 spin_init(&nchashtbl[i].spin);
3145         }
3146         for (i = 0; i < NCMOUNT_NUMCACHE; ++i)
3147                 spin_init(&ncmount_cache[i].spin);
3148         nclockwarn = 5 * hz;
3149 }
3150
3151 /*
3152  * Called from start_init() to bootstrap the root filesystem.  Returns
3153  * a referenced, unlocked namecache record.
3154  */
3155 void
3156 cache_allocroot(struct nchandle *nch, struct mount *mp, struct vnode *vp)
3157 {
3158         nch->ncp = cache_alloc(0);
3159         nch->mount = mp;
3160         atomic_add_int(&mp->mnt_refs, 1);
3161         if (vp)
3162                 _cache_setvp(nch->mount, nch->ncp, vp);
3163 }
3164
3165 /*
3166  * vfs_cache_setroot()
3167  *
3168  *      Create an association between the root of our namecache and
3169  *      the root vnode.  This routine may be called several times during
3170  *      booting.
3171  *
3172  *      If the caller intends to save the returned namecache pointer somewhere
3173  *      it must cache_hold() it.
3174  */
3175 void
3176 vfs_cache_setroot(struct vnode *nvp, struct nchandle *nch)
3177 {
3178         struct vnode *ovp;
3179         struct nchandle onch;
3180
3181         ovp = rootvnode;
3182         onch = rootnch;
3183         rootvnode = nvp;
3184         if (nch)
3185                 rootnch = *nch;
3186         else
3187                 cache_zero(&rootnch);
3188         if (ovp)
3189                 vrele(ovp);
3190         if (onch.ncp)
3191                 cache_drop(&onch);
3192 }
3193
3194 /*
3195  * XXX OLD API COMPAT FUNCTION.  This really messes up the new namecache
3196  * topology and is being removed as quickly as possible.  The new VOP_N*()
3197  * API calls are required to make specific adjustments using the supplied
3198  * ncp pointers rather then just bogusly purging random vnodes.
3199  *
3200  * Invalidate all namecache entries to a particular vnode as well as
3201  * any direct children of that vnode in the namecache.  This is a
3202  * 'catch all' purge used by filesystems that do not know any better.
3203  *
3204  * Note that the linkage between the vnode and its namecache entries will
3205  * be removed, but the namecache entries themselves might stay put due to
3206  * active references from elsewhere in the system or due to the existance of
3207  * the children.   The namecache topology is left intact even if we do not
3208  * know what the vnode association is.  Such entries will be marked
3209  * NCF_UNRESOLVED.
3210  */
3211 void
3212 cache_purge(struct vnode *vp)
3213 {
3214         cache_inval_vp(vp, CINV_DESTROY | CINV_CHILDREN);
3215 }
3216
3217 /*
3218  * Flush all entries referencing a particular filesystem.
3219  *
3220  * Since we need to check it anyway, we will flush all the invalid
3221  * entries at the same time.
      *
      * NOTE: this implementation is compiled out (#if 0).  The bare XXX
      * tokens that follow the spin_lock_wr()/spin_unlock_wr() calls are not
      * valid C, so the body cannot simply be re-enabled as written;
      * presumably they mark the spinlock usage as still needing review --
      * TODO confirm before resurrecting this code.
      *
      * The loop below only acts on entries whose nc_mount matches mp; every
      * other entry is merely held and dropped again.
3222  */
3223 #if 0
3224
3225 void
3226 cache_purgevfs(struct mount *mp)
3227 {
3228         struct nchash_head *nchpp;
3229         struct namecache *ncp, *nnp;
3230
3231         /*
3232          * Scan hash tables for applicable entries.
3233          */
3234         for (nchpp = &nchashtbl[nchash]; nchpp >= nchashtbl; nchpp--) { /* from bucket index nchash down to bucket 0 */
3235                 spin_lock_wr(&nchpp->spin); XXX
3236                 ncp = LIST_FIRST(&nchpp->list);
3237                 if (ncp)
3238                         _cache_hold(ncp);
3239                 while (ncp) {
3240                         nnp = LIST_NEXT(ncp, nc_hash);
3241                         if (nnp)
3242                                 _cache_hold(nnp); /* hold the successor before the current entry is zapped or dropped */
3243                         if (ncp->nc_mount == mp) {
3244                                 _cache_lock(ncp);
3245                                 ncp = cache_zap(ncp, 0); /* NOTE(review): a non-NULL result is dropped below -- confirm cache_zap() contract */
3246                                 if (ncp)
3247                                         _cache_drop(ncp);
3248                         } else {
3249                                 _cache_drop(ncp);
3250                         }
3251                         ncp = nnp;
3252                 }
3253                 spin_unlock_wr(&nchpp->spin); XXX
3254         }
3255 }
3256
3257 #endif
3258
3259 static int disablecwd; /* when non-zero sys___getcwd() fails with ENODEV */
3260 SYSCTL_INT(_debug, OID_AUTO, disablecwd, CTLFLAG_RW, &disablecwd, 0,
3261     "Disable getcwd");
3262
3263 static u_long numcwdcalls; /* incremented on every kern_getcwd() call */
3264 SYSCTL_ULONG(_vfs_cache, OID_AUTO, numcwdcalls, CTLFLAG_RD, &numcwdcalls, 0,
3265     "Number of current directory resolution calls");
3266 static u_long numcwdfailnf; /* kern_getcwd() ran out of parents before reaching the process root (ENOENT) */
3267 SYSCTL_ULONG(_vfs_cache, OID_AUTO, numcwdfailnf, CTLFLAG_RD, &numcwdfailnf, 0,
3268     "Number of current directory failures due to lack of file");
3269 static u_long numcwdfailsz; /* kern_getcwd() could not fit the path into the supplied buffer (ERANGE) */
3270 SYSCTL_ULONG(_vfs_cache, OID_AUTO, numcwdfailsz, CTLFLAG_RD, &numcwdfailsz, 0,
3271     "Number of current directory failures due to large result");
3272 static u_long numcwdfound; /* kern_getcwd() produced a complete path */
3273 SYSCTL_ULONG(_vfs_cache, OID_AUTO, numcwdfound, CTLFLAG_RD, &numcwdfound, 0,
3274     "Number of current directory resolution successes");
3275
3276 /*
3277  * MPALMOSTSAFE
3278  */
3279 int
3280 sys___getcwd(struct __getcwd_args *uap)
3281 {
3282         u_int buflen;
3283         int error;
3284         char *buf;
3285         char *bp;
3286
3287         if (disablecwd)
3288                 return (ENODEV);
3289
3290         buflen = uap->buflen;
3291         if (buflen == 0)
3292                 return (EINVAL);
3293         if (buflen > MAXPATHLEN)
3294                 buflen = MAXPATHLEN;
3295
3296         buf = kmalloc(buflen, M_TEMP, M_WAITOK);
3297         bp = kern_getcwd(buf, buflen, &error);
3298         if (error == 0)
3299                 error = copyout(bp, uap->buf, strlen(bp) + 1);
3300         kfree(buf, M_TEMP);
3301         return (error);
3302 }
3303
3304 char *
3305 kern_getcwd(char *buf, size_t buflen, int *error)
3306 {
3307         struct proc *p = curproc;
3308         char *bp;
3309         int i, slash_prefixed;
3310         struct filedesc *fdp;
3311         struct nchandle nch;
3312         struct namecache *ncp;
3313
3314         numcwdcalls++;
3315         bp = buf;
3316         bp += buflen - 1;
3317         *bp = '\0';
3318         fdp = p->p_fd;
3319         slash_prefixed = 0;
3320
3321         nch = fdp->fd_ncdir;
3322         ncp = nch.ncp;
3323         if (ncp)
3324                 _cache_hold(ncp);
3325
3326         while (ncp && (ncp != fdp->fd_nrdir.ncp ||
3327                nch.mount != fdp->fd_nrdir.mount)
3328         ) {
3329                 /*
3330                  * While traversing upwards if we encounter the root
3331                  * of the current mount we have to skip to the mount point
3332                  * in the underlying filesystem.
3333                  */
3334                 if (ncp == nch.mount->mnt_ncmountpt.ncp) {
3335                         nch = nch.mount->mnt_ncmounton;
3336                         _cache_drop(ncp);
3337                         ncp = nch.ncp;
3338                         if (ncp)
3339                                 _cache_hold(ncp);
3340                         continue;
3341                 }
3342
3343                 /*
3344                  * Prepend the path segment
3345                  */
3346                 for (i = ncp->nc_nlen - 1; i >= 0; i--) {
3347                         if (bp == buf) {
3348                                 numcwdfailsz++;
3349                                 *error = ERANGE;
3350                                 bp = NULL;
3351                                 goto done;
3352                         }
3353                         *--bp = ncp->nc_name[i];
3354                 }
3355                 if (bp == buf) {
3356                         numcwdfailsz++;
3357                         *error = ERANGE;
3358                         bp = NULL;
3359                         goto done;
3360                 }
3361                 *--bp = '/';
3362                 slash_prefixed = 1;
3363
3364                 /*
3365                  * Go up a directory.  This isn't a mount point so we don't
3366                  * have to check again.
3367                  */
3368                 while ((nch.ncp = ncp->nc_parent) != NULL) {
3369                         _cache_lock(ncp);
3370                         if (nch.ncp != ncp->nc_parent) {
3371                                 _cache_unlock(ncp);
3372                                 continue;
3373                         }
3374                         _cache_hold(nch.ncp);
3375                         _cache_unlock(ncp);
3376                         break;
3377                 }
3378                 _cache_drop(ncp);
3379                 ncp = nch.ncp;
3380         }
3381         if (ncp == NULL) {
3382                 numcwdfailnf++;
3383                 *error = ENOENT;
3384                 bp = NULL;
3385                 goto done;
3386         }
3387         if (!slash_prefixed) {
3388                 if (bp == buf) {
3389                         numcwdfailsz++;
3390                         *error = ERANGE;
3391                         bp = NULL;
3392                         goto done;
3393                 }
3394                 *--bp = '/';
3395         }
3396         numcwdfound++;
3397         *error = 0;
3398 done:
3399         if (ncp)
3400                 _cache_drop(ncp);
3401         return (bp);
3402 }
3403
3404 /*
3405  * Thus begins the fullpath magic.
3406  *
3407  * The nchp passed to cache_fullpath() is referenced but not locked.
3408  */
3409 static int disablefullpath; /* when non-zero vn_fullpath() fails with ENODEV */
3410 SYSCTL_INT(_debug, OID_AUTO, disablefullpath, CTLFLAG_RW,
3411     &disablefullpath, 0,
3412     "Disable fullpath lookups");
3413
3414 static u_int numfullpathcalls; /* adjusted with atomic_add_int() by vn_fullpath() and cache_fullpath() */
3415 SYSCTL_UINT(_vfs_cache, OID_AUTO, numfullpathcalls, CTLFLAG_RD,
3416     &numfullpathcalls, 0,
3417     "Number of full path resolutions in progress");
3418 static u_int numfullpathfailnf; /* cache_fullpath() ran out of parents before reaching the base (ENOENT) */
3419 SYSCTL_UINT(_vfs_cache, OID_AUTO, numfullpathfailnf, CTLFLAG_RD,
3420     &numfullpathfailnf, 0,
3421     "Number of full path resolution failures due to lack of file");
3422 static u_int numfullpathfailsz; /* cache_fullpath() overflowed its MAXPATHLEN buffer (reported as ENOMEM) */
3423 SYSCTL_UINT(_vfs_cache, OID_AUTO, numfullpathfailsz, CTLFLAG_RD,
3424     &numfullpathfailsz, 0,
3425     "Number of full path resolution failures due to insufficient memory");
3426 static u_int numfullpathfound; /* cache_fullpath() produced a complete path */
3427 SYSCTL_UINT(_vfs_cache, OID_AUTO, numfullpathfound, CTLFLAG_RD,
3428     &numfullpathfound, 0,
3429     "Number of full path resolution successes");
3430
3431 int
3432 cache_fullpath(struct proc *p, struct nchandle *nchp, struct nchandle *nchbase,
3433                char **retbuf, char **freebuf, int guess)
3434 {
3435         struct nchandle fd_nrdir;
3436         struct nchandle nch;
3437         struct namecache *ncp;
3438         struct mount *mp, *new_mp;
3439         char *bp, *buf;
3440         int slash_prefixed;
3441         int error = 0;
3442         int i;
3443
3444         atomic_add_int(&numfullpathcalls, -1);
3445
3446         *retbuf = NULL;
3447         *freebuf = NULL;
3448
3449         buf = kmalloc(MAXPATHLEN, M_TEMP, M_WAITOK);
3450         bp = buf + MAXPATHLEN - 1;
3451         *bp = '\0';
3452         if (nchbase)
3453                 fd_nrdir = *nchbase;
3454         else if (p != NULL)
3455                 fd_nrdir = p->p_fd->fd_nrdir;
3456         else
3457                 fd_nrdir = rootnch;
3458         slash_prefixed = 0;
3459         nch = *nchp;
3460         ncp = nch.ncp;
3461         if (ncp)
3462                 _cache_hold(ncp);
3463         mp = nch.mount;
3464
3465         while (ncp && (ncp != fd_nrdir.ncp || mp != fd_nrdir.mount)) {
3466                 new_mp = NULL;
3467
3468                 /*
3469                  * If we are asked to guess the upwards path, we do so whenever
3470                  * we encounter an ncp marked as a mountpoint. We try to find
3471                  * the actual mountpoint by finding the mountpoint with this
3472                  * ncp.
3473                  */
3474                 if (guess && (ncp->nc_flag & NCF_ISMOUNTPT)) {
3475                         new_mp = mount_get_by_nc(ncp);
3476                 }
3477                 /*
3478                  * While traversing upwards if we encounter the root
3479                  * of the current mount we have to skip to the mount point.
3480                  */
3481                 if (ncp == mp->mnt_ncmountpt.ncp) {
3482                         new_mp = mp;
3483                 }
3484                 if (new_mp) {
3485                         nch = new_mp->mnt_ncmounton;
3486                         _cache_drop(ncp);
3487                         ncp = nch.ncp;
3488                         if (ncp)
3489                                 _cache_hold(ncp);
3490                         mp = nch.mount;
3491                         continue;
3492                 }
3493
3494                 /*
3495                  * Prepend the path segment
3496                  */
3497                 for (i = ncp->nc_nlen - 1; i >= 0; i--) {
3498                         if (bp == buf) {
3499                                 numfullpathfailsz++;
3500                                 kfree(buf, M_TEMP);
3501                                 error = ENOMEM;
3502                                 goto done;
3503                         }
3504                         *--bp = ncp->nc_name[i];
3505                 }
3506                 if (bp == buf) {
3507                         numfullpathfailsz++;
3508                         kfree(buf, M_TEMP);
3509                         error = ENOMEM;
3510                         goto done;
3511                 }
3512                 *--bp = '/';
3513                 slash_prefixed = 1;
3514
3515                 /*
3516                  * Go up a directory.  This isn't a mount point so we don't
3517                  * have to check again.
3518                  *
3519                  * We can only safely access nc_parent with ncp held locked.
3520                  */
3521                 while ((nch.ncp = ncp->nc_parent) != NULL) {
3522                         _cache_lock(ncp);
3523                         if (nch.ncp != ncp->nc_parent) {
3524                                 _cache_unlock(ncp);
3525                                 continue;
3526                         }
3527                         _cache_hold(nch.ncp);
3528                         _cache_unlock(ncp);
3529                         break;
3530                 }
3531                 _cache_drop(ncp);
3532                 ncp = nch.ncp;
3533         }
3534         if (ncp == NULL) {
3535                 numfullpathfailnf++;
3536                 kfree(buf, M_TEMP);
3537                 error = ENOENT;
3538                 goto done;
3539         }
3540
3541         if (!slash_prefixed) {
3542                 if (bp == buf) {
3543                         numfullpathfailsz++;
3544                         kfree(buf, M_TEMP);
3545                         error = ENOMEM;
3546                         goto done;
3547                 }
3548                 *--bp = '/';
3549         }
3550         numfullpathfound++;
3551         *retbuf = bp;
3552         *freebuf = buf;
3553         error = 0;
3554 done:
3555         if (ncp)
3556                 _cache_drop(ncp);
3557         return(error);
3558 }
3559
3560 int
3561 vn_fullpath(struct proc *p, struct vnode *vn, char **retbuf, char **freebuf,
3562     int guess)
3563 {
3564         struct namecache *ncp;
3565         struct nchandle nch;
3566         int error;
3567
3568         *freebuf = NULL;
3569         atomic_add_int(&numfullpathcalls, 1);
3570         if (disablefullpath)
3571                 return (ENODEV);
3572
3573         if (p == NULL)
3574                 return (EINVAL);
3575
3576         /* vn is NULL, client wants us to use p->p_textvp */
3577         if (vn == NULL) {
3578                 if ((vn = p->p_textvp) == NULL)
3579                         return (EINVAL);
3580         }
3581         spin_lock(&vn->v_spin);
3582         TAILQ_FOREACH(ncp, &vn->v_namecache, nc_vnode) {
3583                 if (ncp->nc_nlen)
3584                         break;
3585         }
3586         if (ncp == NULL) {
3587                 spin_unlock(&vn->v_spin);
3588                 return (EINVAL);
3589         }
3590         _cache_hold(ncp);
3591         spin_unlock(&vn->v_spin);
3592
3593         atomic_add_int(&numfullpathcalls, -1);
3594         nch.ncp = ncp;
3595         nch.mount = vn->v_mount;
3596         error = cache_fullpath(p, &nch, NULL, retbuf, freebuf, guess);
3597         _cache_drop(ncp);
3598         return (error);
3599 }