2 * Copyright (c) 2003-2020 The DragonFly Project. All rights reserved.
4 * This code is derived from software contributed to The DragonFly Project
5 * by Matthew Dillon <dillon@backplane.com>
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions
11 * 1. Redistributions of source code must retain the above copyright
12 * notice, this list of conditions and the following disclaimer.
13 * 2. Redistributions in binary form must reproduce the above copyright
14 * notice, this list of conditions and the following disclaimer in
15 * the documentation and/or other materials provided with the
17 * 3. Neither the name of The DragonFly Project nor the names of its
18 * contributors may be used to endorse or promote products derived
19 * from this software without specific, prior written permission.
21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
24 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
25 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
26 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
27 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
28 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
29 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
30 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
31 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
34 * Copyright (c) 1989, 1993, 1995
35 * The Regents of the University of California. All rights reserved.
37 * This code is derived from software contributed to Berkeley by
38 * Poul-Henning Kamp of the FreeBSD Project.
40 * Redistribution and use in source and binary forms, with or without
41 * modification, are permitted provided that the following conditions
43 * 1. Redistributions of source code must retain the above copyright
44 * notice, this list of conditions and the following disclaimer.
45 * 2. Redistributions in binary form must reproduce the above copyright
46 * notice, this list of conditions and the following disclaimer in the
47 * documentation and/or other materials provided with the distribution.
48 * 3. Neither the name of the University nor the names of its contributors
49 * may be used to endorse or promote products derived from this software
50 * without specific prior written permission.
52 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
53 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
54 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
55 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
56 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
57 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
58 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
59 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
60 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
61 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
65 #include <sys/param.h>
66 #include <sys/systm.h>
68 #include <sys/kernel.h>
69 #include <sys/sysctl.h>
70 #include <sys/mount.h>
71 #include <sys/vnode.h>
72 #include <sys/malloc.h>
73 #include <sys/sysproto.h>
74 #include <sys/spinlock.h>
76 #include <sys/nlookup.h>
77 #include <sys/filedesc.h>
78 #include <sys/fnv_hash.h>
79 #include <sys/globaldata.h>
80 #include <sys/kern_syscall.h>
81 #include <sys/dirent.h>
84 #include <sys/spinlock2.h>
86 #define MAX_RECURSION_DEPTH 64
89 * Random lookups in the cache are accomplished with a hash table using
90 * a hash key of (nc_src_vp, name). Each hash chain has its own spin lock,
91 * but we use the ncp->update counter trick to avoid acquiring any
92 * contestable spin-locks during a lookup.
94 * Negative entries may exist and correspond to resolved namecache
95 * structures where nc_vp is NULL. In a negative entry, NCF_WHITEOUT
96 * will be set if the entry corresponds to a whited-out directory entry
97 * (versus simply not finding the entry at all). pcpu_ncache[n].neg_list
98 * is locked via pcpu_ncache[n].neg_spin.
102 * (1) ncp's typically have at least a nc_refs of 1, and usually 2. One
103 * is applicable to direct lookups via the hash table nchpp or via
104 * nc_list (the two are added or removed together). Removal of the ncp
105 * from the hash table drops this reference. The second is applicable
106 * to vp->v_namecache linkages (or negative list linkages), and removal
107 * of the ncp from these lists drops this reference.
109 * On the 1->0 transition of nc_refs the ncp can no longer be referenced
110 * and must be destroyed. No other thread should have access to it at
111 * this point so it can be safely locked and freed without any deadlock
114 * The 1->0 transition can occur at almost any juncture and so cache_drop()
115 * deals with it directly.
117 * (2) Once the 1->0 transition occurs, the entity that caused the transition
118 * will be responsible for destroying the ncp. The ncp cannot be on any
119 * list or hash at this time, or be held by anyone other than the caller
120 * responsible for the transition.
122 * (3) A ncp must be locked in order to modify it.
124 * (5) ncp locks are ordered, child-to-parent. Child first, then parent.
125 * This may seem backwards but forward-scans use the hash table and thus
126 * can hold the parent unlocked while traversing downward. Deletions,
127 * on the other hand, tend to propagate bottom-up since the ref on the
128 * parent is dropped as the children go away.
130 * (6) Both parent and child must be locked in order to enter the child onto
131 * the parent's nc_list.
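/*
 * Illustrative sketch of the child-to-parent lock order described above;
 * a hedged example, not part of the original source.  'child' is assumed
 * to already be referenced and exclusively locked by the caller.
 */
#if 0
static void
example_lock_child_then_parent(struct namecache *child)
{
	struct namecache *par;

	par = child->nc_parent;		/* stable while the child is locked */
	_cache_hold(par);		/* ref the parent before locking it */
	_cache_lock(par);		/* child first, then parent */
	/* ... modify linkages with both ncp's locked ... */
	_cache_put(par);		/* unlock and drop the parent */
}
#endif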
135 * Structures associated with name caching.
137 #define NCHHASH(hash) (&nchashtbl[(hash) & nchash])
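/*
 * Illustrative sketch (not part of the original source) of how a hash chain
 * is selected for a (parent ncp, name) pair, mirroring the computation used
 * by cache_rename() further below; 'par', 'name' and 'nlen' are hypothetical.
 */
#if 0
static struct nchash_head *
example_nchpp_for(struct namecache *par, const char *name, int nlen)
{
	u_int32_t hash;

	hash = fnv_32_buf(name, nlen, FNV1_32_INIT);
	hash = fnv_32_buf(&par, sizeof(par), hash);
	return (NCHHASH(hash));	/* per-chain spinlock is nchpp->spin */
}
#endif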
140 #define NCMOUNT_NUMCACHE (16384) /* power of 2 */
141 #define NCMOUNT_SET (8) /* power of 2 */
143 MALLOC_DEFINE(M_VFSCACHE, "vfscache", "VFS name cache entries");
145 TAILQ_HEAD(nchash_list, namecache);
148 * Don't cachealign, but at least pad to 32 bytes so entries
149 * don't cross a cache line.
152 struct nchash_list list; /* 16 bytes */
153 struct spinlock spin; /* 8 bytes */
154 long pad01; /* 8 bytes */
157 struct ncmount_cache {
158 struct spinlock spin;
159 struct namecache *ncp;
161 struct mount *mp_target;
169 struct spinlock umount_spin; /* cache_findmount/interlock */
170 struct spinlock neg_spin; /* for neg_list and neg_count */
171 struct namecache_list neg_list;
179 __read_mostly static struct nchash_head *nchashtbl;
180 __read_mostly static struct pcpu_ncache *pcpu_ncache;
181 static struct ncmount_cache ncmount_cache[NCMOUNT_NUMCACHE];
184 * ncvp_debug - debug cache_fromdvp(). This is used by the NFS server
185 * to create the namecache infrastructure leading to a dangling vnode.
187 * 0 Only errors are reported
188 * 1 Successes are reported
189 * 2 Successes + the whole directory scan is reported
190 * 3 Force the directory scan code to run as if the parent vnode did not
191 * have a namecache record, even if it does have one.
193 __read_mostly static int ncvp_debug;
194 SYSCTL_INT(_debug, OID_AUTO, ncvp_debug, CTLFLAG_RW, &ncvp_debug, 0,
195 "Namecache debug level (0-3)");
197 __read_mostly static u_long nchash; /* size of hash table - 1 (mask) */
198 SYSCTL_ULONG(_debug, OID_AUTO, nchash, CTLFLAG_RD, &nchash, 0,
199 "Size of namecache hash table");
201 __read_mostly static int ncnegflush = 10; /* burst for negative flush */
202 SYSCTL_INT(_debug, OID_AUTO, ncnegflush, CTLFLAG_RW, &ncnegflush, 0,
203 "Batch flush negative entries");
205 __read_mostly static int ncposflush = 10; /* burst for positive flush */
206 SYSCTL_INT(_debug, OID_AUTO, ncposflush, CTLFLAG_RW, &ncposflush, 0,
207 "Batch flush positive entries");
209 __read_mostly static int ncnegfactor = 16; /* ratio of negative entries */
210 SYSCTL_INT(_debug, OID_AUTO, ncnegfactor, CTLFLAG_RW, &ncnegfactor, 0,
211 "Ratio of namecache negative entries");
213 __read_mostly static int nclockwarn; /* warn on locked entries in ticks */
214 SYSCTL_INT(_debug, OID_AUTO, nclockwarn, CTLFLAG_RW, &nclockwarn, 0,
215 "Warn on locked namecache entries in ticks");
217 __read_mostly static int ncposlimit; /* limit on positive entries */
218 SYSCTL_INT(_debug, OID_AUTO, ncposlimit, CTLFLAG_RW, &ncposlimit, 0,
219 "Limit on the number of positive namecache entries");
221 __read_mostly static int ncp_shared_lock_disable = 0;
222 SYSCTL_INT(_debug, OID_AUTO, ncp_shared_lock_disable, CTLFLAG_RW,
223 &ncp_shared_lock_disable, 0, "Disable shared namecache locks");
225 SYSCTL_INT(_debug, OID_AUTO, vnsize, CTLFLAG_RD, 0, sizeof(struct vnode),
226 "sizeof(struct vnode)");
227 SYSCTL_INT(_debug, OID_AUTO, ncsize, CTLFLAG_RD, 0, sizeof(struct namecache),
228 "sizeof(struct namecache)");
230 __read_mostly static int ncmount_cache_enable = 1;
231 SYSCTL_INT(_debug, OID_AUTO, ncmount_cache_enable, CTLFLAG_RW,
232 &ncmount_cache_enable, 0, "mount point cache");
234 static __inline void _cache_drop(struct namecache *ncp);
235 static int cache_resolve_mp(struct mount *mp);
236 static int cache_findmount_callback(struct mount *mp, void *data);
237 static void _cache_setunresolved(struct namecache *ncp);
238 static void _cache_cleanneg(long count);
239 static void _cache_cleanpos(long count);
240 static void _cache_cleandefered(void);
241 static void _cache_unlink(struct namecache *ncp);
244 * The new name cache statistics (these are rolled up globals and not
245 * modified in the critical path, see struct pcpu_ncache).
247 SYSCTL_NODE(_vfs, OID_AUTO, cache, CTLFLAG_RW, 0, "Name cache statistics");
248 static long vfscache_negs;
249 SYSCTL_LONG(_vfs_cache, OID_AUTO, numneg, CTLFLAG_RD, &vfscache_negs, 0,
250 "Number of negative namecache entries");
251 static long vfscache_count;
252 SYSCTL_LONG(_vfs_cache, OID_AUTO, numcache, CTLFLAG_RD, &vfscache_count, 0,
253 "Number of namecaches entries");
254 static long vfscache_leafs;
255 SYSCTL_LONG(_vfs_cache, OID_AUTO, numleafs, CTLFLAG_RD, &vfscache_leafs, 0,
256 "Number of namecaches entries");
257 static long numdefered;
258 SYSCTL_LONG(_debug, OID_AUTO, numdefered, CTLFLAG_RD, &numdefered, 0,
259 "Number of cache entries allocated");
262 struct nchstats nchstats[SMP_MAXCPU];
264 * Export VFS cache effectiveness statistics to user-land.
266 * The statistics are left for aggregation to user-land so
267 * neat things can be achieved, like observing per-CPU cache
271 sysctl_nchstats(SYSCTL_HANDLER_ARGS)
273 struct globaldata *gd;
277 for (i = 0; i < ncpus; ++i) {
278 gd = globaldata_find(i);
279 if ((error = SYSCTL_OUT(req, (void *)&(*gd->gd_nchstats),
280 sizeof(struct nchstats))))
286 SYSCTL_PROC(_vfs_cache, OID_AUTO, nchstats, CTLTYPE_OPAQUE|CTLFLAG_RD,
287 0, 0, sysctl_nchstats, "S,nchstats", "VFS cache effectiveness statistics");
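/*
 * Illustrative user-land consumer of the per-cpu statistics exported by the
 * vfs.cache.nchstats sysctl above; a hedged sketch, not part of this file.
 * It fetches the raw per-cpu array and reports its size; aggregating the
 * counters would simply sum each struct nchstats member across the records.
 */
#if 0
#include <sys/types.h>
#include <sys/sysctl.h>
#include <err.h>
#include <stdio.h>
#include <stdlib.h>

int
main(void)
{
	size_t len = 0;
	void *buf;

	/* first call sizes the buffer, second call fetches the data */
	if (sysctlbyname("vfs.cache.nchstats", NULL, &len, NULL, 0) < 0)
		err(1, "sysctlbyname(vfs.cache.nchstats)");
	if ((buf = malloc(len)) == NULL)
		err(1, "malloc");
	if (sysctlbyname("vfs.cache.nchstats", buf, &len, NULL, 0) < 0)
		err(1, "sysctlbyname(vfs.cache.nchstats)");
	printf("%zu bytes of per-cpu nchstats returned\n", len);
	free(buf);
	return (0);
}
#endif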
289 static void cache_zap(struct namecache *ncp);
292 * Cache mount points and namecache records in order to avoid unnecessary
293 * atomic ops on mnt_refs and ncp->nc_refs. This improves concurrent SMP
294 * performance and is particularly important on multi-socket systems to
295 * reduce cache-line ping-ponging.
297 * Try to keep the pcpu structure within one cache line (~64 bytes).
299 #define MNTCACHE_COUNT 32 /* power of 2, multiple of SET */
300 #define MNTCACHE_SET 8 /* set associativity */
302 struct mntcache_elm {
303 struct namecache *ncp;
310 struct mntcache_elm array[MNTCACHE_COUNT];
313 static struct mntcache pcpu_mntcache[MAXCPU];
316 struct mntcache_elm *
317 _cache_mntcache_hash(void *ptr)
319 struct mntcache_elm *elm;
322 hv = iscsi_crc32(&ptr, sizeof(ptr)) & (MNTCACHE_COUNT - 1);
323 elm = &pcpu_mntcache[mycpu->gd_cpuid].array[hv & ~(MNTCACHE_SET - 1)];
330 _cache_mntref(struct mount *mp)
332 struct mntcache_elm *elm;
336 elm = _cache_mntcache_hash(mp);
337 for (i = 0; i < MNTCACHE_SET; ++i) {
339 mpr = atomic_swap_ptr((void *)&elm->mp, NULL);
340 if (__predict_true(mpr == mp))
343 atomic_add_int(&mpr->mnt_refs, -1);
347 atomic_add_int(&mp->mnt_refs, 1);
352 _cache_mntrel(struct mount *mp)
354 struct mntcache_elm *elm;
355 struct mntcache_elm *best;
361 elm = _cache_mntcache_hash(mp);
363 for (i = 0; i < MNTCACHE_SET; ++i) {
364 if (elm->mp == NULL) {
365 mpr = atomic_swap_ptr((void *)&elm->mp, mp);
366 if (__predict_false(mpr != NULL)) {
367 atomic_add_int(&mpr->mnt_refs, -1);
372 delta1 = ticks - best->ticks;
373 delta2 = ticks - elm->ticks;
374 if (delta2 > delta1 || delta1 < -1 || delta2 < -1)
378 mpr = atomic_swap_ptr((void *)&best->mp, mp);
381 atomic_add_int(&mpr->mnt_refs, -1);
385 * Clears all cached mount points on all cpus. This routine should only
386 * be called when we are waiting for a mount to clear, e.g. so we can
390 cache_clearmntcache(struct mount *target __unused)
394 for (n = 0; n < ncpus; ++n) {
395 struct mntcache *cache = &pcpu_mntcache[n];
396 struct mntcache_elm *elm;
397 struct namecache *ncp;
401 for (i = 0; i < MNTCACHE_COUNT; ++i) {
402 elm = &cache->array[i];
404 mp = atomic_swap_ptr((void *)&elm->mp, NULL);
406 atomic_add_int(&mp->mnt_refs, -1);
409 ncp = atomic_swap_ptr((void *)&elm->ncp, NULL);
418 * Namespace locking. The caller must already hold a reference to the
419 * namecache structure in order to lock/unlock it. The controlling entity
420 * in a 1->0 transition does not need to lock the ncp to dispose of it,
421 * as nobody else will have visibility to it at that point.
423 * Note that holding a locked namecache structure prevents other threads
424 * from making namespace changes (e.g. deleting or creating), prevents
425 * vnode association state changes by other threads, and prevents the
426 * namecache entry from being resolved or unresolved by other threads.
428 * An exclusive lock owner has full authority to associate/disassociate
429 * vnodes and resolve/unresolve the locked ncp.
431 * A shared lock owner only has authority to acquire the underlying vnode,
434 * The primary lock field is nc_lockstatus. nc_locktd is set after the
435 * fact (when locking) or cleared prior to unlocking.
437 * WARNING! Holding a locked ncp will prevent a vnode from being destroyed
438 * or recycled, but it does NOT help you if the vnode had already
439 * initiated a recyclement. If this is important, use cache_get()
440 * rather than cache_lock() (and deal with the differences in the
441 * way the refs counter is handled). Or, alternatively, make an
442 * unconditional call to cache_validate() or cache_resolve()
443 * after cache_lock() returns.
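/*
 * Illustrative sketch of the pattern recommended in the WARNING above;
 * a hedged example, not part of the original source.  'nch' is assumed
 * to be referenced by the caller and 'cred' supplied by the caller.
 */
#if 0
static int
example_lock_and_validate(struct nchandle *nch, struct ucred *cred)
{
	int error;

	cache_lock(nch);			/* ref'd nch -> locked nch */
	error = cache_resolve(nch, cred);	/* re-resolve if it was reclaimed */
	if (error)
		cache_unlock(nch);
	/* on success the caller keeps the lock and unlocks when done */
	return (error);
}
#endif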
447 _cache_lock(struct namecache *ncp)
452 error = lockmgr(&ncp->nc_lock, LK_EXCLUSIVE);
453 while (__predict_false(error == EWOULDBLOCK)) {
455 didwarn = ticks - nclockwarn;
456 kprintf("[diagnostic] cache_lock: "
459 curthread->td_comm, ncp,
460 ncp->nc_nlen, ncp->nc_nlen,
463 error = lockmgr(&ncp->nc_lock, LK_EXCLUSIVE | LK_TIMELOCK);
465 if (__predict_false(didwarn)) {
466 kprintf("[diagnostic] cache_lock: "
467 "%s unblocked %*.*s after %d secs\n",
469 ncp->nc_nlen, ncp->nc_nlen, ncp->nc_name,
470 (int)(ticks - didwarn) / hz);
475 * Release a previously acquired lock.
477 * A concurrent shared-lock acquisition or acquisition/release can
478 * race bit 31 so only drop the ncp if bit 31 was set.
482 _cache_unlock(struct namecache *ncp)
484 lockmgr(&ncp->nc_lock, LK_RELEASE);
488 * Lock ncp exclusively, non-blocking. Return 0 on success.
492 _cache_lock_nonblock(struct namecache *ncp)
496 error = lockmgr(&ncp->nc_lock, LK_EXCLUSIVE | LK_NOWAIT);
497 if (__predict_false(error != 0)) {
504 * This is a special form of _cache_lock() which only succeeds if
505 * it can get a pristine, non-recursive lock. The caller must have
506 * already ref'd the ncp.
508 * On success the ncp will be locked, on failure it will not. The
509 * ref count does not change either way.
511 * We want _cache_lock_special() (on success) to return a definitively
512 * usable vnode or a definitively unresolved ncp.
516 _cache_lock_special(struct namecache *ncp)
518 if (_cache_lock_nonblock(ncp) == 0) {
519 if (lockmgr_oneexcl(&ncp->nc_lock)) {
520 if (ncp->nc_vp && (ncp->nc_vp->v_flag & VRECLAIMED))
521 _cache_setunresolved(ncp);
530 * Shared lock, guarantees vp held
532 * The shared lock holds vp on the 0->1 transition. It is possible to race
533 * another shared lock release, preventing the other release from dropping
534 * the vnode and clearing bit 31.
536 * If it is not set then we are responsible for setting it, and this
537 * responsibility does not race with anyone else.
541 _cache_lock_shared(struct namecache *ncp)
546 error = lockmgr(&ncp->nc_lock, LK_SHARED | LK_TIMELOCK);
547 while (__predict_false(error == EWOULDBLOCK)) {
549 didwarn = ticks - nclockwarn;
550 kprintf("[diagnostic] cache_lock_shared: "
553 curthread->td_comm, ncp,
554 ncp->nc_nlen, ncp->nc_nlen,
557 error = lockmgr(&ncp->nc_lock, LK_SHARED | LK_TIMELOCK);
559 if (__predict_false(didwarn)) {
560 kprintf("[diagnostic] cache_lock_shared: "
561 "%s unblocked %*.*s after %d secs\n",
563 ncp->nc_nlen, ncp->nc_nlen, ncp->nc_name,
564 (int)(ticks - didwarn) / hz);
569 * Shared lock, guarantees vp held. Non-blocking. Returns 0 on success
573 _cache_lock_shared_nonblock(struct namecache *ncp)
577 error = lockmgr(&ncp->nc_lock, LK_SHARED | LK_NOWAIT);
578 if (__predict_false(error != 0)) {
585 * This function tries to get a shared lock but will back-off to an
588 * (1) Some other thread is trying to obtain an exclusive lock
589 * (to prevent the exclusive requester from getting livelocked out
590 * by many shared locks).
592 * (2) The current thread already owns an exclusive lock (to avoid
595 * WARNING! On machines with lots of cores we really want to try hard to
596 * get a shared lock or concurrent path lookups can chain-react
597 * into a very high-latency exclusive lock.
599 * This is very evident in dsynth's initial scans.
603 _cache_lock_shared_special(struct namecache *ncp)
606 * Only honor a successful shared lock (returning 0) if there is
607 * no exclusive request pending and the vnode, if present, is not
608 * in a reclaimed state.
610 if (_cache_lock_shared_nonblock(ncp) == 0) {
611 if (__predict_true(!lockmgr_exclpending(&ncp->nc_lock))) {
612 if (ncp->nc_vp == NULL ||
613 (ncp->nc_vp->v_flag & VRECLAIMED) == 0) {
622 * Non-blocking shared lock failed. If we already own the exclusive
623 * lock just acquire another exclusive lock (instead of deadlocking).
624 * Otherwise acquire a shared lock.
626 if (lockstatus(&ncp->nc_lock, curthread) == LK_EXCLUSIVE) {
630 _cache_lock_shared(ncp);
636 _cache_lockstatus(struct namecache *ncp)
640 status = lockstatus(&ncp->nc_lock, curthread);
641 if (status == 0 || status == LK_EXCLOTHER)
647 * cache_hold() and cache_drop() prevent the premature deletion of a
648 * namecache entry but do not prevent operations (such as zapping) on
649 * that namecache entry.
651 * This routine may only be called from outside this source module if
652 * nc_refs is already deterministically at least 1, such as being
653 * associated with e.g. a process, file descriptor, or some other entity.
655 * Only the above situations, similar situations within this module where
656 * the ref count is deterministically at least 1, or when the ncp is found
657 * via the nchpp (hash table) lookup, can bump nc_refs.
659 * Very specifically, a ncp found via nc_list CANNOT bump nc_refs. It
660 * can still be removed from the nc_list, however, as long as the caller
661 * can acquire its lock (in the wrong order).
663 * This is a rare case where callers are allowed to hold a spinlock,
664 * so we can't acquire one ourselves.
668 _cache_hold(struct namecache *ncp)
670 KKASSERT(ncp->nc_refs > 0);
671 atomic_add_int(&ncp->nc_refs, 1);
677 * Drop a cache entry.
679 * The 1->0 transition is special and requires the caller to destroy the
680 * entry. It means that the ncp is no longer on a nchpp list (since that
681 * would mean there was still a ref). The ncp could still be on a nc_list
682 * but will not have any child of its own, again because nc_refs is now 0
683 * and children would have a ref to their parent.
685 * Once the 1->0 transition is made, nc_refs cannot be incremented again.
689 _cache_drop(struct namecache *ncp)
691 if (atomic_fetchadd_int(&ncp->nc_refs, -1) == 1) {
693 * Executed unlocked (no need to lock on last drop)
695 _cache_setunresolved(ncp);
700 ncp->nc_refs = -1; /* safety */
702 kfree(ncp->nc_name, M_VFSCACHE);
703 kfree(ncp, M_VFSCACHE);
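/*
 * Illustrative hold/drop pairing at the nchandle level; a hedged sketch,
 * not part of the original source.  It assumes the handle is already
 * deterministically referenced (e.g. via a file descriptor), as required
 * by the cache_hold() rules above.
 */
#if 0
static void
example_hold_drop(struct nchandle *nch)
{
	cache_hold(nch);	/* bump the ncp ref and the mount ref */
	/* ... use fields that are stable while a ref is held ... */
	cache_drop(nch);	/* drop both refs; 1->0 destroys the ncp */
}
#endif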
708 * Link a new namecache entry to its parent and to the hash table. Be
709 * careful to avoid races if vhold() blocks in the future.
711 * Both ncp and par must be referenced and locked. The reference is
712 * transferred to the nchpp (and, most notably, NOT to the parent list).
714 * NOTE: The hash table spinlock is held across this call, we can't do
718 _cache_link_parent(struct namecache *ncp, struct namecache *par,
719 struct nchash_head *nchpp)
721 struct pcpu_ncache *pn = &pcpu_ncache[mycpu->gd_cpuid];
723 KKASSERT(ncp->nc_parent == NULL);
724 ncp->nc_parent = par;
725 ncp->nc_head = nchpp;
728 * Set inheritance flags. Note that the parent flags may be
729 * stale due to getattr potentially not having been run yet
730 * (it gets run during nlookup()'s).
732 ncp->nc_flag &= ~(NCF_SF_PNOCACHE | NCF_UF_PCACHE);
733 if (par->nc_flag & (NCF_SF_NOCACHE | NCF_SF_PNOCACHE))
734 ncp->nc_flag |= NCF_SF_PNOCACHE;
735 if (par->nc_flag & (NCF_UF_CACHE | NCF_UF_PCACHE))
736 ncp->nc_flag |= NCF_UF_PCACHE;
739 * Add to hash table and parent, adjust accounting
741 TAILQ_INSERT_HEAD(&nchpp->list, ncp, nc_hash);
742 atomic_add_long(&pn->vfscache_count, 1);
743 if (TAILQ_EMPTY(&ncp->nc_list))
744 atomic_add_long(&pn->vfscache_leafs, 1);
746 if (TAILQ_EMPTY(&par->nc_list)) {
747 TAILQ_INSERT_HEAD(&par->nc_list, ncp, nc_entry);
748 atomic_add_long(&pn->vfscache_leafs, -1);
750 * Any vp associated with an ncp which has children must
751 * be held to prevent it from being recycled.
756 TAILQ_INSERT_HEAD(&par->nc_list, ncp, nc_entry);
758 _cache_hold(par); /* add nc_parent ref */
762 * Remove the parent and hash associations from a namecache structure.
763 * Drop the ref-count on the parent. The caller receives the ref
764 * from the ncp's nchpp linkage that was removed and may forward that
765 * ref to a new linkage.
767 * The caller usually holds an additional ref on the ncp so the unlink
768 * cannot be the final drop. XXX should not be necessary now since the
769 * caller receives the ref from the nchpp linkage, assuming the ncp
770 * was linked in the first place.
772 * ncp must be locked, which means that there won't be any nc_parent
773 * removal races. This routine will acquire a temporary lock on
774 * the parent as well as the appropriate hash chain.
777 _cache_unlink_parent(struct namecache *ncp)
779 struct pcpu_ncache *pn = &pcpu_ncache[mycpu->gd_cpuid];
780 struct namecache *par;
781 struct vnode *dropvp;
782 struct nchash_head *nchpp;
784 if ((par = ncp->nc_parent) != NULL) {
786 KKASSERT(ncp->nc_parent == par);
788 /* don't add a ref, we drop the nchpp ref later */
790 nchpp = ncp->nc_head;
791 spin_lock(&nchpp->spin);
794 * Remove from hash table and parent, adjust accounting
796 TAILQ_REMOVE(&ncp->nc_head->list, ncp, nc_hash);
797 TAILQ_REMOVE(&par->nc_list, ncp, nc_entry);
798 atomic_add_long(&pn->vfscache_count, -1);
799 if (TAILQ_EMPTY(&ncp->nc_list))
800 atomic_add_long(&pn->vfscache_leafs, -1);
803 if (TAILQ_EMPTY(&par->nc_list)) {
804 atomic_add_long(&pn->vfscache_leafs, 1);
808 ncp->nc_parent = NULL;
810 spin_unlock(&nchpp->spin);
812 _cache_drop(par); /* drop nc_parent ref */
815 * We can only safely vdrop with no spinlocks held.
823 * Allocate a new namecache structure. Most of the code does not require
824 * zero-termination of the string but it makes vop_compat_ncreate() easier.
826 * The returned ncp will be locked and referenced. The ref is generally meant
827 * to be transferred to the nchpp linkage.
829 static struct namecache *
830 cache_alloc(int nlen)
832 struct namecache *ncp;
834 ncp = kmalloc(sizeof(*ncp), M_VFSCACHE, M_WAITOK|M_ZERO);
836 ncp->nc_name = kmalloc(nlen + 1, M_VFSCACHE, M_WAITOK);
838 ncp->nc_flag = NCF_UNRESOLVED;
839 ncp->nc_error = ENOTCONN; /* needs to be resolved */
841 TAILQ_INIT(&ncp->nc_list);
842 lockinit(&ncp->nc_lock, "ncplk", hz, LK_CANRECURSE);
843 lockmgr(&ncp->nc_lock, LK_EXCLUSIVE);
849 * Can only be called for the case where the ncp has never been
850 * associated with anything (so no spinlocks are needed).
853 _cache_free(struct namecache *ncp)
855 KKASSERT(ncp->nc_refs == 1);
857 kfree(ncp->nc_name, M_VFSCACHE);
858 kfree(ncp, M_VFSCACHE);
862 * [re]initialize a nchandle.
865 cache_zero(struct nchandle *nch)
872 * Ref and deref a nchandle structure (ncp + mp)
874 * The caller must specify a stable ncp pointer, typically meaning the
875 * ncp is already referenced but this can also occur indirectly through
876 * e.g. holding a lock on a direct child.
878 * WARNING: Caller may hold an unrelated read spinlock, which means we can't
879 * use read spinlocks here.
882 cache_hold(struct nchandle *nch)
884 _cache_hold(nch->ncp);
885 _cache_mntref(nch->mount);
890 * Create a copy of a namecache handle for an already-referenced
894 cache_copy(struct nchandle *nch, struct nchandle *target)
896 struct namecache *ncp;
898 struct mntcache_elm *elm;
899 struct namecache *ncpr;
907 elm = _cache_mntcache_hash(ncp);
908 for (i = 0; i < MNTCACHE_SET; ++i) {
909 if (elm->ncp == ncp) {
910 ncpr = atomic_swap_ptr((void *)&elm->ncp, NULL);
926 * Drop the nchandle, but try to cache the ref to avoid global atomic
927 * ops. This is typically done on the system root and jail root nchandles.
930 cache_drop_and_cache(struct nchandle *nch, int elmno)
932 struct mntcache_elm *elm;
933 struct mntcache_elm *best;
934 struct namecache *ncpr;
941 _cache_drop(nch->ncp);
945 _cache_mntrel(nch->mount);
951 elm = _cache_mntcache_hash(nch->ncp);
953 for (i = 0; i < MNTCACHE_SET; ++i) {
954 if (elm->ncp == NULL) {
955 ncpr = atomic_swap_ptr((void *)&elm->ncp, nch->ncp);
956 _cache_mntrel(nch->mount);
964 delta1 = ticks - best->ticks;
965 delta2 = ticks - elm->ticks;
966 if (delta2 > delta1 || delta1 < -1 || delta2 < -1)
970 ncpr = atomic_swap_ptr((void *)&best->ncp, nch->ncp);
971 _cache_mntrel(nch->mount);
980 cache_changemount(struct nchandle *nch, struct mount *mp)
983 _cache_mntrel(nch->mount);
988 cache_drop(struct nchandle *nch)
990 _cache_mntrel(nch->mount);
991 _cache_drop(nch->ncp);
997 cache_lockstatus(struct nchandle *nch)
999 return(_cache_lockstatus(nch->ncp));
1003 cache_lock(struct nchandle *nch)
1005 _cache_lock(nch->ncp);
1009 cache_lock_maybe_shared(struct nchandle *nch, int excl)
1011 struct namecache *ncp = nch->ncp;
1013 if (ncp_shared_lock_disable || excl ||
1014 (ncp->nc_flag & NCF_UNRESOLVED)) {
1017 _cache_lock_shared(ncp);
1018 if ((ncp->nc_flag & NCF_UNRESOLVED) == 0) {
1019 if (ncp->nc_vp && (ncp->nc_vp->v_flag & VRECLAIMED)) {
1031 * Relock nch1 given an unlocked nch1 and a locked nch2. The caller
1032 * is responsible for checking both for validity on return as they
1033 * may have become invalid.
1035 * We have to deal with potential deadlocks here, just ping pong
1036 * the lock until we get it (we will always block somewhere when
1037 * looping so this is not cpu-intensive).
1039 * which = 0 nch1 not locked, nch2 is locked
1040 * which = 1 nch1 is locked, nch2 is not locked
1043 cache_relock(struct nchandle *nch1, struct ucred *cred1,
1044 struct nchandle *nch2, struct ucred *cred2)
1052 if (cache_lock_nonblock(nch1) == 0) {
1053 cache_resolve(nch1, cred1);
1058 cache_resolve(nch1, cred1);
1061 if (cache_lock_nonblock(nch2) == 0) {
1062 cache_resolve(nch2, cred2);
1067 cache_resolve(nch2, cred2);
1074 cache_lock_nonblock(struct nchandle *nch)
1076 return(_cache_lock_nonblock(nch->ncp));
1080 cache_unlock(struct nchandle *nch)
1082 _cache_unlock(nch->ncp);
1086 * ref-and-lock, unlock-and-deref functions.
1088 * This function is primarily used by nlookup. Even though cache_lock
1089 * holds the vnode, it is possible that the vnode may have already
1090 * initiated a recyclement.
1092 * We want cache_get() to return a definitively usable vnode or a
1093 * definitively unresolved ncp.
1097 _cache_get(struct namecache *ncp)
1101 if (ncp->nc_vp && (ncp->nc_vp->v_flag & VRECLAIMED))
1102 _cache_setunresolved(ncp);
1107 * Attempt to obtain a shared lock on the ncp. A shared lock will only
1108 * be obtained if the ncp is resolved and the vnode (if not ENOENT) is
1109 * valid. Otherwise an exclusive lock will be acquired instead.
1113 _cache_get_maybe_shared(struct namecache *ncp, int excl)
1115 if (ncp_shared_lock_disable || excl ||
1116 (ncp->nc_flag & NCF_UNRESOLVED)) {
1117 return(_cache_get(ncp));
1120 _cache_lock_shared(ncp);
1121 if ((ncp->nc_flag & NCF_UNRESOLVED) == 0) {
1122 if (ncp->nc_vp && (ncp->nc_vp->v_flag & VRECLAIMED)) {
1124 ncp = _cache_get(ncp);
1129 ncp = _cache_get(ncp);
1136 * NOTE: The same nchandle can be passed for both arguments.
1139 cache_get(struct nchandle *nch, struct nchandle *target)
1141 KKASSERT(nch->ncp->nc_refs > 0);
1142 target->mount = nch->mount;
1143 target->ncp = _cache_get(nch->ncp);
1144 _cache_mntref(target->mount);
1148 cache_get_maybe_shared(struct nchandle *nch, struct nchandle *target, int excl)
1150 KKASSERT(nch->ncp->nc_refs > 0);
1151 target->mount = nch->mount;
1152 target->ncp = _cache_get_maybe_shared(nch->ncp, excl);
1153 _cache_mntref(target->mount);
1157 * Release a held and locked ncp
1161 _cache_put(struct namecache *ncp)
1168 cache_put(struct nchandle *nch)
1170 _cache_mntrel(nch->mount);
1171 _cache_put(nch->ncp);
1177 * Resolve an unresolved ncp by associating a vnode with it. If the
1178 * vnode is NULL, a negative cache entry is created.
1180 * The ncp should be locked on entry and will remain locked on return.
1184 _cache_setvp(struct mount *mp, struct namecache *ncp, struct vnode *vp)
1186 KKASSERT((ncp->nc_flag & NCF_UNRESOLVED) &&
1187 (_cache_lockstatus(ncp) == LK_EXCLUSIVE) &&
1188 ncp->nc_vp == NULL);
1192 * Any vp associated with an ncp which has children must
1193 * be held. Any vp associated with a locked ncp must be held.
1195 if (!TAILQ_EMPTY(&ncp->nc_list))
1197 spin_lock(&vp->v_spin);
1199 TAILQ_INSERT_HEAD(&vp->v_namecache, ncp, nc_vnode);
1200 ++vp->v_namecache_count;
1201 _cache_hold(ncp); /* v_namecache assoc */
1202 spin_unlock(&vp->v_spin);
1203 vhold(vp); /* nc_vp */
1206 * Set auxiliary flags
1208 switch(vp->v_type) {
1210 ncp->nc_flag |= NCF_ISDIR;
1213 ncp->nc_flag |= NCF_ISSYMLINK;
1214 /* XXX cache the contents of the symlink */
1223 * XXX: this is a hack to work around the lack of a real pfs vfs
1227 if (strncmp(mp->mnt_stat.f_fstypename, "null", 5) == 0)
1232 * When creating a negative cache hit we set the
1233 * namecache_gen. A later resolve will clean out the
1234 * negative cache hit if the mount point's namecache_gen
1235 * has changed. Used by devfs, could also be used by
1238 struct pcpu_ncache *pn = &pcpu_ncache[mycpu->gd_cpuid];
1241 ncp->nc_negcpu = mycpu->gd_cpuid;
1242 spin_lock(&pn->neg_spin);
1243 TAILQ_INSERT_TAIL(&pn->neg_list, ncp, nc_vnode);
1244 _cache_hold(ncp); /* neg_list assoc */
1246 spin_unlock(&pn->neg_spin);
1247 atomic_add_long(&pn->vfscache_negs, 1);
1249 ncp->nc_error = ENOENT;
1251 VFS_NCPGEN_SET(mp, ncp);
1253 ncp->nc_flag &= ~(NCF_UNRESOLVED | NCF_DEFEREDZAP);
1257 cache_setvp(struct nchandle *nch, struct vnode *vp)
1259 _cache_setvp(nch->mount, nch->ncp, vp);
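/*
 * Illustrative sketch (not part of the original source): resolving a
 * locked, unresolved nchandle as a negative hit.  Passing a NULL vp places
 * the entry on the per-cpu negative list with nc_error set to ENOENT, as
 * described above; 'nch' is a hypothetical handle owned by the caller.
 */
#if 0
static void
example_resolve_negative(struct nchandle *nch)
{
	/* nch: referenced, exclusively locked, currently unresolved */
	cache_setvp(nch, NULL);		/* the name does not exist */
	/* lookups now see a negative hit with nc_error == ENOENT */
}
#endif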
1266 cache_settimeout(struct nchandle *nch, int nticks)
1268 struct namecache *ncp = nch->ncp;
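	/*
	 * A timeout of 0 means 'no timeout' (see _cache_auto_unresolve_test()),
	 * so if ticks + nticks lands exactly on 0 it is bumped to 1 below.
	 */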
1270 if ((ncp->nc_timeout = ticks + nticks) == 0)
1271 ncp->nc_timeout = 1;
1275 * Disassociate the vnode or negative-cache association and mark a
1276 * namecache entry as unresolved again. Note that the ncp is still
1277 * left in the hash table and still linked to its parent.
1279 * The ncp should be locked and refd on entry and will remain locked and refd
1282 * This routine is normally never called on a directory containing children.
1283 * However, NFS often does just that in its rename() code as a cop-out to
1284 * avoid complex namespace operations. This disconnects a directory vnode
1285 * from its namecache and can cause the OLDAPI and NEWAPI to get out of
1291 _cache_setunresolved(struct namecache *ncp)
1295 if ((ncp->nc_flag & NCF_UNRESOLVED) == 0) {
1296 ncp->nc_flag |= NCF_UNRESOLVED;
1297 ncp->nc_timeout = 0;
1298 ncp->nc_error = ENOTCONN;
1299 if ((vp = ncp->nc_vp) != NULL) {
1300 spin_lock(&vp->v_spin);
1302 TAILQ_REMOVE(&vp->v_namecache, ncp, nc_vnode);
1303 --vp->v_namecache_count;
1304 spin_unlock(&vp->v_spin);
1307 * Any vp associated with an ncp with children is
1308 * held by that ncp. Any vp associated with a locked ncp
1309 * is held by that ncp. These conditions must be
1310 * undone when the vp is cleared out from the ncp.
1312 if (!TAILQ_EMPTY(&ncp->nc_list))
1316 struct pcpu_ncache *pn;
1318 pn = &pcpu_ncache[ncp->nc_negcpu];
1320 atomic_add_long(&pn->vfscache_negs, -1);
1321 spin_lock(&pn->neg_spin);
1322 TAILQ_REMOVE(&pn->neg_list, ncp, nc_vnode);
1324 spin_unlock(&pn->neg_spin);
1326 ncp->nc_flag &= ~(NCF_WHITEOUT|NCF_ISDIR|NCF_ISSYMLINK);
1327 _cache_drop(ncp); /* from v_namecache or neg_list */
1332 * The cache_nresolve() code calls this function to automatically
1333 * set a resolved cache element to unresolved if it has timed out
1334 * or if it is a negative cache hit and the mount point namecache_gen
1338 _cache_auto_unresolve_test(struct mount *mp, struct namecache *ncp)
1341 * Try to zap entries that have timed out. We have
1342 * to be careful here because locked leafs may depend
1343 * on the vnode remaining intact in a parent, so only
1344 * do this under very specific conditions.
1346 if (ncp->nc_timeout && (int)(ncp->nc_timeout - ticks) < 0 &&
1347 TAILQ_EMPTY(&ncp->nc_list)) {
1352 * If a resolved negative cache hit is invalid due to
1353 * the mount's namecache generation being bumped, zap it.
1355 if (ncp->nc_vp == NULL && VFS_NCPGEN_TEST(mp, ncp)) {
1360 * Otherwise we are good
1365 static __inline void
1366 _cache_auto_unresolve(struct mount *mp, struct namecache *ncp)
1369 * Already in an unresolved state, nothing to do.
1371 if ((ncp->nc_flag & NCF_UNRESOLVED) == 0) {
1372 if (_cache_auto_unresolve_test(mp, ncp))
1373 _cache_setunresolved(ncp);
1378 cache_setunresolved(struct nchandle *nch)
1380 _cache_setunresolved(nch->ncp);
1384 * Determine if we can clear NCF_ISMOUNTPT by scanning the mountlist
1385 * looking for matches. This flag tells the lookup code when it must
1386 * check for a mount linkage and also prevents the directories in question
1387 * from being deleted or renamed.
1391 cache_clrmountpt_callback(struct mount *mp, void *data)
1393 struct nchandle *nch = data;
1395 if (mp->mnt_ncmounton.ncp == nch->ncp)
1397 if (mp->mnt_ncmountpt.ncp == nch->ncp)
1403 * Clear NCF_ISMOUNTPT on nch->ncp if it is no longer associated
1404 * with a mount point.
1407 cache_clrmountpt(struct nchandle *nch)
1411 count = mountlist_scan(cache_clrmountpt_callback, nch,
1412 MNTSCAN_FORWARD | MNTSCAN_NOBUSY |
1415 nch->ncp->nc_flag &= ~NCF_ISMOUNTPT;
1419 * Invalidate portions of the namecache topology given a starting entry.
1420 * The passed ncp is set to an unresolved state and:
1422 * The passed ncp must be referenced and locked. The routine may unlock
1423 * and relock ncp several times, and will recheck the children and loop
1424 * to catch races. When done the passed ncp will be returned with the
1425 * reference and lock intact.
1427 * CINV_DESTROY - Set a flag in the passed ncp entry indicating
1428 * that the physical underlying nodes have been
1429 * destroyed... as in deleted. For example, when
1430 * a directory is removed. This will cause record
1431 * lookups on the name to no longer be able to find
1432 * the record and tells the resolver to return failure
1433 * rather than trying to resolve through the parent.
1435 * The topology itself, including ncp->nc_name,
1438 * This only applies to the passed ncp, if CINV_CHILDREN
1439 * is specified the children are not flagged.
1441 * CINV_CHILDREN - Set all children (recursively) to an unresolved
1444 * Note that this will also have the side effect of
1445 * cleaning out any unreferenced nodes in the topology
1446 * from the leaves up as the recursion backs out.
1448 * Note that the topology for any referenced nodes remains intact, but
1449 * the nodes will be marked as having been destroyed and will be set
1450 * to an unresolved state.
1452 * It is possible for cache_inval() to race a cache_resolve(), meaning that
1453 * the namecache entry may not actually be invalidated on return if it was
1454 * revalidated while recursing down into its children. This code guarantees
1455 * that the node(s) will go through an invalidation cycle, but does not
1456 * guarantee that they will remain in an invalidated state.
1458 * Returns non-zero if a revalidation was detected during the invalidation
1459 * recursion, zero otherwise. Note that since only the original ncp is
1460 * locked the revalidation ultimately can only indicate that the original ncp
1461 * *MIGHT* not have been reresolved.
1463 * DEEP RECURSION HANDLING - If a recursive invalidation recurses deeply we
1464 * have to avoid blowing out the kernel stack. We do this by saving the
1465 * deep namecache node and aborting the recursion, then re-recursing at that
1466 * node using a depth-first algorithm in order to allow multiple deep
1467 * recursions to chain through each other, then we restart the invalidation
1472 struct namecache *resume_ncp;
1476 static int _cache_inval_internal(struct namecache *, int, struct cinvtrack *);
1480 _cache_inval(struct namecache *ncp, int flags)
1482 struct cinvtrack track;
1483 struct namecache *ncp2;
1487 track.resume_ncp = NULL;
1490 r = _cache_inval_internal(ncp, flags, &track);
1491 if (track.resume_ncp == NULL)
1494 while ((ncp2 = track.resume_ncp) != NULL) {
1495 track.resume_ncp = NULL;
1497 _cache_inval_internal(ncp2, flags & ~CINV_DESTROY,
1499 /*_cache_put(ncp2);*/
1508 cache_inval(struct nchandle *nch, int flags)
1510 return(_cache_inval(nch->ncp, flags));
1514 * Helper for _cache_inval(). The passed ncp is refd and locked and
1515 * remains that way on return, but may be unlocked/relocked multiple
1516 * times by the routine.
1519 _cache_inval_internal(struct namecache *ncp, int flags, struct cinvtrack *track)
1521 struct namecache *nextkid;
1524 KKASSERT(_cache_lockstatus(ncp) == LK_EXCLUSIVE);
1526 _cache_setunresolved(ncp);
1527 if (flags & CINV_DESTROY) {
1528 ncp->nc_flag |= NCF_DESTROYED;
1529 ++ncp->nc_generation;
1532 while ((flags & CINV_CHILDREN) &&
1533 (nextkid = TAILQ_FIRST(&ncp->nc_list)) != NULL
1535 struct namecache *kid;
1539 _cache_hold(nextkid);
1540 if (++track->depth > MAX_RECURSION_DEPTH) {
1541 track->resume_ncp = ncp;
1545 while ((kid = nextkid) != NULL) {
1547 * Parent (ncp) must be locked for the iteration.
1550 if (kid->nc_parent != ncp) {
1552 kprintf("cache_inval_internal restartA %s\n",
1557 if ((nextkid = TAILQ_NEXT(kid, nc_entry)) != NULL)
1558 _cache_hold(nextkid);
1561 * Parent unlocked for this section to avoid
1562 * deadlocks. Then lock the kid and check for
1566 if (track->resume_ncp) {
1572 if (kid->nc_parent != ncp) {
1573 kprintf("cache_inval_internal "
1582 if ((kid->nc_flag & NCF_UNRESOLVED) == 0 ||
1583 TAILQ_FIRST(&kid->nc_list)
1586 rcnt += _cache_inval_internal(kid,
1587 flags & ~CINV_DESTROY, track);
1588 /*_cache_unlock(kid);*/
1589 /*_cache_drop(kid);*/
1596 * Relock parent to continue scan
1601 _cache_drop(nextkid);
1608 * Someone could have gotten in there while ncp was unlocked,
1611 if ((ncp->nc_flag & NCF_UNRESOLVED) == 0)
1617 * Invalidate a vnode's namecache associations. To avoid races against
1618 * the resolver we do not invalidate a node which we previously invalidated
1619 * but which was then re-resolved while we were in the invalidation loop.
1621 * Returns non-zero if any namecache entries remain after the invalidation
1624 * NOTE: Unlike the namecache topology which guarantees that ncp's will not
1625 * be ripped out of the topology while held, the vnode's v_namecache
1626 * list has no such restriction. NCP's can be ripped out of the list
1627 * at virtually any time if not locked, even if held.
1629 * In addition, the v_namecache list itself must be locked via
1630 * the vnode's spinlock.
1633 cache_inval_vp(struct vnode *vp, int flags)
1635 struct namecache *ncp;
1636 struct namecache *next;
1639 spin_lock(&vp->v_spin);
1640 ncp = TAILQ_FIRST(&vp->v_namecache);
1644 /* loop entered with ncp held and vp spin-locked */
1645 if ((next = TAILQ_NEXT(ncp, nc_vnode)) != NULL)
1647 spin_unlock(&vp->v_spin);
1649 if (ncp->nc_vp != vp) {
1650 kprintf("Warning: cache_inval_vp: race-A detected on "
1651 "%s\n", ncp->nc_name);
1657 _cache_inval(ncp, flags);
1658 _cache_put(ncp); /* also releases reference */
1660 spin_lock(&vp->v_spin);
1661 if (ncp && ncp->nc_vp != vp) {
1662 spin_unlock(&vp->v_spin);
1663 kprintf("Warning: cache_inval_vp: race-B detected on "
1664 "%s\n", ncp->nc_name);
1669 spin_unlock(&vp->v_spin);
1670 return(TAILQ_FIRST(&vp->v_namecache) != NULL);
1674 * This routine is used instead of the normal cache_inval_vp() when we
1675 * are trying to recycle otherwise good vnodes.
1677 * Return 0 on success, non-zero if not all namecache records could be
1678 * disassociated from the vnode (for various reasons).
1681 cache_inval_vp_nonblock(struct vnode *vp)
1683 struct namecache *ncp;
1684 struct namecache *next;
1686 spin_lock(&vp->v_spin);
1687 ncp = TAILQ_FIRST(&vp->v_namecache);
1691 /* loop entered with ncp held */
1692 if ((next = TAILQ_NEXT(ncp, nc_vnode)) != NULL)
1694 spin_unlock(&vp->v_spin);
1695 if (_cache_lock_nonblock(ncp)) {
1701 if (ncp->nc_vp != vp) {
1702 kprintf("Warning: cache_inval_vp: race-A detected on "
1703 "%s\n", ncp->nc_name);
1709 _cache_inval(ncp, 0);
1710 _cache_put(ncp); /* also releases reference */
1712 spin_lock(&vp->v_spin);
1713 if (ncp && ncp->nc_vp != vp) {
1714 spin_unlock(&vp->v_spin);
1715 kprintf("Warning: cache_inval_vp: race-B detected on "
1716 "%s\n", ncp->nc_name);
1721 spin_unlock(&vp->v_spin);
1723 return(TAILQ_FIRST(&vp->v_namecache) != NULL);
1727 * Clears the universal directory search 'ok' flag. This flag allows
1728 * nlookup() to bypass normal vnode checks. This flag is a cached flag
1729 * so clearing it simply forces revalidation.
1732 cache_inval_wxok(struct vnode *vp)
1734 struct namecache *ncp;
1736 spin_lock(&vp->v_spin);
1737 TAILQ_FOREACH(ncp, &vp->v_namecache, nc_vnode) {
1738 if (ncp->nc_flag & (NCF_WXOK | NCF_NOTX))
1739 atomic_clear_short(&ncp->nc_flag, NCF_WXOK | NCF_NOTX);
1741 spin_unlock(&vp->v_spin);
1745 * The source ncp has been renamed to the target ncp. Both fncp and tncp
1746 * must be locked. The target ncp is destroyed (as a normal rename-over
1747 * would destroy the target file or directory).
1749 * Because there may be references to the source ncp we cannot copy its
1750 * contents to the target. Instead the source ncp is relinked as the target
1751 * and the target ncp is removed from the namecache topology.
1754 cache_rename(struct nchandle *fnch, struct nchandle *tnch)
1756 struct namecache *fncp = fnch->ncp;
1757 struct namecache *tncp = tnch->ncp;
1758 struct namecache *tncp_par;
1759 struct nchash_head *nchpp;
1764 ++fncp->nc_generation;
1765 ++tncp->nc_generation;
1766 if (tncp->nc_nlen) {
1767 nname = kmalloc(tncp->nc_nlen + 1, M_VFSCACHE, M_WAITOK);
1768 bcopy(tncp->nc_name, nname, tncp->nc_nlen);
1769 nname[tncp->nc_nlen] = 0;
1775 * Rename fncp (unlink)
1777 _cache_unlink_parent(fncp);
1778 oname = fncp->nc_name;
1779 fncp->nc_name = nname;
1780 fncp->nc_nlen = tncp->nc_nlen;
1782 kfree(oname, M_VFSCACHE);
1784 tncp_par = tncp->nc_parent;
1785 _cache_hold(tncp_par);
1786 _cache_lock(tncp_par);
1789 * Rename fncp (relink)
1791 hash = fnv_32_buf(fncp->nc_name, fncp->nc_nlen, FNV1_32_INIT);
1792 hash = fnv_32_buf(&tncp_par, sizeof(tncp_par), hash);
1793 nchpp = NCHHASH(hash);
1795 spin_lock(&nchpp->spin);
1796 _cache_link_parent(fncp, tncp_par, nchpp);
1797 spin_unlock(&nchpp->spin);
1799 _cache_put(tncp_par);
1802 * Get rid of the overwritten tncp (unlink)
1804 _cache_unlink(tncp);
1808 * Perform actions consistent with unlinking a file. The passed-in ncp
1811 * The ncp is marked DESTROYED so it no longer shows up in searches,
1812 * and will be physically deleted when the vnode goes away.
1814 * If the related vnode has no refs then we cycle it through vget()/vput()
1815 * to (possibly if we don't have a ref race) trigger a deactivation,
1816 * allowing the VFS to trivially detect and recycle the deleted vnode
1817 * via VOP_INACTIVE().
1819 * NOTE: _cache_rename() will automatically call _cache_unlink() on the
1823 cache_unlink(struct nchandle *nch)
1825 _cache_unlink(nch->ncp);
1829 _cache_unlink(struct namecache *ncp)
1834 * Causes lookups to fail and allows another ncp with the same
1835 * name to be created under ncp->nc_parent.
1837 ncp->nc_flag |= NCF_DESTROYED;
1838 ++ncp->nc_generation;
1841 * Attempt to trigger a deactivation. Set VREF_FINALIZE to
1842 * force action on the 1->0 transition.
1844 if ((ncp->nc_flag & NCF_UNRESOLVED) == 0 &&
1845 (vp = ncp->nc_vp) != NULL) {
1846 atomic_set_int(&vp->v_refcnt, VREF_FINALIZE);
1847 if (VREFCNT(vp) <= 0) {
1848 if (vget(vp, LK_SHARED) == 0)
1855 * Return non-zero if the nch might be associated with an open and/or mmap()'d
1856 * file. The easy solution is to just return non-zero if the vnode has refs.
1857 * Used to interlock hammer2 reclaims (VREF_FINALIZE should already be set to
1858 * force the reclaim).
1861 cache_isopen(struct nchandle *nch)
1864 struct namecache *ncp = nch->ncp;
1866 if ((ncp->nc_flag & NCF_UNRESOLVED) == 0 &&
1867 (vp = ncp->nc_vp) != NULL &&
1876 * vget the vnode associated with the namecache entry. Resolve the namecache
1877 * entry if necessary. The passed ncp must be referenced and locked. If
1878 * the ncp is resolved it might be locked shared.
1880 * lk_type may be LK_SHARED, LK_EXCLUSIVE. A ref'd, possibly locked vnode
1881 * (depending on the passed lk_type) will be returned in *vpp with an error
1882 * of 0, or NULL will be returned in *vpp with a non-0 error code. The
1883 * most typical error is ENOENT, meaning that the ncp represents a negative
1884 * cache hit and there is no vnode to retrieve, but other errors can occur
1887 * The vget() can race a reclaim. If this occurs we re-resolve the
1890 * There are numerous places in the kernel where vget() is called on a
1891 * vnode while one or more of its namecache entries is locked. Releasing
1892 * a vnode never deadlocks against locked namecache entries (the vnode
1893 * will not get recycled while referenced ncp's exist). This means we
1894 * can safely acquire the vnode. In fact, we MUST NOT release the ncp
1895 * lock when acquiring the vp lock or we might cause a deadlock.
1897 * NOTE: The passed-in ncp must be locked exclusively if it is initially
1898 * unresolved. If a reclaim race occurs the passed-in ncp will be
1899 * relocked exclusively before being re-resolved.
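/*
 * Illustrative caller-side sketch for cache_vget(); a hedged example, not
 * part of the original source.  'nch' is assumed to be referenced and
 * exclusively locked by the caller, per the NOTE above.
 */
#if 0
static int
example_get_vnode(struct nchandle *nch, struct ucred *cred,
		  struct vnode **vpp)
{
	int error;

	error = cache_vget(nch, cred, LK_SHARED, vpp);
	if (error == ENOENT) {
		/* negative hit: no vnode, *vpp is NULL */
	} else if (error == 0) {
		/* *vpp is referenced and locked shared; vput() when done */
	}
	return (error);
}
#endif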
1902 cache_vget(struct nchandle *nch, struct ucred *cred,
1903 int lk_type, struct vnode **vpp)
1905 struct namecache *ncp;
1912 if (ncp->nc_flag & NCF_UNRESOLVED)
1913 error = cache_resolve(nch, cred);
1917 if (error == 0 && (vp = ncp->nc_vp) != NULL) {
1918 error = vget(vp, lk_type);
1923 * The ncp may have been locked shared, we must relock
1924 * it exclusively before we can set it to unresolved.
1926 if (error == ENOENT) {
1927 kprintf("Warning: vnode reclaim race detected "
1928 "in cache_vget on %p (%s)\n",
1932 _cache_setunresolved(ncp);
1937 * Not a reclaim race, some other error.
1939 KKASSERT(ncp->nc_vp == vp);
1942 KKASSERT(ncp->nc_vp == vp);
1943 KKASSERT((vp->v_flag & VRECLAIMED) == 0);
1946 if (error == 0 && vp == NULL)
1953 * Similar to cache_vget() but only acquires a ref on the vnode. The vnode
1954 * is already held by virtue of the ncp being locked, but it might not be
1955 * referenced and while it is not referenced it can transition into the
1958 * NOTE: The passed-in ncp must be locked exclusively if it is initially
1959 * unresolved. If a reclaim race occurs the passed-in ncp will be
1960 * relocked exclusively before being re-resolved.
1962 * NOTE: At the moment we have to issue a vget() on the vnode, even though
1963 * we are going to immediately release the lock, in order to resolve
1964 * potential reclamation races. Once we have a solid vnode ref that
1965 * was (at some point) interlocked via a vget(), the vnode will not
1968 * NOTE: vhold counts (v_auxrefs) do not prevent reclamation.
1971 cache_vref(struct nchandle *nch, struct ucred *cred, struct vnode **vpp)
1973 struct namecache *ncp;
1981 if (ncp->nc_flag & NCF_UNRESOLVED)
1982 error = cache_resolve(nch, cred);
1986 while (error == 0 && (vp = ncp->nc_vp) != NULL) {
1988 * Try a lockless ref of the vnode. VRECLAIMED transitions
1989 * use the vx_lock state and update-counter mechanism so we
1990 * can detect if one is in-progress or occurred.
1992 * If we can successfully ref the vnode and interlock against
1993 * the update-counter mechanism, and VRECLAIMED is found to
1994 * not be set after that, we should be good.
1996 v = spin_access_start_only(&vp->v_spin);
1997 if (__predict_true(spin_access_check_inprog(v) == 0)) {
1999 if (__predict_false(
2000 spin_access_end_only(&vp->v_spin, v))) {
2004 if (__predict_true((vp->v_flag & VRECLAIMED) == 0)) {
2008 kprintf("CACHE_VREF: IN-RECLAIM\n");
2012 * Do it the slow way
2014 error = vget(vp, LK_SHARED);
2019 if (error == ENOENT) {
2020 kprintf("Warning: vnode reclaim race detected "
2021 "in cache_vget on %p (%s)\n",
2025 _cache_setunresolved(ncp);
2030 * Not a reclaim race, some other error.
2032 KKASSERT(ncp->nc_vp == vp);
2035 KKASSERT(ncp->nc_vp == vp);
2036 KKASSERT((vp->v_flag & VRECLAIMED) == 0);
2037 /* caller does not want a lock */
2042 if (error == 0 && vp == NULL)
2050 * Return a referenced vnode representing the parent directory of
2053 * Because the caller has locked the ncp it should not be possible for
2054 * the parent ncp to go away. However, the parent can unresolve its
2055 * dvp at any time so we must be able to acquire a lock on the parent
2056 * to safely access nc_vp.
2058 * We have to leave par unlocked when vget()ing dvp to avoid a deadlock,
2059 * so use vhold()/vdrop() while holding the lock to prevent dvp from
2060 * getting destroyed.
2062 * NOTE: vhold() is allowed when dvp has 0 refs if we hold a
2063 * lock on the ncp in question.
2066 cache_dvpref(struct namecache *ncp)
2068 struct namecache *par;
2072 if ((par = ncp->nc_parent) != NULL) {
2075 if ((par->nc_flag & NCF_UNRESOLVED) == 0) {
2076 if ((dvp = par->nc_vp) != NULL)
2081 if (vget(dvp, LK_SHARED) == 0) {
2084 /* return refd, unlocked dvp */
2096 * Convert a directory vnode to a namecache record without any other
2097 * knowledge of the topology. This ONLY works with directory vnodes and
2098 * is ONLY used by the NFS server. dvp must be refd but unlocked, and the
2099 * returned ncp (if not NULL) will be held and unlocked.
2101 * If 'makeit' is 0 and dvp has no existing namecache record, NULL is returned.
2102 * If 'makeit' is 1 we attempt to track down and create the namecache topology
2103 * for dvp. This will fail only if the directory has been deleted out from
2106 * Callers must always check for a NULL return no matter the value of 'makeit'.
2108 * To avoid underflowing the kernel stack each recursive call increments
2109 * the makeit variable.
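/*
 * Illustrative sketch of how an NFS-server-style consumer might use
 * cache_fromdvp(); a hedged example, not part of the original source.
 * 'dvp' is assumed to be a referenced, unlocked directory vnode, and the
 * ENOENT return is this example's own choice of error code.
 */
#if 0
static int
example_nch_from_dirvnode(struct vnode *dvp, struct ucred *cred,
			  struct nchandle *nch)
{
	cache_fromdvp(dvp, cred, 1, nch);	/* makeit != 0: build topology */
	if (nch->ncp == NULL)			/* always check, per above */
		return (ENOENT);
	/* nch->ncp is held and unlocked here; cache_drop(nch) when done */
	return (0);
}
#endif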
2112 static int cache_inefficient_scan(struct nchandle *nch, struct ucred *cred,
2113 struct vnode *dvp, char *fakename);
2114 static int cache_fromdvp_try(struct vnode *dvp, struct ucred *cred,
2115 struct vnode **saved_dvp);
2118 cache_fromdvp(struct vnode *dvp, struct ucred *cred, int makeit,
2119 struct nchandle *nch)
2121 struct vnode *saved_dvp;
2127 nch->mount = dvp->v_mount;
2132 * Handle the makeit == 0 degenerate case
2135 spin_lock_shared(&dvp->v_spin);
2136 nch->ncp = TAILQ_FIRST(&dvp->v_namecache);
2139 spin_unlock_shared(&dvp->v_spin);
2143 * Loop until resolution, inside code will break out on error.
2147 * Break out if we successfully acquire a working ncp.
2149 spin_lock_shared(&dvp->v_spin);
2150 nch->ncp = TAILQ_FIRST(&dvp->v_namecache);
2153 spin_unlock_shared(&dvp->v_spin);
2156 spin_unlock_shared(&dvp->v_spin);
2159 * If dvp is the root of its filesystem it should already
2160 * have a namecache pointer associated with it as a side
2161 * effect of the mount, but it may have been disassociated.
2163 if (dvp->v_flag & VROOT) {
2164 nch->ncp = _cache_get(nch->mount->mnt_ncmountpt.ncp);
2165 error = cache_resolve_mp(nch->mount);
2166 _cache_put(nch->ncp);
2168 kprintf("cache_fromdvp: resolve root of mount %p error %d",
2169 dvp->v_mount, error);
2173 kprintf(" failed\n");
2178 kprintf(" succeeded\n");
2183 * If we are recursed too deeply resort to an O(n^2)
2184 * algorithm to resolve the namecache topology. The
2185 * resolved pvp is left referenced in saved_dvp to
2186 * prevent the tree from being destroyed while we loop.
2189 error = cache_fromdvp_try(dvp, cred, &saved_dvp);
2191 kprintf("lookupdotdot(longpath) failed %d "
2192 "dvp %p\n", error, dvp);
2200 * Get the parent directory and resolve its ncp.
2203 kfree(fakename, M_TEMP);
2206 error = vop_nlookupdotdot(*dvp->v_ops, dvp, &pvp, cred,
2209 kprintf("lookupdotdot failed %d dvp %p\n", error, dvp);
2215 * Reuse makeit as a recursion depth counter. On success
2216 * nch will be fully referenced.
2218 cache_fromdvp(pvp, cred, makeit + 1, nch);
2220 if (nch->ncp == NULL)
2224 * Do an inefficient scan of pvp (embodied by ncp) to look
2225 * for dvp. This will create a namecache record for dvp on
2226 * success. We loop up to recheck on success.
2228 * ncp and dvp are both held but not locked.
2230 error = cache_inefficient_scan(nch, cred, dvp, fakename);
2232 kprintf("cache_fromdvp: scan %p (%s) failed on dvp=%p\n",
2233 pvp, nch->ncp->nc_name, dvp);
2235 /* nch was NULLed out, reload mount */
2236 nch->mount = dvp->v_mount;
2240 kprintf("cache_fromdvp: scan %p (%s) succeeded\n",
2241 pvp, nch->ncp->nc_name);
2244 /* nch was NULLed out, reload mount */
2245 nch->mount = dvp->v_mount;
2249 * If nch->ncp is non-NULL it will have been held already.
2252 kfree(fakename, M_TEMP);
2261 * Go up the chain of parent directories until we find something
2262 * we can resolve into the namecache. This is very inefficient.
2266 cache_fromdvp_try(struct vnode *dvp, struct ucred *cred,
2267 struct vnode **saved_dvp)
2269 struct nchandle nch;
2272 static time_t last_fromdvp_report;
2276 * Loop getting the parent directory vnode until we get something we
2277 * can resolve in the namecache.
2280 nch.mount = dvp->v_mount;
2286 kfree(fakename, M_TEMP);
2289 error = vop_nlookupdotdot(*dvp->v_ops, dvp, &pvp, cred,
2296 spin_lock_shared(&pvp->v_spin);
2297 if ((nch.ncp = TAILQ_FIRST(&pvp->v_namecache)) != NULL) {
2298 _cache_hold(nch.ncp);
2299 spin_unlock_shared(&pvp->v_spin);
2303 spin_unlock_shared(&pvp->v_spin);
2304 if (pvp->v_flag & VROOT) {
2305 nch.ncp = _cache_get(pvp->v_mount->mnt_ncmountpt.ncp);
2306 error = cache_resolve_mp(nch.mount);
2307 _cache_unlock(nch.ncp);
2310 _cache_drop(nch.ncp);
2320 if (last_fromdvp_report != time_uptime) {
2321 last_fromdvp_report = time_uptime;
2322 kprintf("Warning: extremely inefficient path "
2323 "resolution on %s\n",
2326 error = cache_inefficient_scan(&nch, cred, dvp, fakename);
2329 * Hopefully dvp now has a namecache record associated with
2330 * it. Leave it referenced to prevent the kernel from
2331 * recycling the vnode. Otherwise extremely long directory
2332 * paths could result in endless recycling.
2337 _cache_drop(nch.ncp);
2340 kfree(fakename, M_TEMP);
2345 * Do an inefficient scan of the directory represented by ncp looking for
2346 * the directory vnode dvp. ncp must be held but not locked on entry and
2347 * will be held on return. dvp must be refd but not locked on entry and
2348 * will remain refd on return.
2350 * Why do this at all? Well, due to its stateless nature the NFS server
2351 * converts file handles directly to vnodes without necessarily going through
2352 * the namecache ops that would otherwise create the namecache topology
2353 * leading to the vnode. We could either (1) Change the namecache algorithms
2354 * to allow disconnected namecache records that are re-merged opportunistically,
2355 * or (2) Make the NFS server backtrack and scan to recover a connected
2356 * namecache topology in order to then be able to issue new API lookups.
2358 * It turns out that (1) is a huge mess. It takes a nice clean set of
2359 * namecache algorithms and introduces a lot of complication in every subsystem
2360 * that calls into the namecache to deal with the re-merge case, especially
2361 * since we are using the namecache to placehold negative lookups and the
2362 * vnode might not be immediately assigned. (2) is certainly far less
2363	 * efficient than (1), but since we are only talking about directories here
2364 * (which are likely to remain cached), the case does not actually run all
2365 * that often and has the supreme advantage of not polluting the namecache
2368 * If a fakename is supplied just construct a namecache entry using the
2372 cache_inefficient_scan(struct nchandle *nch, struct ucred *cred,
2373 struct vnode *dvp, char *fakename)
2375 struct nlcomponent nlc;
2376 struct nchandle rncp;
2388 vat.va_blocksize = 0;
2389 if ((error = VOP_GETATTR(dvp, &vat)) != 0)
2392 error = cache_vref(nch, cred, &pvp);
2397 kprintf("inefficient_scan of (%p,%s): directory iosize %ld "
2398 "vattr fileid = %lld\n",
2399 nch->ncp, nch->ncp->nc_name,
2401 (long long)vat.va_fileid);
2405 * Use the supplied fakename if not NULL. Fake names are typically
2406 * not in the actual filesystem hierarchy. This is used by HAMMER
2407 * to glue @@timestamp recursions together.
2410 nlc.nlc_nameptr = fakename;
2411 nlc.nlc_namelen = strlen(fakename);
2412 rncp = cache_nlookup(nch, &nlc);
2416 if ((blksize = vat.va_blocksize) == 0)
2417 blksize = DEV_BSIZE;
2418 rbuf = kmalloc(blksize, M_TEMP, M_WAITOK);
2424 iov.iov_base = rbuf;
2425 iov.iov_len = blksize;
2428 uio.uio_resid = blksize;
2429 uio.uio_segflg = UIO_SYSSPACE;
2430 uio.uio_rw = UIO_READ;
2431 uio.uio_td = curthread;
2433 if (ncvp_debug >= 2)
2434 kprintf("cache_inefficient_scan: readdir @ %08x\n", (int)uio.uio_offset);
2435 error = VOP_READDIR(pvp, &uio, cred, &eofflag, NULL, NULL);
2437 den = (struct dirent *)rbuf;
2438 bytes = blksize - uio.uio_resid;
2441 if (ncvp_debug >= 2) {
2442 kprintf("cache_inefficient_scan: %*.*s\n",
2443 den->d_namlen, den->d_namlen,
2446 if (den->d_type != DT_WHT &&
2447 den->d_ino == vat.va_fileid) {
2449 kprintf("cache_inefficient_scan: "
2450 "MATCHED inode %lld path %s/%*.*s\n",
2451 (long long)vat.va_fileid,
2453 den->d_namlen, den->d_namlen,
2456 nlc.nlc_nameptr = den->d_name;
2457 nlc.nlc_namelen = den->d_namlen;
2458 rncp = cache_nlookup(nch, &nlc);
2459 KKASSERT(rncp.ncp != NULL);
2462 bytes -= _DIRENT_DIRSIZ(den);
2463 den = _DIRENT_NEXT(den);
2465 if (rncp.ncp == NULL && eofflag == 0 && uio.uio_resid != blksize)
2468 kfree(rbuf, M_TEMP);
2472 if (rncp.ncp->nc_flag & NCF_UNRESOLVED) {
2473 _cache_setvp(rncp.mount, rncp.ncp, dvp);
2474 if (ncvp_debug >= 2) {
2475 kprintf("cache_inefficient_scan: setvp %s/%s = %p\n",
2476 nch->ncp->nc_name, rncp.ncp->nc_name, dvp);
2479 if (ncvp_debug >= 2) {
2480 kprintf("cache_inefficient_scan: setvp %s/%s already set %p/%p\n",
2481 nch->ncp->nc_name, rncp.ncp->nc_name, dvp,
2485 if (rncp.ncp->nc_vp == NULL)
2486 error = rncp.ncp->nc_error;
2488 * Release rncp after a successful nlookup. rncp was fully
2493 kprintf("cache_inefficient_scan: dvp %p NOT FOUND in %s\n",
2494 dvp, nch->ncp->nc_name);
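/*
 * Distilled sketch (not part of the original file, compiled out): the
 * readdir-buffer walk performed by cache_inefficient_scan() above, shown
 * in isolation.  Buffer filling via VOP_READDIR, locking, and the
 * namecache hookup are omitted; the helper name is hypothetical.
 */
#if 0
static struct dirent *
example_find_dirent_by_ino(char *rbuf, int bytes, ino_t ino)
{
	struct dirent *den = (struct dirent *)rbuf;

	while (bytes > 0) {
		/* skip whiteouts, match the inode number from VOP_GETATTR */
		if (den->d_type != DT_WHT && den->d_ino == ino)
			return (den);
		bytes -= _DIRENT_DIRSIZ(den);
		den = _DIRENT_NEXT(den);
	}
	return (NULL);
}
#endif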
2501 * This function must be called with the ncp held and locked and will unlock
2502 * and drop it during zapping.
2504 * Zap a namecache entry. The ncp is unconditionally set to an unresolved
2505 * state, which disassociates it from its vnode or pcpu_ncache[n].neg_list
2506 * and removes the related reference. If the ncp can be removed, and the
2507 * parent can be zapped non-blocking, this function loops up.
2509 * There will be one ref from the caller (which we now own). The only
2510 * remaining autonomous refs to the ncp will then be due to nc_parent->nc_list,
2511 * so possibly 2 refs left. Taking this into account, if there are no
2512 * additional refs and no children, the ncp will be removed from the topology
2515 * References and/or children may exist if the ncp is in the middle of the
2516 * topology, preventing the ncp from being destroyed.
2518 * If nonblock is non-zero and the parent ncp cannot be locked we give up.
2520 * This function may return a held (but NOT locked) parent node which the
2521 * caller must drop in a loop. Looping is one way to avoid unbounded recursion
2522 * due to deep namecache trees.
2524 * WARNING! For MPSAFE operation this routine must acquire up to three
2525 * spin locks to be able to safely test nc_refs. Lock order is
2528 * hash spinlock if on hash list
2529 * parent spinlock if child of parent
2530 * (the ncp is unresolved so there is no vnode association)
2533 cache_zap(struct namecache *ncp)
2535 struct namecache *par;
2536 struct vnode *dropvp;
2537 struct nchash_head *nchpp;
2539 int nonblock = 1; /* XXX cleanup */
2543 * Disassociate the vnode or negative cache ref and set NCF_UNRESOLVED.
2544 * This gets rid of any vp->v_namecache list or negative list and
2547 _cache_setunresolved(ncp);
2550 * Try to scrap the entry and possibly tail-recurse on its parent.
2551	 * We only scrap unref'd (other than our ref) unresolved entries;
2552 * we do not scrap 'live' entries.
2554 * If nc_parent is non NULL we expect 2 references, else just 1.
2555 * If there are more, someone else also holds the ncp and we cannot
2558 KKASSERT(ncp->nc_flag & NCF_UNRESOLVED);
2559 KKASSERT(ncp->nc_refs > 0);
2562 * If the ncp is linked to its parent it will also be in the hash
2563 * table. We have to be able to lock the parent and the hash table.
2565 * Acquire locks. Note that the parent can't go away while we hold
2566 * a child locked. If nc_parent is present, expect 2 refs instead
2570 if ((par = ncp->nc_parent) != NULL) {
2572 if (_cache_lock_nonblock(par)) {
2574 ncp->nc_flag |= NCF_DEFEREDZAP;
2576 &pcpu_ncache[mycpu->gd_cpuid].numdefered,
2579 _cache_drop(ncp); /* caller's ref */
2587 nchpp = ncp->nc_head;
2588 spin_lock(&nchpp->spin);
2592 * With the parent and nchpp locked, and the vnode removed
2593 * (no vp->v_namecache), we expect 1 or 2 refs. If there are
2594	 * more, someone else has a ref and we cannot zap the entry.
2597 * one for our parent link (parent also has one from the linkage)
2605 * On failure undo the work we've done so far and drop the
2606 * caller's ref and ncp.
2608 if (ncp->nc_refs != refcmp || TAILQ_FIRST(&ncp->nc_list)) {
2610 spin_unlock(&nchpp->spin);
2619 * We own all the refs and with the spinlocks held no further
2620 * refs can be acquired by others.
2622 * Remove us from the hash list and parent list. We have to
2623 * drop a ref on the parent's vp if the parent's list becomes
2628 struct pcpu_ncache *pn = &pcpu_ncache[mycpu->gd_cpuid];
2630 KKASSERT(nchpp == ncp->nc_head);
2631 TAILQ_REMOVE(&ncp->nc_head->list, ncp, nc_hash);
2632 TAILQ_REMOVE(&par->nc_list, ncp, nc_entry);
2633 atomic_add_long(&pn->vfscache_count, -1);
2634 if (TAILQ_EMPTY(&ncp->nc_list))
2635 atomic_add_long(&pn->vfscache_leafs, -1);
2637 if (TAILQ_EMPTY(&par->nc_list)) {
2638 atomic_add_long(&pn->vfscache_leafs, 1);
2640 dropvp = par->nc_vp;
2642 ncp->nc_parent = NULL;
2643 ncp->nc_head = NULL;
2644 spin_unlock(&nchpp->spin);
2645 _cache_drop(par); /* removal of ncp from par->nc_list */
2646 /*_cache_unlock(par);*/
2648 KKASSERT(ncp->nc_head == NULL);
2652 * ncp should not have picked up any refs. Physically
2655 if (ncp->nc_refs != refcmp) {
2656 panic("cache_zap: %p bad refs %d (expected %d)\n",
2657 ncp, ncp->nc_refs, refcmp);
2659 /* _cache_unlock(ncp) not required */
2660 ncp->nc_refs = -1; /* safety */
2662 kfree(ncp->nc_name, M_VFSCACHE);
2663 kfree(ncp, M_VFSCACHE);
2666 * Delayed drop (we had to release our spinlocks)
2672 * Loop up if we can recursively clean out the parent.
2675 refcmp = 1; /* ref on parent */
2676 if (par->nc_parent) /* par->par */
2678 par->nc_flag &= ~NCF_DEFEREDZAP;
2679 if ((par->nc_flag & NCF_UNRESOLVED) &&
2680 par->nc_refs == refcmp &&
2681 TAILQ_EMPTY(&par->nc_list)) {
2691	 * Clean up dangling negative cache and deferred-drop entries in the
2694 * This routine is called in the critical path and also called from
2695 * vnlru(). When called from vnlru we use a lower limit to try to
2696 * deal with the negative cache before the critical path has to start
2699 typedef enum { CHI_LOW, CHI_HIGH } cache_hs_t;
2701 static cache_hs_t neg_cache_hysteresis_state[2] = { CHI_LOW, CHI_LOW };
2702 static cache_hs_t pos_cache_hysteresis_state[2] = { CHI_LOW, CHI_LOW };
2705 cache_hysteresis(int critpath)
2708 long neglimit = maxvnodes / ncnegfactor;
2709 long xnumcache = vfscache_leafs;
2712 neglimit = neglimit * 8 / 10;
2715 * Don't cache too many negative hits. We use hysteresis to reduce
2716 * the impact on the critical path.
2718 switch(neg_cache_hysteresis_state[critpath]) {
2720 if (vfscache_negs > MINNEG && vfscache_negs > neglimit) {
2722 _cache_cleanneg(ncnegflush);
2724 _cache_cleanneg(ncnegflush +
2725 vfscache_negs - neglimit);
2726 neg_cache_hysteresis_state[critpath] = CHI_HIGH;
2730 if (vfscache_negs > MINNEG * 9 / 10 &&
2731 vfscache_negs * 9 / 10 > neglimit
2734 _cache_cleanneg(ncnegflush);
2736 _cache_cleanneg(ncnegflush +
2737 vfscache_negs * 9 / 10 -
2740 neg_cache_hysteresis_state[critpath] = CHI_LOW;
2746 * Don't cache too many positive hits. We use hysteresis to reduce
2747 * the impact on the critical path.
2749 * Excessive positive hits can accumulate due to large numbers of
2750 * hardlinks (the vnode cache will not prevent hl ncps from growing
2753 if ((poslimit = ncposlimit) == 0)
2754 poslimit = maxvnodes * 2;
2756 poslimit = poslimit * 8 / 10;
2758 switch(pos_cache_hysteresis_state[critpath]) {
2760 if (xnumcache > poslimit && xnumcache > MINPOS) {
2762 _cache_cleanpos(ncposflush);
2764 _cache_cleanpos(ncposflush +
2765 xnumcache - poslimit);
2766 pos_cache_hysteresis_state[critpath] = CHI_HIGH;
2770 if (xnumcache > poslimit * 5 / 6 && xnumcache > MINPOS) {
2772 _cache_cleanpos(ncposflush);
2774 _cache_cleanpos(ncposflush +
2775 xnumcache - poslimit * 5 / 6);
2777 pos_cache_hysteresis_state[critpath] = CHI_LOW;
2783	 * Clean out dangling deferred-zap ncps which could not be cleanly
2784 * dropped if too many build up. Note that numdefered is
2785	 * heuristic.  Make sure we are real-time for the current cpu,
2786 * plus the global rollup.
2788 if (pcpu_ncache[mycpu->gd_cpuid].numdefered + numdefered > neglimit) {
2789 _cache_cleandefered();
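/*
 * Generic sketch of the two-threshold hysteresis pattern used above
 * (illustrative only, compiled out; the names and the 5/6 trip point are
 * assumptions mirroring the positive-hit branch).  Cleaning starts once
 * the count exceeds the limit and keeps being triggered until the count
 * falls well below the limit, which avoids flip-flopping right at the
 * boundary on every call.
 */
#if 0
typedef enum { EX_LOW, EX_HIGH } ex_hs_t;

static void
example_hysteresis(long count, long limit, ex_hs_t *state,
		   void (*clean)(long))
{
	switch (*state) {
	case EX_LOW:
		if (count > limit) {
			clean(count - limit);
			*state = EX_HIGH;
		}
		break;
	case EX_HIGH:
		if (count > limit * 5 / 6) {
			clean(count - limit * 5 / 6);
		} else {
			*state = EX_LOW;
		}
		break;
	}
}
#endif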
2794 * NEW NAMECACHE LOOKUP API
2796 * Lookup an entry in the namecache. The passed par_nch must be referenced
2797 * and unlocked. A referenced and locked nchandle with a non-NULL nch.ncp
2798	 * is ALWAYS returned, even if the supplied component is illegal.
2800 * The resulting namecache entry should be returned to the system with
2801 * cache_put() or cache_unlock() + cache_drop().
2803 * namecache locks are recursive but care must be taken to avoid lock order
2804 * reversals (hence why the passed par_nch must be unlocked). Locking
2805 * rules are to order for parent traversals, not for child traversals.
2807 * Nobody else will be able to manipulate the associated namespace (e.g.
2808 * create, delete, rename, rename-target) until the caller unlocks the
2811 * The returned entry will be in one of three states: positive hit (non-null
2812 * vnode), negative hit (null vnode), or unresolved (NCF_UNRESOLVED is set).
2813 * Unresolved entries must be resolved through the filesystem to associate the
2814	 * vnode and/or determine whether a positive or negative hit has occurred.
2816 * It is not necessary to lock a directory in order to lock namespace under
2817 * that directory. In fact, it is explicitly not allowed to do that. A
2818 * directory is typically only locked when being created, renamed, or
2821 * The directory (par) may be unresolved, in which case any returned child
2822	 * will likely also be marked unresolved.  Likely but not guaranteed.  Since
2823 * the filesystem lookup requires a resolved directory vnode the caller is
2824 * responsible for resolving the namecache chain top-down. This API
2825 * specifically allows whole chains to be created in an unresolved state.
2828 cache_nlookup(struct nchandle *par_nch, struct nlcomponent *nlc)
2830 struct nchandle nch;
2831 struct namecache *ncp;
2832 struct namecache *new_ncp;
2833 struct namecache *rep_ncp; /* reuse a destroyed ncp */
2834 struct nchash_head *nchpp;
2841 mp = par_nch->mount;
2845 * This is a good time to call it, no ncp's are locked by
2848 cache_hysteresis(1);
2851 * Try to locate an existing entry
2853 hash = fnv_32_buf(nlc->nlc_nameptr, nlc->nlc_namelen, FNV1_32_INIT);
2854 hash = fnv_32_buf(&par_nch->ncp, sizeof(par_nch->ncp), hash);
2856 nchpp = NCHHASH(hash);
2860 spin_lock(&nchpp->spin);
2862 spin_lock_shared(&nchpp->spin);
2864 TAILQ_FOREACH(ncp, &nchpp->list, nc_hash) {
2866 * Break out if we find a matching entry. Note that
2867 * UNRESOLVED entries may match, but DESTROYED entries
2870 * We may be able to reuse DESTROYED entries that we come
2871 * across, even if the name does not match, as long as
2872 * nc_nlen is correct and the only hold ref is from the nchpp
2875 if (ncp->nc_parent == par_nch->ncp &&
2876 ncp->nc_nlen == nlc->nlc_namelen) {
2877 if (ncp->nc_flag & NCF_DESTROYED) {
2878 if (ncp->nc_refs == 1 && rep_ncp == NULL)
2882 if (bcmp(ncp->nc_name, nlc->nlc_nameptr, ncp->nc_nlen))
2886 spin_unlock(&nchpp->spin);
2888 spin_unlock_shared(&nchpp->spin);
2890 _cache_unlock(par_nch->ncp);
2893 if (_cache_lock_special(ncp) == 0) {
2895 * Successfully locked but we must re-test
2896 * conditions that might have changed since
2897 * we did not have the lock before.
2899 if (ncp->nc_parent != par_nch->ncp ||
2900 ncp->nc_nlen != nlc->nlc_namelen ||
2901 bcmp(ncp->nc_name, nlc->nlc_nameptr,
2903 (ncp->nc_flag & NCF_DESTROYED)) {
2907 _cache_auto_unresolve(mp, ncp);
2909 _cache_free(new_ncp);
2912 _cache_get(ncp); /* cycle the lock to block */
2920 * We failed to locate the entry, try to resurrect a destroyed
2921 * entry that we did find that is already correctly linked into
2922 * nchpp and the parent. We must re-test conditions after
2923 * successfully locking rep_ncp.
2925 * This case can occur under heavy loads due to not being able
2926 * to safely lock the parent in cache_zap(). Nominally a repeated
2927 * create/unlink load, but only the namelen needs to match.
2929 if (rep_ncp && new_ncp == NULL) {
2930 if (_cache_lock_nonblock(rep_ncp) == 0) {
2931 _cache_hold(rep_ncp);
2932 if (rep_ncp->nc_parent == par_nch->ncp &&
2933 rep_ncp->nc_nlen == nlc->nlc_namelen &&
2934 (rep_ncp->nc_flag & NCF_DESTROYED) &&
2935 rep_ncp->nc_refs == 2) {
2937	 * Update nc_name and reuse the entry as if it were new.
2940 bcopy(nlc->nlc_nameptr, ncp->nc_name,
2942 spin_unlock_shared(&nchpp->spin);
2943 _cache_setunresolved(ncp);
2944 ncp->nc_flag = NCF_UNRESOLVED;
2945 ncp->nc_error = ENOTCONN;
2948 _cache_put(rep_ncp);
2953 * Otherwise create a new entry and add it to the cache. The parent
2954 * ncp must also be locked so we can link into it.
2956 * We have to relookup after possibly blocking in kmalloc or
2957 * when locking par_nch.
2959 * NOTE: nlc_namelen can be 0 and nlc_nameptr NULL as a special
2960 * mount case, in which case nc_name will be NULL.
2962 if (new_ncp == NULL) {
2963 spin_unlock_shared(&nchpp->spin);
2964 new_ncp = cache_alloc(nlc->nlc_namelen);
2965 if (nlc->nlc_namelen) {
2966 bcopy(nlc->nlc_nameptr, new_ncp->nc_name,
2968 new_ncp->nc_name[nlc->nlc_namelen] = 0;
2974 * NOTE! The spinlock is held exclusively here because new_ncp
2977 if (par_locked == 0) {
2978 spin_unlock(&nchpp->spin);
2979 _cache_lock(par_nch->ncp);
2985 * Link to parent (requires another ref, the one already in new_ncp
2986	 * is what we will return).
2988 * WARNING! We still hold the spinlock. We have to set the hash
2989 * table entry atomically.
2993 _cache_link_parent(ncp, par_nch->ncp, nchpp);
2994 spin_unlock(&nchpp->spin);
2995 _cache_unlock(par_nch->ncp);
2996 /* par_locked = 0 - not used */
2999 * stats and namecache size management
3001 if (ncp->nc_flag & NCF_UNRESOLVED)
3002 ++gd->gd_nchstats->ncs_miss;
3003 else if (ncp->nc_vp)
3004 ++gd->gd_nchstats->ncs_goodhits;
3006 ++gd->gd_nchstats->ncs_neghits;
3009 _cache_mntref(nch.mount);
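/*
 * Illustrative usage sketch (not part of the original file, compiled
 * out): the consumer pattern implied by the API comment above.  The
 * helper name, component setup and error handling are assumptions;
 * real callers such as nlookup() differ in detail.
 */
#if 0
static int
example_lookup_component(struct nchandle *par_nch, const char *name,
			 struct ucred *cred, struct nchandle *res)
{
	struct nlcomponent nlc;
	int error;

	bzero(&nlc, sizeof(nlc));
	nlc.nlc_nameptr = __DECONST(char *, name);
	nlc.nlc_namelen = (int)strlen(name);

	/* par_nch is referenced+unlocked; result is referenced+locked */
	*res = cache_nlookup(par_nch, &nlc);

	/* resolve through the VFS; ENOENT indicates a negative hit */
	error = cache_resolve(res, cred);
	if (error && error != ENOENT)
		cache_put(res);		/* unlock + drop on hard failure */
	return (error);
}
#endif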
3015 * Attempt to lookup a namecache entry and return with a shared namecache
3016	 * lock.  This operation is non-blocking: EWOULDBLOCK is returned if excl is
3017 * set or we are unable to lock.
3020 cache_nlookup_maybe_shared(struct nchandle *par_nch,
3021 struct nlcomponent *nlc,
3022 int excl, struct nchandle *res_nch)
3024 struct namecache *ncp;
3025 struct nchash_head *nchpp;
3031 * If exclusive requested or shared namecache locks are disabled,
3034 if (ncp_shared_lock_disable || excl)
3035 return(EWOULDBLOCK);
3038 mp = par_nch->mount;
3041 * This is a good time to call it, no ncp's are locked by
3044 cache_hysteresis(1);
3047 * Try to locate an existing entry
3049 hash = fnv_32_buf(nlc->nlc_nameptr, nlc->nlc_namelen, FNV1_32_INIT);
3050 hash = fnv_32_buf(&par_nch->ncp, sizeof(par_nch->ncp), hash);
3051 nchpp = NCHHASH(hash);
3053 spin_lock_shared(&nchpp->spin);
3055 TAILQ_FOREACH(ncp, &nchpp->list, nc_hash) {
3057 * Break out if we find a matching entry. Note that
3058 * UNRESOLVED entries may match, but DESTROYED entries
3061 if (ncp->nc_parent == par_nch->ncp &&
3062 ncp->nc_nlen == nlc->nlc_namelen &&
3063 bcmp(ncp->nc_name, nlc->nlc_nameptr, ncp->nc_nlen) == 0 &&
3064 (ncp->nc_flag & NCF_DESTROYED) == 0
3067 spin_unlock_shared(&nchpp->spin);
3069 if (_cache_lock_shared_special(ncp) == 0) {
3070 if (ncp->nc_parent == par_nch->ncp &&
3071 ncp->nc_nlen == nlc->nlc_namelen &&
3072 bcmp(ncp->nc_name, nlc->nlc_nameptr,
3073 ncp->nc_nlen) == 0 &&
3074 (ncp->nc_flag & NCF_DESTROYED) == 0 &&
3075 (ncp->nc_flag & NCF_UNRESOLVED) == 0 &&
3076 _cache_auto_unresolve_test(mp, ncp) == 0) {
3082 return(EWOULDBLOCK);
3089 spin_unlock_shared(&nchpp->spin);
3090 return(EWOULDBLOCK);
3095	 * Note that nc_error might be non-zero (e.g. ENOENT).
3098 res_nch->mount = mp;
3100 ++gd->gd_nchstats->ncs_goodhits;
3101 _cache_mntref(res_nch->mount);
3103 KKASSERT(ncp->nc_error != EWOULDBLOCK);
3104 return(ncp->nc_error);
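/*
 * Illustrative fallback pattern (compiled out, hypothetical helper):
 * try the shared-lock fast path first and drop back to the regular
 * exclusive cache_nlookup() when EWOULDBLOCK is returned, as the
 * comment above suggests.
 */
#if 0
static int
example_lookup_prefer_shared(struct nchandle *par_nch,
			     struct nlcomponent *nlc,
			     struct ucred *cred, struct nchandle *res)
{
	int error;

	error = cache_nlookup_maybe_shared(par_nch, nlc, 0, res);
	if (error == EWOULDBLOCK) {
		/* shared path unavailable; take the exclusive-lock path */
		*res = cache_nlookup(par_nch, nlc);
		error = cache_resolve(res, cred);
	}
	return (error);
}
#endif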
3108	 * This is a non-blocking version of cache_nlookup() used by
3109 * nfs_readdirplusrpc_uio(). It can fail for any reason and
3110 * will return nch.ncp == NULL in that case.
3113 cache_nlookup_nonblock(struct nchandle *par_nch, struct nlcomponent *nlc)
3115 struct nchandle nch;
3116 struct namecache *ncp;
3117 struct namecache *new_ncp;
3118 struct nchash_head *nchpp;
3125 mp = par_nch->mount;
3129 * Try to locate an existing entry
3131 hash = fnv_32_buf(nlc->nlc_nameptr, nlc->nlc_namelen, FNV1_32_INIT);
3132 hash = fnv_32_buf(&par_nch->ncp, sizeof(par_nch->ncp), hash);
3134 nchpp = NCHHASH(hash);
3136 spin_lock(&nchpp->spin);
3137 TAILQ_FOREACH(ncp, &nchpp->list, nc_hash) {
3139 * Break out if we find a matching entry. Note that
3140 * UNRESOLVED entries may match, but DESTROYED entries
3143 if (ncp->nc_parent == par_nch->ncp &&
3144 ncp->nc_nlen == nlc->nlc_namelen &&
3145 bcmp(ncp->nc_name, nlc->nlc_nameptr, ncp->nc_nlen) == 0 &&
3146 (ncp->nc_flag & NCF_DESTROYED) == 0
3149 spin_unlock(&nchpp->spin);
3151 _cache_unlock(par_nch->ncp);
3154 if (_cache_lock_special(ncp) == 0) {
3155 if (ncp->nc_parent != par_nch->ncp ||
3156 ncp->nc_nlen != nlc->nlc_namelen ||
3157 bcmp(ncp->nc_name, nlc->nlc_nameptr, ncp->nc_nlen) ||
3158 (ncp->nc_flag & NCF_DESTROYED)) {
3159 kprintf("cache_lookup_nonblock: "
3160 "ncp-race %p %*.*s\n",
3169 _cache_auto_unresolve(mp, ncp);
3171 _cache_free(new_ncp);
3182 * We failed to locate an entry, create a new entry and add it to
3183 * the cache. The parent ncp must also be locked so we
3186 * We have to relookup after possibly blocking in kmalloc or
3187 * when locking par_nch.
3189 * NOTE: nlc_namelen can be 0 and nlc_nameptr NULL as a special
3190 * mount case, in which case nc_name will be NULL.
3192 if (new_ncp == NULL) {
3193 spin_unlock(&nchpp->spin);
3194 new_ncp = cache_alloc(nlc->nlc_namelen);
3195 if (nlc->nlc_namelen) {
3196 bcopy(nlc->nlc_nameptr, new_ncp->nc_name,
3198 new_ncp->nc_name[nlc->nlc_namelen] = 0;
3202 if (par_locked == 0) {
3203 spin_unlock(&nchpp->spin);
3204 if (_cache_lock_nonblock(par_nch->ncp) == 0) {
3212 * Link to parent (requires another ref, the one already in new_ncp
3213	 * is what we will return).
3215 * WARNING! We still hold the spinlock. We have to set the hash
3216 * table entry atomically.
3220 _cache_link_parent(ncp, par_nch->ncp, nchpp);
3221 spin_unlock(&nchpp->spin);
3222 _cache_unlock(par_nch->ncp);
3223 /* par_locked = 0 - not used */
3226 * stats and namecache size management
3228 if (ncp->nc_flag & NCF_UNRESOLVED)
3229 ++gd->gd_nchstats->ncs_miss;
3230 else if (ncp->nc_vp)
3231 ++gd->gd_nchstats->ncs_goodhits;
3233 ++gd->gd_nchstats->ncs_neghits;
3236 _cache_mntref(nch.mount);
3241 _cache_free(new_ncp);
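/*
 * Illustrative sketch (compiled out, hypothetical helper): how a caller
 * like the NFS readdirplus path can use the non-blocking lookup and
 * simply skip an entry when nch.ncp comes back NULL.
 */
#if 0
static void
example_readdirplus_prime(struct nchandle *par_nch, struct nlcomponent *nlc)
{
	struct nchandle nch;

	nch = cache_nlookup_nonblock(par_nch, nlc);
	if (nch.ncp == NULL)
		return;			/* would have blocked; just skip */
	/* ... use the referenced+locked entry ... */
	cache_put(&nch);		/* unlock + drop */
}
#endif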
3250 * This version is non-locking. The caller must validate the result
3251 * for parent-to-child continuity.
3253 * It can fail for any reason and will return nch.ncp == NULL in that case.
3256 cache_nlookup_nonlocked(struct nchandle *par_nch, struct nlcomponent *nlc)
3258 struct nchandle nch;
3259 struct namecache *ncp;
3260 struct nchash_head *nchpp;
3266 mp = par_nch->mount;
3269 * Try to locate an existing entry
3271 hash = fnv_32_buf(nlc->nlc_nameptr, nlc->nlc_namelen, FNV1_32_INIT);
3272 hash = fnv_32_buf(&par_nch->ncp, sizeof(par_nch->ncp), hash);
3273 nchpp = NCHHASH(hash);
3275 spin_lock_shared(&nchpp->spin);
3276 TAILQ_FOREACH(ncp, &nchpp->list, nc_hash) {
3278 * Break out if we find a matching entry. Note that
3279 * UNRESOLVED entries may match, but DESTROYED entries
3282 * Resolved NFS entries which have timed out fail so the
3283 * caller can rerun with normal locking.
3285 if (ncp->nc_parent == par_nch->ncp &&
3286 ncp->nc_nlen == nlc->nlc_namelen &&
3287 bcmp(ncp->nc_name, nlc->nlc_nameptr, ncp->nc_nlen) == 0 &&
3288 (ncp->nc_flag & NCF_DESTROYED) == 0
3290 if (_cache_auto_unresolve_test(par_nch->mount, ncp))
3293 spin_unlock_shared(&nchpp->spin);
3297 spin_unlock_shared(&nchpp->spin);
3303 * stats and namecache size management
3305 if (ncp->nc_flag & NCF_UNRESOLVED)
3306 ++gd->gd_nchstats->ncs_miss;
3307 else if (ncp->nc_vp)
3308 ++gd->gd_nchstats->ncs_goodhits;
3310 ++gd->gd_nchstats->ncs_neghits;
3313 _cache_mntref(nch.mount);
3319 * The namecache entry is marked as being used as a mount point.
3320 * Locate the mount if it is visible to the caller. The DragonFly
3321 * mount system allows arbitrary loops in the topology and disentangles
3322 * those loops by matching against (mp, ncp) rather than just (ncp).
3323 * This means any given ncp can dive any number of mounts, depending
3324 * on the relative mount (e.g. nullfs) the caller is at in the topology.
3326 * We use a very simple frontend cache to reduce SMP conflicts,
3327 * which we have to do because the mountlist scan needs an exclusive
3328 * lock around its ripout info list. Not to mention that there might
3329 * be a lot of mounts.
3331	 * Because all mounts can potentially be accessed by all cpus, break the cpus
3332 * down a bit to allow some contention rather than making the cache
3335	 * The hash table is split into per-cpu areas and is 4-way set-associative.
3337 struct findmount_info {
3338 struct mount *result;
3339 struct mount *nch_mount;
3340 struct namecache *nch_ncp;
3344 struct ncmount_cache *
3345 ncmount_cache_lookup4(struct mount *mp, struct namecache *ncp)
3349 hash = iscsi_crc32(&mp, sizeof(mp));
3350 hash = iscsi_crc32_ext(&ncp, sizeof(ncp), hash);
3352 hash = hash & ((NCMOUNT_NUMCACHE - 1) & ~(NCMOUNT_SET - 1));
3354 return (&ncmount_cache[hash]);
3358 struct ncmount_cache *
3359 ncmount_cache_lookup(struct mount *mp, struct namecache *ncp)
3361 struct ncmount_cache *ncc;
3362 struct ncmount_cache *best;
3367 ncc = ncmount_cache_lookup4(mp, ncp);
3370 * NOTE: When checking for a ticks overflow implement a slop of
3371 * 2 ticks just to be safe, because ticks is accessed
3372	 *	 non-atomically: one CPU can increment it while another
3373 * is still using the old value.
3375 if (ncc->ncp == ncp && ncc->mp == mp) /* 0 */
3377 delta = (int)(ticks - ncc->ticks); /* beware GCC opts */
3378 if (delta < -2) /* overflow reset */
3383 for (i = 1; i < NCMOUNT_SET; ++i) { /* 1, 2, 3 */
3385 if (ncc->ncp == ncp && ncc->mp == mp)
3387 delta = (int)(ticks - ncc->ticks);
3390 if (delta > best_delta) {
3399 * pcpu-optimized mount search. Locate the recursive mountpoint, avoid
3400 * doing an expensive mountlist_scan*() if possible.
3402 * (mp, ncp) -> mountonpt.k
3404 * Returns a referenced mount pointer or NULL
3406 * General SMP operation uses a per-cpu umount_spin to interlock unmount
3407 * operations (that is, where the mp_target can be freed out from under us).
3409 * Lookups use the ncc->updating counter to validate the contents in order
3410 * to avoid having to obtain the per cache-element spin-lock. In addition,
3411 * the ticks field is only updated when it changes. However, if our per-cpu
3412	 * lock fails due to an unmount-in-progress, we fall back to the
3413 * cache-element's spin-lock.
3416 cache_findmount(struct nchandle *nch)
3418 struct findmount_info info;
3419 struct ncmount_cache *ncc;
3420 struct ncmount_cache ncc_copy;
3421 struct mount *target;
3422 struct pcpu_ncache *pcpu;
3423 struct spinlock *spinlk;
3427 if (ncmount_cache_enable == 0 || pcpu == NULL) {
3431 pcpu += mycpu->gd_cpuid;
3434 ncc = ncmount_cache_lookup(nch->mount, nch->ncp);
3435 if (ncc->ncp == nch->ncp && ncc->mp == nch->mount) {
3438 * This is a bit messy for now because we do not yet have
3439 * safe disposal of mount structures. We have to ref
3440	 * ncc->mp_target but the 'update' counter only tells us
3441 * whether the cache has changed after the fact.
3443 * For now get a per-cpu spinlock that will only contend
3444 * against umount's. This is the best path. If it fails,
3445	 * instead of waiting on the umount we fall back to a
3446 * shared ncc->spin lock, which will generally only cost a
3449 update = ncc->updating;
3450 if (__predict_true(spin_trylock(&pcpu->umount_spin))) {
3451 spinlk = &pcpu->umount_spin;
3453 spinlk = &ncc->spin;
3454 spin_lock_shared(spinlk);
3456 if (update & 1) { /* update in progress */
3457 spin_unlock_any(spinlk);
3462 if (ncc->updating != update) { /* content changed */
3463 spin_unlock_any(spinlk);
3466 if (ncc_copy.ncp != nch->ncp || ncc_copy.mp != nch->mount) {
3467 spin_unlock_any(spinlk);
3470 if (ncc_copy.isneg == 0) {
3471 target = ncc_copy.mp_target;
3472 if (target->mnt_ncmounton.mount == nch->mount &&
3473 target->mnt_ncmounton.ncp == nch->ncp) {
3475 * Cache hit (positive) (avoid dirtying
3476 * the cache line if possible)
3478 if (ncc->ticks != (int)ticks)
3479 ncc->ticks = (int)ticks;
3480 _cache_mntref(target);
3484 * Cache hit (negative) (avoid dirtying
3485 * the cache line if possible)
3487 if (ncc->ticks != (int)ticks)
3488 ncc->ticks = (int)ticks;
3491 spin_unlock_any(spinlk);
3501 info.nch_mount = nch->mount;
3502 info.nch_ncp = nch->ncp;
3503 mountlist_scan(cache_findmount_callback, &info,
3504 MNTSCAN_FORWARD | MNTSCAN_NOBUSY | MNTSCAN_NOUNLOCK);
3507 * To reduce multi-re-entry on the cache, relookup in the cache.
3508 * This can still race, obviously, but that's ok.
3510 ncc = ncmount_cache_lookup(nch->mount, nch->ncp);
3511 if (ncc->ncp == nch->ncp && ncc->mp == nch->mount) {
3513 atomic_add_int(&info.result->mnt_refs, -1);
3520 if ((info.result == NULL ||
3521 (info.result->mnt_kern_flag & MNTK_UNMOUNT) == 0)) {
3522 spin_lock(&ncc->spin);
3523 atomic_add_int_nonlocked(&ncc->updating, 1);
3525 KKASSERT(ncc->updating & 1);
3526 if (ncc->mp != nch->mount) {
3528 atomic_add_int(&ncc->mp->mnt_refs, -1);
3529 atomic_add_int(&nch->mount->mnt_refs, 1);
3530 ncc->mp = nch->mount;
3532 ncc->ncp = nch->ncp; /* ptr compares only, not refd*/
3533 ncc->ticks = (int)ticks;
3537 if (ncc->mp_target != info.result) {
3539 atomic_add_int(&ncc->mp_target->mnt_refs, -1);
3540 ncc->mp_target = info.result;
3541 atomic_add_int(&info.result->mnt_refs, 1);
3545 if (ncc->mp_target) {
3546 atomic_add_int(&ncc->mp_target->mnt_refs, -1);
3547 ncc->mp_target = NULL;
3551 atomic_add_int_nonlocked(&ncc->updating, 1);
3552 spin_unlock(&ncc->spin);
3554 return(info.result);
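/*
 * Reduced sketch of the ncc->updating validation used above (compiled
 * out; the structure and names are hypothetical).  A writer increments
 * the counter once before and once after modifying the element, so
 * readers can detect both an in-progress update (odd value) and a
 * completed update (value changed) without taking the element's
 * spinlock.
 */
#if 0
struct example_cache {
	int	updating;	/* odd while an update is in progress */
	int	value;
};

static int
example_read_validated(struct example_cache *c, int *out)
{
	int	seq;
	int	v;

	seq = c->updating;
	cpu_ccfence();			/* order the reads */
	if (seq & 1)
		return (EWOULDBLOCK);	/* writer active, caller falls back */
	v = c->value;
	cpu_ccfence();
	if (c->updating != seq)
		return (EWOULDBLOCK);	/* contents changed under us */
	*out = v;
	return (0);
}
#endif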
3559 cache_findmount_callback(struct mount *mp, void *data)
3561 struct findmount_info *info = data;
3564 * Check the mount's mounted-on point against the passed nch.
3566 if (mp->mnt_ncmounton.mount == info->nch_mount &&
3567 mp->mnt_ncmounton.ncp == info->nch_ncp
3577 cache_dropmount(struct mount *mp)
3583 * mp is being mounted, scrap entries matching mp->mnt_ncmounton (positive
3586 * A full scan is not required, but for now just do it anyway.
3589 cache_ismounting(struct mount *mp)
3591 struct ncmount_cache *ncc;
3592 struct mount *ncc_mp;
3595 if (pcpu_ncache == NULL)
3598 for (i = 0; i < NCMOUNT_NUMCACHE; ++i) {
3599 ncc = &ncmount_cache[i];
3600 if (ncc->mp != mp->mnt_ncmounton.mount ||
3601 ncc->ncp != mp->mnt_ncmounton.ncp) {
3604 spin_lock(&ncc->spin);
3605 atomic_add_int_nonlocked(&ncc->updating, 1);
3607 KKASSERT(ncc->updating & 1);
3608 if (ncc->mp != mp->mnt_ncmounton.mount ||
3609 ncc->ncp != mp->mnt_ncmounton.ncp) {
3612 spin_unlock(&ncc->spin);
3619 atomic_add_int(&ncc_mp->mnt_refs, -1);
3620 ncc_mp = ncc->mp_target;
3621 ncc->mp_target = NULL;
3623 atomic_add_int(&ncc_mp->mnt_refs, -1);
3624 ncc->ticks = (int)ticks - hz * 120;
3627 atomic_add_int_nonlocked(&ncc->updating, 1);
3628 spin_unlock(&ncc->spin);
3632 * Pre-cache the mount point
3634 ncc = ncmount_cache_lookup(mp->mnt_ncmounton.mount,
3635 mp->mnt_ncmounton.ncp);
3637 spin_lock(&ncc->spin);
3638 atomic_add_int_nonlocked(&ncc->updating, 1);
3640 KKASSERT(ncc->updating & 1);
3643 atomic_add_int(&ncc->mp->mnt_refs, -1);
3644 atomic_add_int(&mp->mnt_ncmounton.mount->mnt_refs, 1);
3645 ncc->mp = mp->mnt_ncmounton.mount;
3646 ncc->ncp = mp->mnt_ncmounton.ncp; /* ptr compares only */
3647 ncc->ticks = (int)ticks;
3650 if (ncc->mp_target != mp) {
3652 atomic_add_int(&ncc->mp_target->mnt_refs, -1);
3653 ncc->mp_target = mp;
3654 atomic_add_int(&mp->mnt_refs, 1);
3657 atomic_add_int_nonlocked(&ncc->updating, 1);
3658 spin_unlock(&ncc->spin);
3662 * Scrap any ncmount_cache entries related to mp. Not only do we need to
3663 * scrap entries matching mp->mnt_ncmounton, but we also need to scrap any
3664 * negative hits involving (mp, <any>).
3666 * A full scan is required.
3669 cache_unmounting(struct mount *mp)
3671 struct ncmount_cache *ncc;
3672 struct pcpu_ncache *pcpu;
3673 struct mount *ncc_mp;
3680 for (i = 0; i < ncpus; ++i)
3681 spin_lock(&pcpu[i].umount_spin);
3683 for (i = 0; i < NCMOUNT_NUMCACHE; ++i) {
3684 ncc = &ncmount_cache[i];
3685 if (ncc->mp != mp && ncc->mp_target != mp)
3687 spin_lock(&ncc->spin);
3688 atomic_add_int_nonlocked(&ncc->updating, 1);
3691 if (ncc->mp != mp && ncc->mp_target != mp) {
3692 atomic_add_int_nonlocked(&ncc->updating, 1);
3694 spin_unlock(&ncc->spin);
3701 atomic_add_int(&ncc_mp->mnt_refs, -1);
3702 ncc_mp = ncc->mp_target;
3703 ncc->mp_target = NULL;
3705 atomic_add_int(&ncc_mp->mnt_refs, -1);
3706 ncc->ticks = (int)ticks - hz * 120;
3709 atomic_add_int_nonlocked(&ncc->updating, 1);
3710 spin_unlock(&ncc->spin);
3713 for (i = 0; i < ncpus; ++i)
3714 spin_unlock(&pcpu[i].umount_spin);
3718 * Resolve an unresolved namecache entry, generally by looking it up.
3719 * The passed ncp must be locked and refd.
3721 * Theoretically since a vnode cannot be recycled while held, and since
3722 * the nc_parent chain holds its vnode as long as children exist, the
3723 * direct parent of the cache entry we are trying to resolve should
3724 * have a valid vnode. If not then generate an error that we can
3725 * determine is related to a resolver bug.
3727 * However, if a vnode was in the middle of a recyclement when the NCP
3728 * got locked, ncp->nc_vp might point to a vnode that is about to become
3729 * invalid. cache_resolve() handles this case by unresolving the entry
3730 * and then re-resolving it.
3732 * Note that successful resolution does not necessarily return an error
3733 * code of 0. If the ncp resolves to a negative cache hit then ENOENT
3737 cache_resolve(struct nchandle *nch, struct ucred *cred)
3739 struct namecache *par_tmp;
3740 struct namecache *par;
3741 struct namecache *ncp;
3742 struct nchandle nctmp;
3749 KKASSERT(_cache_lockstatus(ncp) == LK_EXCLUSIVE);
3752 * If the ncp is already resolved we have nothing to do. However,
3753	 * we do want to guarantee that a usable vnode is returned when
3754 * a vnode is present, so make sure it hasn't been reclaimed.
3756 if ((ncp->nc_flag & NCF_UNRESOLVED) == 0) {
3757 if (ncp->nc_vp && (ncp->nc_vp->v_flag & VRECLAIMED))
3758 _cache_setunresolved(ncp);
3759 if ((ncp->nc_flag & NCF_UNRESOLVED) == 0)
3760 return (ncp->nc_error);
3764 * If the ncp was destroyed it will never resolve again. This
3765 * can basically only happen when someone is chdir'd into an
3766 * empty directory which is then rmdir'd. We want to catch this
3767 * here and not dive the VFS because the VFS might actually
3768 * have a way to re-resolve the disconnected ncp, which will
3769 * result in inconsistencies in the cdir/nch for proc->p_fd.
3771 if (ncp->nc_flag & NCF_DESTROYED)
3775 * Mount points need special handling because the parent does not
3776 * belong to the same filesystem as the ncp.
3778 if (ncp == mp->mnt_ncmountpt.ncp)
3779 return (cache_resolve_mp(mp));
3782 * We expect an unbroken chain of ncps to at least the mount point,
3783 * and even all the way to root (but this code doesn't have to go
3784 * past the mount point).
3786 if (ncp->nc_parent == NULL) {
3787 kprintf("EXDEV case 1 %p %*.*s\n", ncp,
3788 ncp->nc_nlen, ncp->nc_nlen, ncp->nc_name);
3789 ncp->nc_error = EXDEV;
3790 return(ncp->nc_error);
3794 * The vp's of the parent directories in the chain are held via vhold()
3795	 * due to the existence of the child, and should not disappear.
3796 * However, there are cases where they can disappear:
3798 * - due to filesystem I/O errors.
3799 * - due to NFS being stupid about tracking the namespace and
3800	 *   destroying the namespace for entire directories quite often.
3801 * - due to forced unmounts.
3802 * - due to an rmdir (parent will be marked DESTROYED)
3804 * When this occurs we have to track the chain backwards and resolve
3805 * it, looping until the resolver catches up to the current node. We
3806 * could recurse here but we might run ourselves out of kernel stack
3807 * so we do it in a more painful manner. This situation really should
3808 * not occur all that often, or if it does not have to go back too
3809 * many nodes to resolve the ncp.
3811 while ((dvp = cache_dvpref(ncp)) == NULL) {
3813 * This case can occur if a process is CD'd into a
3814 * directory which is then rmdir'd. If the parent is marked
3815 * destroyed there is no point trying to resolve it.
3817 if (ncp->nc_parent->nc_flag & NCF_DESTROYED)
3819 par = ncp->nc_parent;
3822 while ((par_tmp = par->nc_parent) != NULL &&
3823 par_tmp->nc_vp == NULL) {
3824 _cache_hold(par_tmp);
3825 _cache_lock(par_tmp);
3829 if (par->nc_parent == NULL) {
3830 kprintf("EXDEV case 2 %*.*s\n",
3831 par->nc_nlen, par->nc_nlen, par->nc_name);
3836 * The parent is not set in stone, ref and lock it to prevent
3837 * it from disappearing. Also note that due to renames it
3838 * is possible for our ncp to move and for par to no longer
3839 * be one of its parents. We resolve it anyway, the loop
3840 * will handle any moves.
3842 _cache_get(par); /* additional hold/lock */
3843 _cache_put(par); /* from earlier hold/lock */
3844 if (par == nch->mount->mnt_ncmountpt.ncp) {
3845 cache_resolve_mp(nch->mount);
3846 } else if ((dvp = cache_dvpref(par)) == NULL) {
3847 kprintf("[diagnostic] cache_resolve: raced on %*.*s\n",
3848 par->nc_nlen, par->nc_nlen, par->nc_name);
3852 if (par->nc_flag & NCF_UNRESOLVED) {
3855 par->nc_error = VOP_NRESOLVE(&nctmp, dvp, cred);
3859 if ((error = par->nc_error) != 0) {
3860 if (par->nc_error != EAGAIN) {
3861 kprintf("EXDEV case 3 %*.*s error %d\n",
3862 par->nc_nlen, par->nc_nlen, par->nc_name,
3867 kprintf("[diagnostic] cache_resolve: EAGAIN par %p %*.*s\n",
3868 par, par->nc_nlen, par->nc_nlen, par->nc_name);
3875 * Call VOP_NRESOLVE() to get the vp, then scan for any disconnected
3876 * ncp's and reattach them. If this occurs the original ncp is marked
3877 * EAGAIN to force a relookup.
3879 * NOTE: in order to call VOP_NRESOLVE(), the parent of the passed
3880 * ncp must already be resolved.
3885 ncp->nc_error = VOP_NRESOLVE(&nctmp, dvp, cred);
3888 ncp->nc_error = EPERM;
3890 if (ncp->nc_error == EAGAIN) {
3891 kprintf("[diagnostic] cache_resolve: EAGAIN ncp %p %*.*s\n",
3892 ncp, ncp->nc_nlen, ncp->nc_nlen, ncp->nc_name);
3895 return(ncp->nc_error);
3899 * Resolve the ncp associated with a mount point. Such ncp's almost always
3900	 * remain resolved and this routine is rarely called.  NFS MPs tend to force
3901 * re-resolution more often due to its mac-truck-smash-the-namecache
3902 * method of tracking namespace changes.
3904 * The semantics for this call is that the passed ncp must be locked on
3905 * entry and will be locked on return. However, if we actually have to
3906 * resolve the mount point we temporarily unlock the entry in order to
3907 * avoid race-to-root deadlocks due to e.g. dead NFS mounts. Because of
3908 * the unlock we have to recheck the flags after we relock.
3911 cache_resolve_mp(struct mount *mp)
3913 struct namecache *ncp = mp->mnt_ncmountpt.ncp;
3917 KKASSERT(mp != NULL);
3920 * If the ncp is already resolved we have nothing to do. However,
3921	 * we do want to guarantee that a usable vnode is returned when
3922 * a vnode is present, so make sure it hasn't been reclaimed.
3924 if ((ncp->nc_flag & NCF_UNRESOLVED) == 0) {
3925 if (ncp->nc_vp && (ncp->nc_vp->v_flag & VRECLAIMED))
3926 _cache_setunresolved(ncp);
3929 if (ncp->nc_flag & NCF_UNRESOLVED) {
3931 while (vfs_busy(mp, 0))
3933 error = VFS_ROOT(mp, &vp);
3937 * recheck the ncp state after relocking.
3939 if (ncp->nc_flag & NCF_UNRESOLVED) {
3940 ncp->nc_error = error;
3942 _cache_setvp(mp, ncp, vp);
3945 kprintf("[diagnostic] cache_resolve_mp: failed"
3946 " to resolve mount %p err=%d ncp=%p\n",
3948 _cache_setvp(mp, ncp, NULL);
3950 } else if (error == 0) {
3955 return(ncp->nc_error);
3959 * Clean out negative cache entries when too many have accumulated.
3962 _cache_cleanneg(long count)
3964 struct pcpu_ncache *pn;
3965 struct namecache *ncp;
3966 static uint32_t neg_rover;
3970 n = neg_rover++; /* SMP heuristical, race ok */
3972 n = n % (uint32_t)ncpus;
3975 * Normalize vfscache_negs and count. count is sometimes based
3976	 * on vfscache_negs.  vfscache_negs is heuristic and can sometimes
3977 * have crazy values.
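/*
 * Worked example (illustrative numbers, not taken from a real system):
 * with a global total of vnegs = 10000 negative entries, this cpu
 * holding pn->neg_count = 2500 of them, and a requested count of 400,
 * the per-cpu share computed below is 2500 * 400 / 10000 + 1 = 101.
 */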
3979 vnegs = vfscache_negs;
3981 if (vnegs <= MINNEG)
3986 pn = &pcpu_ncache[n];
3987 spin_lock(&pn->neg_spin);
3988 count = pn->neg_count * count / vnegs + 1;
3989 spin_unlock(&pn->neg_spin);
3992 * Attempt to clean out the specified number of negative cache
3996 spin_lock(&pn->neg_spin);
3997 ncp = TAILQ_FIRST(&pn->neg_list);
3999 spin_unlock(&pn->neg_spin);
4002 TAILQ_REMOVE(&pn->neg_list, ncp, nc_vnode);
4003 TAILQ_INSERT_TAIL(&pn->neg_list, ncp, nc_vnode);
4005 spin_unlock(&pn->neg_spin);
4008 * This can race, so we must re-check that the ncp
4009 * is on the ncneg.list after successfully locking it.
4011 if (_cache_lock_special(ncp) == 0) {
4012 if (ncp->nc_vp == NULL &&
4013 (ncp->nc_flag & NCF_UNRESOLVED) == 0) {
4027 * Clean out positive cache entries when too many have accumulated.
4030 _cache_cleanpos(long count)
4032 static volatile int rover;
4033 struct nchash_head *nchpp;
4034 struct namecache *ncp;
4038	 * Attempt to clean out the specified number of positive cache
4042 rover_copy = ++rover; /* MPSAFEENOUGH */
4044 nchpp = NCHHASH(rover_copy);
4046 if (TAILQ_FIRST(&nchpp->list) == NULL) {
4052 * Cycle ncp on list, ignore and do not move DUMMY
4053 * ncps. These are temporary list iterators.
4055 * We must cycle the ncp to the end of the list to
4056 * ensure that all ncp's have an equal chance of
4059 spin_lock(&nchpp->spin);
4060 ncp = TAILQ_FIRST(&nchpp->list);
4061 while (ncp && (ncp->nc_flag & NCF_DUMMY))
4062 ncp = TAILQ_NEXT(ncp, nc_hash);
4064 TAILQ_REMOVE(&nchpp->list, ncp, nc_hash);
4065 TAILQ_INSERT_TAIL(&nchpp->list, ncp, nc_hash);
4068 spin_unlock(&nchpp->spin);
4071 if (_cache_lock_special(ncp) == 0) {
4082 * This is a kitchen sink function to clean out ncps which we
4083 * tried to zap from cache_drop() but failed because we were
4084 * unable to acquire the parent lock.
4086 * Such entries can also be removed via cache_inval_vp(), such
4087 * as when unmounting.
4090 _cache_cleandefered(void)
4092 struct nchash_head *nchpp;
4093 struct namecache *ncp;
4094 struct namecache dummy;
4098 * Create a list iterator. DUMMY indicates that this is a list
4099 * iterator, DESTROYED prevents matches by lookup functions.
4102 pcpu_ncache[mycpu->gd_cpuid].numdefered = 0;
4103 bzero(&dummy, sizeof(dummy));
4104 dummy.nc_flag = NCF_DESTROYED | NCF_DUMMY;
4107 for (i = 0; i <= nchash; ++i) {
4108 nchpp = &nchashtbl[i];
4110 spin_lock(&nchpp->spin);
4111 TAILQ_INSERT_HEAD(&nchpp->list, &dummy, nc_hash);
4113 while ((ncp = TAILQ_NEXT(ncp, nc_hash)) != NULL) {
4114 if ((ncp->nc_flag & NCF_DEFEREDZAP) == 0)
4116 TAILQ_REMOVE(&nchpp->list, &dummy, nc_hash);
4117 TAILQ_INSERT_AFTER(&nchpp->list, ncp, &dummy, nc_hash);
4119 spin_unlock(&nchpp->spin);
4120 if (_cache_lock_nonblock(ncp) == 0) {
4121 ncp->nc_flag &= ~NCF_DEFEREDZAP;
4125 spin_lock(&nchpp->spin);
4128 TAILQ_REMOVE(&nchpp->list, &dummy, nc_hash);
4129 spin_unlock(&nchpp->spin);
4134 * Name cache initialization, from vfsinit() when we are booting
4139 struct pcpu_ncache *pn;
4144 * Per-cpu accounting and negative hit list
4146 pcpu_ncache = kmalloc(sizeof(*pcpu_ncache) * ncpus,
4147 M_VFSCACHE, M_WAITOK|M_ZERO);
4148 for (i = 0; i < ncpus; ++i) {
4149 pn = &pcpu_ncache[i];
4150 TAILQ_INIT(&pn->neg_list);
4151 spin_init(&pn->neg_spin, "ncneg");
4152 spin_init(&pn->umount_spin, "ncumm");
4156 * Initialise per-cpu namecache effectiveness statistics.
4158 for (i = 0; i < ncpus; ++i) {
4159 gd = globaldata_find(i);
4160 gd->gd_nchstats = &nchstats[i];
4164 * Create a generous namecache hash table
4166 nchashtbl = hashinit_ext(vfs_inodehashsize(),
4167 sizeof(struct nchash_head),
4168 M_VFSCACHE, &nchash);
4169 for (i = 0; i <= (int)nchash; ++i) {
4170 TAILQ_INIT(&nchashtbl[i].list);
4171 spin_init(&nchashtbl[i].spin, "nchinit_hash");
4173 for (i = 0; i < NCMOUNT_NUMCACHE; ++i)
4174 spin_init(&ncmount_cache[i].spin, "nchinit_cache");
4175 nclockwarn = 5 * hz;
4179 * Called from start_init() to bootstrap the root filesystem. Returns
4180 * a referenced, unlocked namecache record.
4183 cache_allocroot(struct nchandle *nch, struct mount *mp, struct vnode *vp)
4185 nch->ncp = cache_alloc(0);
4189 _cache_setvp(nch->mount, nch->ncp, vp);
4193 * vfs_cache_setroot()
4195 * Create an association between the root of our namecache and
4196 * the root vnode. This routine may be called several times during
4199 * If the caller intends to save the returned namecache pointer somewhere
4200 * it must cache_hold() it.
4203 vfs_cache_setroot(struct vnode *nvp, struct nchandle *nch)
4206 struct nchandle onch;
4214 cache_zero(&rootnch);
4222 * XXX OLD API COMPAT FUNCTION. This really messes up the new namecache
4223 * topology and is being removed as quickly as possible. The new VOP_N*()
4224 * API calls are required to make specific adjustments using the supplied
4225	 * ncp pointers rather than just bogusly purging random vnodes.
4227 * Invalidate all namecache entries to a particular vnode as well as
4228 * any direct children of that vnode in the namecache. This is a
4229 * 'catch all' purge used by filesystems that do not know any better.
4231 * Note that the linkage between the vnode and its namecache entries will
4232 * be removed, but the namecache entries themselves might stay put due to
4233	 * active references from elsewhere in the system or due to the existence of
4234 * the children. The namecache topology is left intact even if we do not
4235 * know what the vnode association is. Such entries will be marked
4239 cache_purge(struct vnode *vp)
4241 cache_inval_vp(vp, CINV_DESTROY | CINV_CHILDREN);
4244 static int disablecwd;
4245 SYSCTL_INT(_debug, OID_AUTO, disablecwd, CTLFLAG_RW, &disablecwd, 0,
4248 static u_long numcwdcalls;
4249 SYSCTL_ULONG(_vfs_cache, OID_AUTO, numcwdcalls, CTLFLAG_RD, &numcwdcalls, 0,
4250 "Number of current directory resolution calls");
4251 static u_long numcwdfailnf;
4252 SYSCTL_ULONG(_vfs_cache, OID_AUTO, numcwdfailnf, CTLFLAG_RD, &numcwdfailnf, 0,
4253 "Number of current directory failures due to lack of file");
4254 static u_long numcwdfailsz;
4255 SYSCTL_ULONG(_vfs_cache, OID_AUTO, numcwdfailsz, CTLFLAG_RD, &numcwdfailsz, 0,
4256 "Number of current directory failures due to large result");
4257 static u_long numcwdfound;
4258 SYSCTL_ULONG(_vfs_cache, OID_AUTO, numcwdfound, CTLFLAG_RD, &numcwdfound, 0,
4259 "Number of current directory resolution successes");
4265 sys___getcwd(struct __getcwd_args *uap)
4275 buflen = uap->buflen;
4278 if (buflen > MAXPATHLEN)
4279 buflen = MAXPATHLEN;
4281 buf = kmalloc(buflen, M_TEMP, M_WAITOK);
4282 bp = kern_getcwd(buf, buflen, &error);
4284 error = copyout(bp, uap->buf, strlen(bp) + 1);
4290 kern_getcwd(char *buf, size_t buflen, int *error)
4292 struct proc *p = curproc;
4294 int i, slash_prefixed;
4295 struct filedesc *fdp;
4296 struct nchandle nch;
4297 struct namecache *ncp;
4306 nch = fdp->fd_ncdir;
4311 while (ncp && (ncp != fdp->fd_nrdir.ncp ||
4312 nch.mount != fdp->fd_nrdir.mount)
4315 * While traversing upwards if we encounter the root
4316 * of the current mount we have to skip to the mount point
4317 * in the underlying filesystem.
4319 if (ncp == nch.mount->mnt_ncmountpt.ncp) {
4320 nch = nch.mount->mnt_ncmounton;
4329 * Prepend the path segment
4331 for (i = ncp->nc_nlen - 1; i >= 0; i--) {
4338 *--bp = ncp->nc_name[i];
4350 * Go up a directory. This isn't a mount point so we don't
4351 * have to check again.
4353 while ((nch.ncp = ncp->nc_parent) != NULL) {
4354 if (ncp_shared_lock_disable)
4357 _cache_lock_shared(ncp);
4358 if (nch.ncp != ncp->nc_parent) {
4362 _cache_hold(nch.ncp);
4375 if (!slash_prefixed) {
4393 * Thus begins the fullpath magic.
4395 * The passed nchp is referenced but not locked.
4397 static int disablefullpath;
4398 SYSCTL_INT(_debug, OID_AUTO, disablefullpath, CTLFLAG_RW,
4399 &disablefullpath, 0,
4400 "Disable fullpath lookups");
4403 cache_fullpath(struct proc *p, struct nchandle *nchp, struct nchandle *nchbase,
4404 char **retbuf, char **freebuf, int guess)
4406 struct nchandle fd_nrdir;
4407 struct nchandle nch;
4408 struct namecache *ncp;
4409 struct mount *mp, *new_mp;
4418 buf = kmalloc(MAXPATHLEN, M_TEMP, M_WAITOK);
4419 bp = buf + MAXPATHLEN - 1;
4422 fd_nrdir = *nchbase;
4424 fd_nrdir = p->p_fd->fd_nrdir;
4434 while (ncp && (ncp != fd_nrdir.ncp || mp != fd_nrdir.mount)) {
4438 * If we are asked to guess the upwards path, we do so whenever
4439 * we encounter an ncp marked as a mountpoint. We try to find
4440 * the actual mountpoint by finding the mountpoint with this
4443 if (guess && (ncp->nc_flag & NCF_ISMOUNTPT)) {
4444 new_mp = mount_get_by_nc(ncp);
4447 * While traversing upwards if we encounter the root
4448 * of the current mount we have to skip to the mount point.
4450 if (ncp == mp->mnt_ncmountpt.ncp) {
4454 nch = new_mp->mnt_ncmounton;
4464 * Prepend the path segment
4466 for (i = ncp->nc_nlen - 1; i >= 0; i--) {
4472 *--bp = ncp->nc_name[i];
4483 * Go up a directory. This isn't a mount point so we don't
4484 * have to check again.
4486 * We can only safely access nc_parent with ncp held locked.
4488 while ((nch.ncp = ncp->nc_parent) != NULL) {
4489 _cache_lock_shared(ncp);
4490 if (nch.ncp != ncp->nc_parent) {
4494 _cache_hold(nch.ncp);
4507 if (!slash_prefixed) {
4525 vn_fullpath(struct proc *p, struct vnode *vn, char **retbuf,
4526 char **freebuf, int guess)
4528 struct namecache *ncp;
4529 struct nchandle nch;
4533 if (disablefullpath)
4539 /* vn is NULL, client wants us to use p->p_textvp */
4541 if ((vn = p->p_textvp) == NULL)
4544 spin_lock_shared(&vn->v_spin);
4545 TAILQ_FOREACH(ncp, &vn->v_namecache, nc_vnode) {
4550 spin_unlock_shared(&vn->v_spin);
4554 spin_unlock_shared(&vn->v_spin);
4557 nch.mount = vn->v_mount;
4558 error = cache_fullpath(p, &nch, NULL, retbuf, freebuf, guess);
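/*
 * Illustrative consumer sketch (compiled out, hypothetical helper):
 * resolve and print the path of a vnode.  retbuf points into freebuf,
 * which was allocated with M_TEMP by cache_fullpath() above, so the
 * caller frees freebuf once it is done with the string.
 */
#if 0
static void
example_print_vnode_path(struct proc *p, struct vnode *vp)
{
	char	*retbuf;
	char	*freebuf;
	int	error;

	error = vn_fullpath(p, vp, &retbuf, &freebuf, 0);
	if (error == 0) {
		kprintf("vnode %p path %s\n", vp, retbuf);
		kfree(freebuf, M_TEMP);
	}
}
#endif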
4564 vfscache_rollup_cpu(struct globaldata *gd)
4566 struct pcpu_ncache *pn;
4569 if (pcpu_ncache == NULL)
4571 pn = &pcpu_ncache[gd->gd_cpuid];
4573 if (pn->vfscache_count) {
4574 count = atomic_swap_long(&pn->vfscache_count, 0);
4575 atomic_add_long(&vfscache_count, count);
4577 if (pn->vfscache_leafs) {
4578 count = atomic_swap_long(&pn->vfscache_leafs, 0);
4579 atomic_add_long(&vfscache_leafs, count);
4581 if (pn->vfscache_negs) {
4582 count = atomic_swap_long(&pn->vfscache_negs, 0);
4583 atomic_add_long(&vfscache_negs, count);
4585 if (pn->numdefered) {
4586 count = atomic_swap_long(&pn->numdefered, 0);
4587 atomic_add_long(&numdefered, count);