2 * Copyright (c) 2003,2004 The DragonFly Project. All rights reserved.
4 * This code is derived from software contributed to The DragonFly Project
5 * by Matthew Dillon <dillon@backplane.com>
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions
11 * 1. Redistributions of source code must retain the above copyright
12 * notice, this list of conditions and the following disclaimer.
13 * 2. Redistributions in binary form must reproduce the above copyright
14 * notice, this list of conditions and the following disclaimer in
15 * the documentation and/or other materials provided with the
17 * 3. Neither the name of The DragonFly Project nor the names of its
18 * contributors may be used to endorse or promote products derived
19 * from this software without specific, prior written permission.
21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
24 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
25 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
26 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
27 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
28 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
29 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
30 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
31 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
34 * Copyright (c) 1989, 1993, 1995
35 * The Regents of the University of California. All rights reserved.
37 * This code is derived from software contributed to Berkeley by
38 * Poul-Henning Kamp of the FreeBSD Project.
40 * Redistribution and use in source and binary forms, with or without
41 * modification, are permitted provided that the following conditions
43 * 1. Redistributions of source code must retain the above copyright
44 * notice, this list of conditions and the following disclaimer.
45 * 2. Redistributions in binary form must reproduce the above copyright
46 * notice, this list of conditions and the following disclaimer in the
47 * documentation and/or other materials provided with the distribution.
48 * 3. All advertising materials mentioning features or use of this software
49 * must display the following acknowledgement:
50 * This product includes software developed by the University of
51 * California, Berkeley and its contributors.
52 * 4. Neither the name of the University nor the names of its contributors
53 * may be used to endorse or promote products derived from this software
54 * without specific prior written permission.
56 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
57 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
58 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
59 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
60 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
61 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
62 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
63 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
64 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
65 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
68 * @(#)vfs_cache.c 8.5 (Berkeley) 3/22/95
69 * $FreeBSD: src/sys/kern/vfs_cache.c,v 1.42.2.6 2001/10/05 20:07:03 dillon Exp $
70 * $DragonFly: src/sys/kern/vfs_cache.c,v 1.31 2004/10/02 03:18:26 dillon Exp $
73 #include <sys/param.h>
74 #include <sys/systm.h>
75 #include <sys/kernel.h>
76 #include <sys/sysctl.h>
77 #include <sys/mount.h>
78 #include <sys/vnode.h>
79 #include <sys/malloc.h>
80 #include <sys/sysproto.h>
82 #include <sys/namei.h>
83 #include <sys/nlookup.h>
84 #include <sys/filedesc.h>
85 #include <sys/fnv_hash.h>
86 #include <sys/globaldata.h>
87 #include <sys/kern_syscall.h>
90 * Random lookups in the cache are accomplished with a hash table using
91 * a hash key of (nc_src_vp, name).
93 * Negative entries may exist and correspond to structures where nc_vp
94 * is NULL. In a negative entry, NCF_WHITEOUT will be set if the entry
95 * corresponds to a whited-out directory entry (verses simply not finding the
98 * Upon reaching the last segment of a path, if the reference is for DELETE,
99 * or NOCACHE is set (rewrite), and the name is located in the cache, it
104 * Structures associated with name cacheing.
/*
 * Name cache hash table, negative-entry list, and debug.* tunables.
 * NOTE(review): numeric prefixes are original line numbers; some lines
 * were stripped from this listing.
 */
106 #define NCHHASH(hash) (&nchashtbl[(hash) & nchash])
109 MALLOC_DEFINE(M_VFSCACHE, "vfscache", "VFS name cache entries");
/* Hash chains of namecache entries; negative entries queue on ncneglist. */
111 static LIST_HEAD(nchashhead, namecache) *nchashtbl; /* Hash Table */
112 static struct namecache_list ncneglist; /* instead of vnode */
/* nchash is the hash mask returned by hashinit(), read-only via sysctl. */
114 static u_long nchash; /* size of hash table */
115 SYSCTL_ULONG(_debug, OID_AUTO, nchash, CTLFLAG_RD, &nchash, 0, "");
/* Tunable: allow at most numcache/ncnegfactor negative entries. */
117 static u_long ncnegfactor = 16; /* ratio of negative entries */
118 SYSCTL_ULONG(_debug, OID_AUTO, ncnegfactor, CTLFLAG_RW, &ncnegfactor, 0, "");
120 static u_long numneg; /* number of negative cache entries */
121 SYSCTL_ULONG(_debug, OID_AUTO, numneg, CTLFLAG_RD, &numneg, 0, "");
123 static u_long numcache; /* number of cache entries allocated */
124 SYSCTL_ULONG(_debug, OID_AUTO, numcache, CTLFLAG_RD, &numcache, 0, "");
126 static u_long numunres; /* number of unresolved entries */
127 SYSCTL_ULONG(_debug, OID_AUTO, numunres, CTLFLAG_RD, &numunres, 0, "");
/* Structure sizes exported for debugging convenience. */
129 SYSCTL_INT(_debug, OID_AUTO, vnsize, CTLFLAG_RD, 0, sizeof(struct vnode), "");
130 SYSCTL_INT(_debug, OID_AUTO, ncsize, CTLFLAG_RD, 0, sizeof(struct namecache), "");
/* Forward declaration: resolver for mount-point namecache entries. */
132 static int cache_resolve_mp(struct namecache *ncp);
135 * The new name cache statistics
137 SYSCTL_NODE(_vfs, OID_AUTO, cache, CTLFLAG_RW, 0, "Name cache statistics");
/* Helper macro: export one u_long statistic under the vfs.cache node. */
138 #define STATNODE(mode, name, var) \
139 SYSCTL_ULONG(_vfs_cache, OID_AUTO, name, mode, var, 0, "");
140 STATNODE(CTLFLAG_RD, numneg, &numneg);
141 STATNODE(CTLFLAG_RD, numcache, &numcache);
/* Old-API lookup counters: calls, "." and ".." hits, chain checks,
 * misses, and hit/zap breakdowns for positive and negative entries. */
142 static u_long numcalls; STATNODE(CTLFLAG_RD, numcalls, &numcalls);
143 static u_long dothits; STATNODE(CTLFLAG_RD, dothits, &dothits);
144 static u_long dotdothits; STATNODE(CTLFLAG_RD, dotdothits, &dotdothits);
145 static u_long numchecks; STATNODE(CTLFLAG_RD, numchecks, &numchecks);
146 static u_long nummiss; STATNODE(CTLFLAG_RD, nummiss, &nummiss);
147 static u_long nummisszap; STATNODE(CTLFLAG_RD, nummisszap, &nummisszap);
148 static u_long numposzaps; STATNODE(CTLFLAG_RD, numposzaps, &numposzaps);
149 static u_long numposhits; STATNODE(CTLFLAG_RD, numposhits, &numposhits);
150 static u_long numnegzaps; STATNODE(CTLFLAG_RD, numnegzaps, &numnegzaps);
151 static u_long numneghits; STATNODE(CTLFLAG_RD, numneghits, &numneghits);
/* Per-cpu effectiveness statistics; aggregation is left to user-land. */
153 struct nchstats nchstats[SMP_MAXCPU];
155 * Export VFS cache effectiveness statistics to user-land.
157 * The statistics are left for aggregation to user-land so
158 * neat things can be achieved, like observing per-CPU cache
/* sysctl handler: copy each cpu's nchstats block out sequentially. */
162 sysctl_nchstats(SYSCTL_HANDLER_ARGS)
164 struct globaldata *gd;
168 for (i = 0; i < ncpus; ++i) {
169 gd = globaldata_find(i);
/* NOTE(review): the error-handling/return lines of this handler are
 * missing from this listing — consult the repository copy. */
170 if ((error = SYSCTL_OUT(req, (void *)&(*gd->gd_nchstats),
171 sizeof(struct nchstats))))
/* Register vfs.cache.nchstats as an opaque read-only blob. */
177 SYSCTL_PROC(_vfs_cache, OID_AUTO, nchstats, CTLTYPE_OPAQUE|CTLFLAG_RD,
178 0, 0, sysctl_nchstats, "S,nchstats", "VFS cache effectiveness statistics");
/* Forward declaration: cache_zap() is defined below. */
180 static void cache_zap(struct namecache *ncp);
183 * cache_hold() and cache_drop() prevent the premature deletion of a
184 * namecache entry but do not prevent operations (such as zapping) on
185 * that namecache entry.
189 _cache_hold(struct namecache *ncp)
197 _cache_drop(struct namecache *ncp)
199 KKASSERT(ncp->nc_refs > 0);
200 if (ncp->nc_refs == 1 &&
201 (ncp->nc_flag & NCF_UNRESOLVED) &&
202 TAILQ_EMPTY(&ncp->nc_list)
211 * Link a new namecache entry to its parent. Be careful to avoid races
212 * if vhold() blocks in the future.
215 cache_link_parent(struct namecache *ncp, struct namecache *par)
217 KKASSERT(ncp->nc_parent == NULL);
218 ncp->nc_parent = par;
219 if (TAILQ_EMPTY(&par->nc_list)) {
220 TAILQ_INSERT_HEAD(&par->nc_list, ncp, nc_entry);
222 * Any vp associated with an ncp which has children must
223 * be held to prevent it from being recycled.
228 TAILQ_INSERT_HEAD(&par->nc_list, ncp, nc_entry);
233 * Remove the parent association from a namecache structure.
236 cache_unlink_parent(struct namecache *ncp)
238 struct namecache *par;
240 if ((par = ncp->nc_parent) != NULL) {
241 ncp->nc_parent = NULL;
242 par = cache_hold(par);
243 TAILQ_REMOVE(&par->nc_list, ncp, nc_entry);
244 if (par->nc_vp && TAILQ_EMPTY(&par->nc_list))
251 * Allocate a new namecache structure.
/* NOTE(review): the function-name line and the return/ref-init tail of
 * cache_alloc() are missing from this listing — do not infer them. */
253 static struct namecache *
256 struct namecache *ncp;
/* Zeroed allocation; entries start life unresolved and ENOTCONN marks
 * "needs to be resolved" until cache_setvp() runs. */
258 ncp = malloc(sizeof(*ncp), M_VFSCACHE, M_WAITOK|M_ZERO);
259 ncp->nc_flag = NCF_UNRESOLVED;
260 ncp->nc_error = ENOTCONN; /* needs to be resolved */
261 TAILQ_INIT(&ncp->nc_list);
/*
 * Ref and deref a namecache structure.
 *
 * Public wrappers around the inline _cache_hold()/_cache_drop().
 */
struct namecache *
cache_hold(struct namecache *ncp)
{
	return(_cache_hold(ncp));
}

void
cache_drop(struct namecache *ncp)
{
	_cache_drop(ncp);
}
282 * Namespace locking. The caller must already hold a reference to the
283 * namecache structure in order to lock/unlock it. This function prevents
284 * the namespace from being created or destroyed by accessors other then
287 * Note that holding a locked namecache structure prevents other threads
288 * from making namespace changes (e.g. deleting or creating), prevents
289 * vnode association state changes by other threads, and prevents the
290 * namecache entry from being resolved or unresolved by other threads.
292 * The lock owner has full authority to associate/disassociate vnodes
293 * and resolve/unresolve the locked ncp.
295 * In particular, if a vnode is associated with a locked cache entry
296 * that vnode will *NOT* be recycled. We accomplish this by vhold()ing the
297 * vnode. XXX we should find a more efficient way to prevent the vnode
298 * from being recycled, but remember that any given vnode may have multiple
299 * namecache associations (think hardlinks).
/* Acquire the (recursive) exclusive namespace lock on ncp.  NOTE(review):
 * the lock-acquisition/retry-loop body is heavily stripped in this
 * listing; only the skeleton below survives. */
302 cache_lock(struct namecache *ncp)
307 KKASSERT(ncp->nc_refs != 0);
/* First lock acquisition: the associated vp must also be held. */
312 if (ncp->nc_exlocks == 0) {
316 * The vp associated with a locked ncp must be held
317 * to prevent it from being recycled (which would
318 * cause the ncp to become unresolved).
320 * XXX loop on race for later MPSAFE work.
/* Recursive acquisition by the current lock owner. */
326 if (ncp->nc_locktd == td) {
/* Contended: request a wakeup and tsleep; diagnose long blockage. */
330 ncp->nc_flag |= NCF_LOCKREQ;
331 if (tsleep(ncp, 0, "clock", hz) == EWOULDBLOCK) {
334 printf("[diagnostic] cache_lock: blocked on %*.*s\n",
335 ncp->nc_nlen, ncp->nc_nlen,
342 printf("[diagnostic] cache_lock: unblocked %*.*s\n",
343 ncp->nc_nlen, ncp->nc_nlen, ncp->nc_name);
348 cache_unlock(struct namecache *ncp)
350 thread_t td = curthread;
352 KKASSERT(ncp->nc_refs > 0);
353 KKASSERT(ncp->nc_exlocks > 0);
354 KKASSERT(ncp->nc_locktd == td);
355 if (--ncp->nc_exlocks == 0) {
358 ncp->nc_locktd = NULL;
359 if (ncp->nc_flag & NCF_LOCKREQ) {
360 ncp->nc_flag &= ~NCF_LOCKREQ;
/*
 * ref-and-lock, unlock-and-deref functions.
 */
struct namecache *
cache_get(struct namecache *ncp)
{
	/* Take a reference first, then acquire the namespace lock. */
	_cache_hold(ncp);
	cache_lock(ncp);
	return(ncp);
}

void
cache_put(struct namecache *ncp)
{
	/* Mirror of cache_get(): unlock, then drop our reference. */
	cache_unlock(ncp);
	_cache_drop(ncp);
}
385 * Resolve an unresolved ncp by associating a vnode with it. If the
386 * vnode is NULL, a negative cache entry is created.
388 * The ncp should be locked on entry and will remain locked on return.
402 cache_setvp(struct namecache *ncp, struct vnode *vp)
393 KKASSERT(ncp->nc_flag & NCF_UNRESOLVED);
397 * Any vp associated with an ncp which has children must
398 * be held. Any vp associated with a locked ncp must be held.
/* Positive entry: link onto the vnode's v_namecache list. */
400 if (!TAILQ_EMPTY(&ncp->nc_list))
402 TAILQ_INSERT_HEAD(&vp->v_namecache, ncp, nc_vnode);
407 * Set auxillary flags
411 ncp->nc_flag |= NCF_ISDIR;
414 ncp->nc_flag |= NCF_ISSYMLINK;
415 /* XXX cache the contents of the symlink */
/* Negative entry (vp == NULL): queue on ncneglist and record ENOENT.
 * NOTE(review): the branch structure is stripped in this listing. */
423 TAILQ_INSERT_TAIL(&ncneglist, ncp, nc_vnode);
425 ncp->nc_error = ENOENT;
/* Entry is now resolved, positively or negatively. */
427 ncp->nc_flag &= ~NCF_UNRESOLVED;
431 * Disassociate the vnode or negative-cache association and mark a
432 * namecache entry as unresolved again. Note that the ncp is still
433 * left in the hash table and still linked to its parent.
435 * The ncp should be locked on entry and will remain locked on return.
438 cache_setunresolved(struct namecache *ncp)
442 if ((ncp->nc_flag & NCF_UNRESOLVED) == 0) {
443 ncp->nc_flag |= NCF_UNRESOLVED;
444 ncp->nc_flag &= ~(NCF_WHITEOUT|NCF_ISDIR|NCF_ISSYMLINK);
445 ncp->nc_error = ENOTCONN;
/* Positive entry: detach from the vnode's v_namecache list. */
447 if ((vp = ncp->nc_vp) != NULL) {
449 ncp->nc_vp = NULL; /* safety */
450 TAILQ_REMOVE(&vp->v_namecache, ncp, nc_vnode);
453 * Any vp associated with an ncp with children is
454 * held by that ncp. Any vp associated with a locked
455 * ncp is held by that ncp. These conditions must be
456 * undone when the vp is cleared out from the ncp.
457 */ comment continues above; vdrop() lines are stripped here (review)
458 if (!TAILQ_EMPTY(&ncp->nc_list))
/* Negative entry: remove from ncneglist. */
463 TAILQ_REMOVE(&ncneglist, ncp, nc_vnode);
470 * vget the vnode associated with the namecache entry. Resolve the namecache
471 * entry if necessary and deal with namecache/vp races. The passed ncp must
472 * be referenced and may be locked. The ncp's ref/locking state is not
473 * effected by this call.
475 * lk_type may be LK_SHARED, LK_EXCLUSIVE. A ref'd, possibly locked
476 * (depending on the passed lk_type) will be returned in *vpp with an error
477 * of 0, or NULL will be returned in *vpp with a non-0 error code. The
478 * most typical error is ENOENT, meaning that the ncp represents a negative
479 * cache hit and there is no vnode to retrieve, but other errors can occur
482 * The main race we have to deal with are namecache zaps. The ncp itself
483 * will not disappear since it is referenced, and it turns out that the
484 * validity of the vp pointer can be checked simply by rechecking the
485 * contents of ncp->nc_vp.
488 cache_vget(struct namecache *ncp, struct ucred *cred,
489 int lk_type, struct vnode **vpp)
/* Resolve first if necessary; NOTE(review): the retry-loop scaffolding
 * is stripped from this listing. */
496 if (ncp->nc_flag & NCF_UNRESOLVED) {
498 error = cache_resolve(ncp, cred);
503 if (error == 0 && (vp = ncp->nc_vp) != NULL) {
504 error = vget(vp, NULL, lk_type, curthread);
/* Re-check nc_vp after vget() to detect a concurrent cache_zap(). */
506 if (vp != ncp->nc_vp) /* handle cache_zap race */
509 } else if (vp != ncp->nc_vp) { /* handle cache_zap race */
/* Resolved but no vnode: this is a negative hit, report ENOENT. */
514 if (error == 0 && vp == NULL)
/* Like cache_vget() but only vref()s the vnode (no lock taken).
 * NOTE(review): most of this function's scaffolding is stripped here. */
521 cache_vref(struct namecache *ncp, struct ucred *cred, struct vnode **vpp)
528 if (ncp->nc_flag & NCF_UNRESOLVED) {
530 error = cache_resolve(ncp, cred);
532 if (error == 0 && (vp = ncp->nc_vp) != NULL) { (see next line; review)
535 if (error == 0 && (vp = ncp->nc_vp) != NULL) {
/* Re-check nc_vp to detect a concurrent cache_zap(). */
537 if (vp != ncp->nc_vp) { /* handle cache_zap race */
/* Negative hit: no vnode, report ENOENT. */
542 if (error == 0 && vp == NULL)
549 * Try to destroy a namecache entry. The entry is disassociated from its
550 * vnode or ncneglist and reverted to an UNRESOLVED state.
552 * Then, if there are no additional references to the ncp and we can
553 * successfully delete the children, the entry is also removed from the
554 * namecache hashlist / topology.
556 * References or undeletable children will prevent the entry from being
557 * removed from the topology. The entry may be revalidated (typically
558 * by cache_enter()) at a later time. Children remain because:
560 * + we have tried to delete a node rather then a leaf in the topology.
561 * + the presence of negative entries (we try to scrap these).
562 * + an entry or child has a non-zero ref count and cannot be scrapped.
564 * This function must be called with the ncp held and will drop the ref
565 * count during zapping.
568 cache_zap(struct namecache *ncp)
570 struct namecache *par;
573 * Disassociate the vnode or negative cache ref and set NCF_UNRESOLVED.
575 cache_setunresolved(ncp);
578 * Try to scrap the entry and possibly tail-recurse on its parent.
579 * We only scrap unref'd (other then our ref) unresolved entries,
580 * we do not scrap 'live' entries.
582 while (ncp->nc_flag & NCF_UNRESOLVED) {
584 * Someone other then us has a ref, stop.
586 if (ncp->nc_refs > 1)
590 * We have children, stop.
592 if (!TAILQ_EMPTY(&ncp->nc_list))
/* Remove from the hash chain before freeing. */
595 if (ncp->nc_flag & NCF_HASHED) {
596 ncp->nc_flag &= ~NCF_HASHED;
597 LIST_REMOVE(ncp, nc_hash);
601 * Unlink from its parent and free, then loop on the
602 * parent. XXX temp hack, in stage-3 parent is never NULL
604 if ((par = ncp->nc_parent) != NULL) {
605 par = cache_hold(par);
606 TAILQ_REMOVE(&par->nc_list, ncp, nc_entry);
/* NOTE(review): the vdrop()-on-last-child line is stripped here. */
607 if (par->nc_vp && TAILQ_EMPTY(&par->nc_list))
/* Poison fields before freeing to catch use-after-free. */
611 ncp->nc_refs = -1; /* safety */
612 ncp->nc_parent = NULL; /* safety */
614 free(ncp->nc_name, M_VFSCACHE);
615 free(ncp, M_VFSCACHE);
/* Tail-iterate onto the parent (loop continues with ncp = par). */
617 if (par == NULL) /* temp hack */
618 return; /* temp hack */
625 * NEW NAMECACHE LOOKUP API
627 * Lookup an entry in the cache. A locked, referenced, non-NULL
628 * entry is *always* returned, even if the supplied component is illegal.
629 * The returned namecache entry should be returned to the system with
630 * cache_put() or cache_unlock() + cache_drop().
632 * namecache locks are recursive but care must be taken to avoid lock order
635 * Nobody else will be able to manipulate the associated namespace (e.g.
636 * create, delete, rename, rename-target) until the caller unlocks the
639 * The returned entry will be in one of three states: positive hit (non-null
640 * vnode), negative hit (null vnode), or unresolved (NCF_UNRESOLVED is set).
641 * Unresolved entries must be resolved through the filesystem to associate the
642 * vnode and/or determine whether a positive or negative hit has occured.
644 * It is not necessary to lock a directory in order to lock namespace under
645 * that directory. In fact, it is explicitly not allowed to do that. A
646 * directory is typically only locked when being created, renamed, or
649 * The directory (par) may be unresolved, in which case any returned child
650 * will likely also be marked unresolved. Likely but not guarenteed. Since
651 * the filesystem VOP_NEWLOOKUP() requires a resolved directory vnode the
652 * caller is responsible for resolving the namecache chain top-down. This API
653 * specifically allows whole chains to be created in an unresolved state.
656 cache_nlookup(struct namecache *par, struct nlcomponent *nlc)
658 struct namecache *ncp;
659 struct namecache *new_ncp;
660 struct nchashhead *nchpp;
668 * Try to locate an existing entry
/* Hash on (name, parent pointer) — see the NCHHASH macro. */
670 hash = fnv_32_buf(nlc->nlc_nameptr, nlc->nlc_namelen, FNV1_32_INIT);
671 hash = fnv_32_buf(&par, sizeof(par), hash);
674 LIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) {
678 * Zap entries that have timed out.
/* Signed tick delta handles wraparound of the ticks counter. */
680 if (ncp->nc_timeout &&
681 (int)(ncp->nc_timeout - ticks) < 0
683 cache_zap(cache_hold(ncp));
688 * Break out if we find a matching entry. Note that
689 * UNRESOLVED entries may match.
691 if (ncp->nc_parent == par &&
692 ncp->nc_nlen == nlc->nlc_namelen &&
693 bcmp(ncp->nc_name, nlc->nlc_nameptr, ncp->nc_nlen) == 0
/* Found on retry: discard the speculatively allocated new_ncp. */
696 free(new_ncp->nc_name, M_VFSCACHE);
697 free(new_ncp, M_VFSCACHE);
704 * We failed to locate an entry, create a new entry and add it to
705 * the cache. We have to relookup after possibly blocking in
708 if (new_ncp == NULL) {
709 new_ncp = cache_alloc();
710 new_ncp->nc_name = malloc(nlc->nlc_namelen,
711 M_VFSCACHE, M_WAITOK);
718 * Initialize as a new UNRESOLVED entry, lock (non-blocking),
719 * and link to the parent.
721 ncp->nc_nlen = nlc->nlc_namelen;
722 bcopy(nlc->nlc_nameptr, ncp->nc_name, nlc->nlc_namelen);
723 nchpp = NCHHASH(hash);
724 LIST_INSERT_HEAD(nchpp, ncp, nc_hash);
725 ncp->nc_flag |= NCF_HASHED;
727 cache_link_parent(ncp, par);
731 * Entry found. Cleanup any dangling new_ncp, ref and lock
740 * Resolve an unresolved namecache entry, generally by looking it up.
741 * The passed ncp must be locked.
743 * Theoretically since a vnode cannot be recycled while held, and since
744 * the nc_parent chain holds its vnode as long as children exist, the
745 * direct parent of the cache entry we are trying to resolve should
746 * have a valid vnode. If not then generate an error that we can
747 * determine is related to a resolver bug.
750 cache_resolve(struct namecache *ncp, struct ucred *cred)
752 struct namecache *par;
755 * Mount points need special handling because the parent does not
756 * belong to the same filesystem as the ncp.
758 if (ncp->nc_flag & NCF_MOUNTPT) {
759 return (cache_resolve_mp(ncp));
763 * We expect an unbroken chain of ncps to at least the mount point,
764 * and even all the way to root (but this code doesn't have to go
765 * past the mount point).
767 if (ncp->nc_parent == NULL) {
768 printf("EXDEV case 1 %*.*s\n",
769 ncp->nc_nlen, ncp->nc_nlen, ncp->nc_name);
770 ncp->nc_error = EXDEV;
771 return(ncp->nc_error);
775 * The vp's of the parent directories in the chain are held via vhold()
776 * due to the existance of the child, and should not disappear.
777 * However, there are cases where they can disappear:
779 * - due to filesystem I/O errors.
780 * - due to NFS being stupid about tracking the namespace and
781 * destroys the namespace for entire directories quite often.
782 * - due to forced unmounts.
784 * When this occurs we have to track the chain backwards and resolve
785 * it, looping until the resolver catches up to the current node. We
786 * could recurse here but we might run ourselves out of kernel stack
787 * so we do it in a more painful manner. This situation really should
788 * not occur all that often, or if it does not have to go back too
789 * many nodes to resolve the ncp.
/* Walk up to the deepest ancestor whose parent has a valid vnode. */
791 while (ncp->nc_parent->nc_vp == NULL) {
792 par = ncp->nc_parent;
793 while (par->nc_parent && par->nc_parent->nc_vp == NULL)
794 par = par->nc_parent;
/* Whole chain is broken: cross-device style failure. */
795 if (par->nc_parent == NULL) {
796 printf("EXDEV case 2 %*.*s\n",
797 par->nc_nlen, par->nc_nlen, par->nc_name);
800 printf("[diagnostic] cache_resolve: had to recurse on %*.*s\n",
801 par->nc_nlen, par->nc_nlen, par->nc_name);
803 * The leaf prevents the parent from going away, but a
804 * separate ref is still required to lock it. Use cache_get()
805 * instead of cache_lock().
/* Resolve the intermediate ancestor (mount point or via resolver op). */
808 if (par->nc_flag & NCF_MOUNTPT) {
809 cache_resolve_mp(par);
812 vop_resolve(par->nc_parent->nc_vp->v_ops, par, cred);
/* NOTE(review): the error-propagation lines here are stripped. */
816 printf("EXDEV case 3 %*.*s error %d\n",
817 par->nc_nlen, par->nc_nlen, par->nc_name,
819 return(par->nc_error);
/* Finally resolve the requested ncp itself. */
822 if (ncp->nc_flag & NCF_MOUNTPT) {
823 cache_resolve_mp(ncp);
826 vop_resolve(ncp->nc_parent->nc_vp->v_ops, ncp, cred);
828 return(ncp->nc_error);
832 * Resolve the ncp associated with a mount point. Such ncp's almost always
833 * remain resolved and this routine is rarely called. NFS MPs tends to force
834 * re-resolution more often due to its mac-truck-smash-the-namecache
835 * method of tracking namespace changes.
837 * The passed ncp must be locked.
840 cache_resolve_mp(struct namecache *ncp)
843 struct mount *mp = ncp->nc_mount;
845 KKASSERT(mp != NULL);
846 if (ncp->nc_flag & NCF_UNRESOLVED) {
847 while (vfs_busy(mp, 0, NULL, curthread))
849 ncp->nc_error = VFS_ROOT(mp, &vp);
850 if (ncp->nc_error == 0) {
851 cache_setvp(ncp, vp);
854 printf("[diagnostic] cache_resolve_mp: failed to resolve mount %p\n", mp);
855 cache_setvp(ncp, NULL);
857 vfs_unbusy(mp, curthread);
859 return(ncp->nc_error);
863 * Lookup an entry in the cache.
865 * XXX OLD API ROUTINE! WHEN ALL VFSs HAVE BEEN CLEANED UP THIS PROCEDURE
868 * Lookup is called with dvp pointing to the directory to search,
869 * cnp pointing to the name of the entry being sought.
871 * If the lookup succeeds, the vnode is returned in *vpp, and a
872 * status of -1 is returned.
874 * If the lookup determines that the name does not exist (negative cacheing),
875 * a status of ENOENT is returned.
877 * If the lookup fails, a status of zero is returned.
879 * Matching UNRESOLVED entries are resolved.
881 * HACKS: we create dummy nodes for parents
884 cache_lookup(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp)
886 struct namecache *ncp;
887 struct namecache *par;
888 struct namecache *bpar;
890 globaldata_t gd = mycpu;
895 * Obtain the namecache entry associated with dvp, creating one if
896 * necessary. If we have to create one we have insufficient
897 * information to hash it or even supply the name, but we still
898 * need one so we can link it in.
900 * NOTE: in this stage of development, the passed 'par' is
901 * almost always NULL.
/* Retry loop: cache_alloc() can block, so re-check after allocating.
 * NOTE(review): the cache_alloc() call line is stripped here. */
903 while ((par = TAILQ_FIRST(&dvp->v_namecache)) == NULL) {
905 if (TAILQ_FIRST(&dvp->v_namecache) != NULL)
906 free(par, M_VFSCACHE);
908 cache_setvp(par, dvp); /* XXX par not locked */
912 * Deal with "." and "..". In this stage of code development we leave
913 * the returned ncpp NULL. Note that if the namecache is disjoint,
914 * we won't find a vnode for "..".
916 if (cnp->cn_nameptr[0] == '.') {
/* "." : the directory itself. */
917 if (cnp->cn_namelen == 1) {
920 numposhits++; /* include in total statistics */
/* ".." : only answerable if the parent linkage is intact. */
923 if (cnp->cn_namelen == 2 && cnp->cn_nameptr[1] == '.') {
925 numposhits++; /* include in total statistics */
926 if ((cnp->cn_flags & CNP_MAKEENTRY) == 0)
928 if (par->nc_parent == NULL ||
929 par->nc_parent->nc_vp == NULL) {
932 *vpp = par->nc_parent->nc_vp;
938 * Try to locate an existing entry
940 hash = fnv_32_buf(cnp->cn_nameptr, cnp->cn_namelen, FNV1_32_INIT);
942 hash = fnv_32_buf(&bpar, sizeof(bpar), hash);
944 LIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) {
948 * Zap entries that have timed out.
950 if (ncp->nc_timeout &&
951 (int)(ncp->nc_timeout - ticks) < 0
953 cache_zap(cache_hold(ncp));
958 * Break out if we find a matching entry.
960 if (ncp->nc_parent == par &&
961 ncp->nc_nlen == cnp->cn_namelen &&
962 bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen) == 0
970 * We found an entry but it is unresolved, act the same as if we
971 * failed to locate the entry. cache_enter() will do the right
974 if (ncp && (ncp->nc_flag & NCF_UNRESOLVED)) {
980 * If we failed to locate an entry, return 0 (indicates failure).
983 if ((cnp->cn_flags & CNP_MAKEENTRY) == 0) {
988 gd->gd_nchstats->ncs_miss++;
993 * If we found an entry, but we don't want to have one, we zap it.
995 if ((cnp->cn_flags & CNP_MAKEENTRY) == 0) {
997 gd->gd_nchstats->ncs_badhits++;
1003 * If the vnode is not NULL then return the positive match.
1007 gd->gd_nchstats->ncs_goodhits++;
1014 * If the vnode is NULL we found a negative match. If we want to
1015 * create it, purge the negative match and return failure (as if
1016 * we hadn't found a match in the first place).
1018 if (cnp->cn_nameiop == NAMEI_CREATE) {
1020 gd->gd_nchstats->ncs_badhits++;
1028 * We found a "negative" match, ENOENT notifies client of this match.
1029 * The nc_flag field records whether this is a whiteout. Since there
1030 * is no vnode we can use the vnode tailq link field with ncneglist.
/* LRU-refresh: move the negative entry to the tail of ncneglist. */
1032 TAILQ_REMOVE(&ncneglist, ncp, nc_vnode);
1033 TAILQ_INSERT_TAIL(&ncneglist, ncp, nc_vnode);
1034 gd->gd_nchstats->ncs_neghits++;
1035 if (ncp->nc_flag & NCF_WHITEOUT)
1036 cnp->cn_flags |= CNP_ISWHITEOUT;
1042 * Add an entry to the cache. (OLD API)
1044 * XXX OLD API ROUTINE! WHEN ALL VFSs HAVE BEEN CLEANED UP THIS PROCEDURE
1048 cache_enter(struct vnode *dvp, struct vnode *vp, struct componentname *cnp)
1050 struct namecache *par;
1051 struct namecache *ncp;
1052 struct namecache *new_ncp;
1053 struct namecache *bpar;
1054 struct nchashhead *nchpp;
1058 * If the directory has no namecache entry we must associate one with
1059 * it. The name of the entry is not known so it isn't hashed. This
1060 * is a severe hack to support the old API.
/* Retry loop: cache_alloc() may block; discard if we lost the race. */
1062 while ((par = TAILQ_FIRST(&dvp->v_namecache)) == NULL) {
1063 par = cache_alloc();
1064 if (TAILQ_FIRST(&dvp->v_namecache) != NULL)
1065 free(par, M_VFSCACHE);
1067 cache_setvp(par, dvp);
1072 * This may be a bit confusing. "." and ".." are 'virtual' entries.
1073 * We do not actually create a namecache entry representing either.
1074 * However, the ".." case is used to linkup a potentially disjoint
1075 * directory with its parent, to disconnect a directory from its
1076 * parent, or to change an existing linkage that may no longer be
1077 * correct (as might occur when a subdirectory is renamed).
/* "." : nothing to enter. */
1080 if (cnp->cn_namelen == 1 && cnp->cn_nameptr[0] == '.') {
/* ".." : relink dvp's entry under vp's entry (or disconnect). */
1084 if (cnp->cn_namelen == 2 && cnp->cn_nameptr[0] == '.' &&
1085 cnp->cn_nameptr[1] == '.'
1089 cache_unlink_parent(par);
1091 while ((ncp = TAILQ_FIRST(&vp->v_namecache)) == NULL) {
1092 ncp = cache_alloc();
1093 if (TAILQ_FIRST(&vp->v_namecache) != NULL)
1094 free(ncp, M_VFSCACHE);
1096 cache_setvp(ncp, vp);
1099 * ncp is the new parent of par
1103 cache_unlink_parent(par);
1104 cache_link_parent(par, ncp);
1112 * Locate other entries associated with this vnode and zap them,
1113 * because the purge code may not be able to find them due to
1114 * the topology not yet being consistent. This is a hack (this
1115 * whole routine is a hack, actually, so that makes this a hack
1120 TAILQ_FOREACH(ncp, &vp->v_namecache, nc_vnode) {
1121 if ((ncp->nc_flag & NCF_UNRESOLVED) == 0) {
1122 cache_zap(cache_hold(ncp));
1129 * Try to find a match in the hash table, allocate a new entry if
1130 * we can't. We have to retry the loop after any potential blocking
1133 hash = fnv_32_buf(cnp->cn_nameptr, cnp->cn_namelen, FNV1_32_INIT);
1135 hash = fnv_32_buf(&bpar, sizeof(bpar), hash);
1139 LIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) {
1143 * Break out if we find a matching entry.
1145 if (ncp->nc_parent == par &&
1146 ncp->nc_nlen == cnp->cn_namelen &&
1147 bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen) == 0
/* Miss: allocate a speculative entry and retry the hash scan. */
1154 if (new_ncp == NULL) {
1155 new_ncp = cache_alloc();
1156 new_ncp->nc_name = malloc(cnp->cn_namelen,
1157 M_VFSCACHE, M_WAITOK);
/* Install the new entry: name, hash chain, parent linkage. */
1162 ncp->nc_nlen = cnp->cn_namelen;
1163 bcopy(cnp->cn_nameptr, ncp->nc_name, cnp->cn_namelen);
1164 nchpp = NCHHASH(hash);
1165 LIST_INSERT_HEAD(nchpp, ncp, nc_hash);
1166 ncp->nc_flag |= NCF_HASHED;
1167 cache_link_parent(ncp, par);
1168 } else if (new_ncp) {
1169 free(new_ncp->nc_name, M_VFSCACHE);
1170 free(new_ncp, M_VFSCACHE);
/* (Re-)resolve the entry with the supplied vnode (NULL = negative). */
1173 cache_setunresolved(ncp);
1174 cache_setvp(ncp, vp);
/* Optional expiry: nc_timeout of 0 means "never", so avoid it. */
1179 if (cnp->cn_flags & CNP_CACHETIMEOUT) {
1180 if ((ncp->nc_timeout = ticks + cnp->cn_timeout) == 0)
1181 ncp->nc_timeout = 1;
1185 * If the target vnode is NULL if this is to be a negative cache
1189 ncp->nc_flag &= ~NCF_WHITEOUT;
1190 if (cnp->cn_flags & CNP_ISWHITEOUT)
1191 ncp->nc_flag |= NCF_WHITEOUT;
1196 * Don't cache too many negative hits
/* Reclaim the oldest negative entry when over the ncnegfactor limit. */
1198 if (numneg > MINNEG && numneg * ncnegfactor > numcache) {
1199 ncp = TAILQ_FIRST(&ncneglist);
1200 KKASSERT(ncp != NULL);
1201 cache_zap(cache_hold(ncp));
1206 * Name cache initialization, from vfsinit() when we are booting
/* NOTE(review): the nchinit() function-header lines are stripped from
 * this listing. */
1214 /* initialise per-cpu namecache effectiveness statistics. */
1215 for (i = 0; i < ncpus; ++i) {
1216 gd = globaldata_find(i);
1217 gd->gd_nchstats = &nchstats[i];
/* Negative-entry LRU list and the (name,parent) hash table. */
1220 TAILQ_INIT(&ncneglist);
1221 nchashtbl = hashinit(desiredvnodes*2, M_VFSCACHE, &nchash);
1225 * Called from start_init() to bootstrap the root filesystem. Returns
1226 * a referenced, unlocked namecache record.
1229 cache_allocroot(struct vnode *vp)
1231 struct namecache *ncp = cache_alloc();
1233 cache_setvp(ncp, vp);
1234 ncp->nc_flag |= NCF_MOUNTPT | NCF_ROOT;
1235 return(cache_hold(ncp));
1239 * vfs_cache_setroot()
1241 * Create an association between the root of our namecache and
1242 * the root vnode. This routine may be called several times during
1245 * If the caller intends to save the returned namecache pointer somewhere
1246 * it must cache_hold() it.
/* NOTE(review): the entire body of this function is stripped from this
 * listing; only the declarations below survive. */
1249 vfs_cache_setroot(struct vnode *nvp, struct namecache *ncp)
1252 struct namecache *oncp;
1266 * Invalidate all namecache entries to a particular vnode as well as
1267 * any direct children of that vnode in the namecache. This is a
1268 * 'catch all' purge used by filesystems that do not know any better.
1270 * A new vnode v_id is generated. Note that no vnode will ever have a
1273 * Note that the linkage between the vnode and its namecache entries will
1274 * be removed, but the namecache entries themselves might stay put due to
1275 * active references from elsewhere in the system or due to the existance of
1276 * the children. The namecache topology is left intact even if we do not
1277 * know what the vnode association is. Such entries will be marked
1280 * XXX: Only time and the size of v_id prevents this from failing:
1281 * XXX: In theory we should hunt down all (struct vnode*, v_id)
1282 * XXX: soft references and nuke them, at least on the global
1283 * XXX: v_id wraparound. The period of resistance can be extended
1284 * XXX: by incrementing each vnodes v_id individually instead of
1285 * XXX: using the global v_id.
1288 cache_purge(struct vnode *vp)
1290 static u_long nextid;
1291 struct namecache *ncp;
1292 struct namecache *scan;
1295 * Disassociate the vnode from its namecache entries along with
1296 * (for historical reasons) any direct children.
1298 while ((ncp = TAILQ_FIRST(&vp->v_namecache)) != NULL) {
/* Zapping a child restructures nc_list, so restart the scan. */
1301 restart: /* YYY hack, fix me */
1302 TAILQ_FOREACH(scan, &ncp->nc_list, nc_entry) {
1303 if ((scan->nc_flag & NCF_UNRESOLVED) == 0) {
1304 cache_zap(cache_hold(scan));
1312 * Calculate a new unique id for ".." handling
/* NOTE(review): the nextid increment line is stripped here; the loop
 * skips the current v_id and the reserved value 0. */
1316 } while (nextid == vp->v_id || nextid == 0);
1321 * Flush all entries referencing a particular filesystem.
1323 * Since we need to check it anyway, we will flush all the invalid
1324 * entries at the same time.
1327 cache_purgevfs(struct mount *mp)
1329 struct nchashhead *nchpp;
1330 struct namecache *ncp, *nnp;
1333 * Scan hash tables for applicable entries.
/* walk every hash bucket from the last down to the first */
1335 for (nchpp = &nchashtbl[nchash]; nchpp >= nchashtbl; nchpp--) {
1336 ncp = LIST_FIRST(nchpp);
/* fetch the next entry before ncp can be removed from the chain */
1340 nnp = LIST_NEXT(ncp, nc_hash);
/* entry belongs to the mount being purged */
1343 if (ncp->nc_vp && ncp->nc_vp->v_mount == mp)
1355 * Test whether the vnode is at a leaf in the nameicache tree.
1357 * Returns 0 if it is a leaf, -1 if it isn't.
1360 cache_leaf_test(struct vnode *vp)
1362 struct namecache *scan;
1363 struct namecache *ncp;
/*
 * Scan every namecache entry for this vnode; any child entry with a
 * resolved vnode means vp has namecache descendants (not a leaf).
 * NOTE(review): the return statements (lines 1369+) are missing from
 * this partial listing.
 */
1365 TAILQ_FOREACH(scan, &vp->v_namecache, nc_vnode) {
1366 TAILQ_FOREACH(ncp, &scan->nc_list, nc_entry) {
1367 /* YYY && ncp->nc_vp->v_type == VDIR ? */
1368 if (ncp->nc_vp != NULL)
1376 * Perform canonical checks and cache lookup and pass on to filesystem
1377 * through the vop_cachedlookup only if needed.
/* vop_lookup argument structure (some fields omitted in this listing): */
1380 * struct vnode a_dvp;
1381 * struct vnode **a_vpp;
1382 * struct componentname *a_cnp;
1386 vfs_cache_lookup(struct vop_lookup_args *ap)
1388 struct vnode *dvp, *vp;
1391 struct vnode **vpp = ap->a_vpp;
1392 struct componentname *cnp = ap->a_cnp;
1393 struct ucred *cred = cnp->cn_cred;
1394 int flags = cnp->cn_flags;
1395 struct thread *td = cnp->cn_td;
1396 u_long vpid; /* capability number of vnode */
/* remember whether the caller wants the parent directory left locked */
1400 lockparent = flags & CNP_LOCKPARENT;
/* lookups only make sense in directories */
1402 if (dvp->v_type != VDIR)
/* refuse DELETE/RENAME of the last component on a read-only mount */
1405 if ((flags & CNP_ISLASTCN) && (dvp->v_mount->mnt_flag & MNT_RDONLY) &&
1406 (cnp->cn_nameiop == NAMEI_DELETE || cnp->cn_nameiop == NAMEI_RENAME)) {
/* caller needs search (execute) permission on the directory */
1410 error = VOP_ACCESS(dvp, VEXEC, cred, td);
/* consult the name cache first */
1415 error = cache_lookup(dvp, vpp, cnp);
/* cache miss: fall through to the filesystem's real lookup */
1418 return (VOP_CACHEDLOOKUP(dvp, vpp, cnp));
/* negative cache hit */
1420 if (error == ENOENT)
1425 cnp->cn_flags &= ~CNP_PDIRUNLOCK;
1426 if (dvp == vp) { /* lookup on "." */
1429 } else if (flags & CNP_ISDOTDOT) {
/*
 * ".." case: unlock the parent before acquiring the child's lock to
 * avoid a child-before-parent lock-order reversal, then re-lock the
 * parent only if the caller asked for it on the last component.
 */
1430 VOP_UNLOCK(dvp, NULL, 0, td);
1431 cnp->cn_flags |= CNP_PDIRUNLOCK;
1432 error = vget(vp, NULL, LK_EXCLUSIVE, td);
1433 if (!error && lockparent && (flags & CNP_ISLASTCN)) {
1434 if ((error = vn_lock(dvp, NULL, LK_EXCLUSIVE, td)) == 0)
1435 cnp->cn_flags &= ~CNP_PDIRUNLOCK;
/* ordinary case: get the child, drop the parent unless it must stay locked */
1438 error = vget(vp, NULL, LK_EXCLUSIVE, td);
1439 if (!lockparent || error || !(flags & CNP_ISLASTCN)) {
1440 VOP_UNLOCK(dvp, NULL, 0, td);
1441 cnp->cn_flags |= CNP_PDIRUNLOCK;
1445 * Check that the capability number did not change
1446 * while we were waiting for the lock.
/* v_id unchanged means the cached vnode still identifies the same object */
1449 if (vpid == vp->v_id)
1452 if (lockparent && dvp != vp && (flags & CNP_ISLASTCN)) {
1453 VOP_UNLOCK(dvp, NULL, 0, td);
1454 cnp->cn_flags |= CNP_PDIRUNLOCK;
/* identity check failed: restore the parent lock state and retry slowly */
1457 if (cnp->cn_flags & CNP_PDIRUNLOCK) {
1458 error = vn_lock(dvp, NULL, LK_EXCLUSIVE, td);
1461 cnp->cn_flags &= ~CNP_PDIRUNLOCK;
1463 return (VOP_CACHEDLOOKUP(dvp, vpp, cnp));
/* debug.disablecwd: nonzero disables the fast __getcwd path */
1466 static int disablecwd;
1467 SYSCTL_INT(_debug, OID_AUTO, disablecwd, CTLFLAG_RW, &disablecwd, 0, "");
/* getcwd statistics: total calls, the four distinct failure modes, successes */
1469 static u_long numcwdcalls; STATNODE(CTLFLAG_RD, numcwdcalls, &numcwdcalls);
1470 static u_long numcwdfail1; STATNODE(CTLFLAG_RD, numcwdfail1, &numcwdfail1);
1471 static u_long numcwdfail2; STATNODE(CTLFLAG_RD, numcwdfail2, &numcwdfail2);
1472 static u_long numcwdfail3; STATNODE(CTLFLAG_RD, numcwdfail3, &numcwdfail3);
1473 static u_long numcwdfail4; STATNODE(CTLFLAG_RD, numcwdfail4, &numcwdfail4);
1474 static u_long numcwdfound; STATNODE(CTLFLAG_RD, numcwdfound, &numcwdfound);
/*
 * __getcwd() system call: resolve the current working directory into a
 * user buffer. NOTE(review): several lines of this function are missing
 * from this partial listing (declarations, error paths, free of buf).
 */
1477 __getcwd(struct __getcwd_args *uap)
1487 buflen = uap->buflen;
/* clamp the user-requested size to MAXPATHLEN */
1490 if (buflen > MAXPATHLEN)
1491 buflen = MAXPATHLEN;
/* kernel scratch buffer; kern_getcwd() builds the path in it backwards */
1493 buf = malloc(buflen, M_TEMP, M_WAITOK);
1494 bp = kern_getcwd(buf, buflen, &error);
/* copy the NUL-terminated path out to userland */
1496 error = copyout(bp, uap->buf, strlen(bp) + 1);
/*
 * Build the current directory's path into buf (filled from the end
 * toward the front) by walking the namecache from the cwd up to the
 * process root. Returns a pointer into buf; *error reports failure.
 * NOTE(review): many lines are missing from this partial listing
 * (buffer setup, the failure branches, loop/brace closures).
 */
1502 kern_getcwd(char *buf, size_t buflen, int *error)
1504 struct proc *p = curproc;
1506 int i, slash_prefixed;
1507 struct filedesc *fdp;
1508 struct namecache *ncp;
/* walk upward until we reach the process root or the system root */
1517 for (vp = fdp->fd_cdir; vp != fdp->fd_rdir && vp != rootvnode;) {
1518 if (vp->v_flag & VROOT) {
1519 if (vp->v_mount == NULL) { /* forced unmount */
/* cross a mount point by hopping to the covered vnode beneath it */
1523 vp = vp->v_mount->mnt_vnodecovered;
/* find a namecache entry for vp whose parent linkage is resolved */
1526 TAILQ_FOREACH(ncp, &vp->v_namecache, nc_vnode) {
1527 if (ncp->nc_parent && ncp->nc_parent->nc_vp &&
/* prepend the component name, copying it backwards into the buffer */
1537 for (i = ncp->nc_nlen - 1; i >= 0; i--) {
1543 *--bp = ncp->nc_name[i];
/* continue the walk from the parent directory */
1552 vp = ncp->nc_parent->nc_vp;
/* loop terminated at the root: ensure the path begins with '/' */
1554 if (!slash_prefixed) {
1568 * Thus begins the fullpath magic.
/*
 * Redefinition of STATNODE for the fullpath counters: declares a u_int
 * and exports it read-only under vfs.cache. NOTE(review): this takes
 * one argument, unlike the three-argument STATNODE used for the getcwd
 * counters earlier in the file -- presumably an #undef or different
 * original macro sits in the missing lines; confirm in the full source.
 */
1572 #define STATNODE(name) \
1573 static u_int name; \
1574 SYSCTL_UINT(_vfs_cache, OID_AUTO, name, CTLFLAG_RD, &name, 0, "")
/* debug.disablefullpath: nonzero disables vn_fullpath() resolution */
1576 static int disablefullpath;
1577 SYSCTL_INT(_debug, OID_AUTO, disablefullpath, CTLFLAG_RW,
1578 &disablefullpath, 0, "");
/* fullpath statistics: calls, the four failure modes, successes */
1580 STATNODE(numfullpathcalls);
1581 STATNODE(numfullpathfail1);
1582 STATNODE(numfullpathfail2);
1583 STATNODE(numfullpathfail3);
1584 STATNODE(numfullpathfail4);
1585 STATNODE(numfullpathfound);
1588 vn_fullpath(struct proc *p, struct vnode *vn, char **retbuf, char **freebuf)
1591 int i, slash_prefixed;
1592 struct filedesc *fdp;
1593 struct namecache *ncp;
1597 if (disablefullpath)
1603 /* vn is NULL, client wants us to use p->p_textvp */
1605 if ((vn = p->p_textvp) == NULL)
1609 buf = malloc(MAXPATHLEN, M_TEMP, M_WAITOK);
1610 bp = buf + MAXPATHLEN - 1;
1614 for (vp = vn; vp != fdp->fd_rdir && vp != rootvnode;) {
1615 if (vp->v_flag & VROOT) {
1616 if (vp->v_mount == NULL) { /* forced unmount */
1620 vp = vp->v_mount->mnt_vnodecovered;
1623 TAILQ_FOREACH(ncp, &vp->v_namecache, nc_vnode) {
1624 if (ncp->nc_parent && ncp->nc_parent->nc_vp &&
1634 for (i = ncp->nc_nlen - 1; i >= 0; i--) {
1640 *--bp = ncp->nc_name[i];
1649 vp = ncp->nc_parent->nc_vp;
1651 if (!slash_prefixed) {