gitweb.dragonflybsd.org Git - dragonfly.git/blame_incremental

... / ...

Commit	Line	Data
	1	/*
	2	* Copyright (c) 2003,2004,2009 The DragonFly Project. All rights reserved.
	3	*
	4	* This code is derived from software contributed to The DragonFly Project
	5	* by Matthew Dillon <dillon@backplane.com>
	6	*
	7	* Redistribution and use in source and binary forms, with or without
	8	* modification, are permitted provided that the following conditions
	9	* are met:
	10	*
	11	* 1. Redistributions of source code must retain the above copyright
	12	* notice, this list of conditions and the following disclaimer.
	13	* 2. Redistributions in binary form must reproduce the above copyright
	14	* notice, this list of conditions and the following disclaimer in
	15	* the documentation and/or other materials provided with the
	16	* distribution.
	17	* 3. Neither the name of The DragonFly Project nor the names of its
	18	* contributors may be used to endorse or promote products derived
	19	* from this software without specific, prior written permission.
	20	*
	21	* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
	22	* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
	23	* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
	24	* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
	25	* COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
	26	* INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
	27	* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
	28	* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
	29	* AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
	30	* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
	31	* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
	32	* SUCH DAMAGE.
	33	*
	34	* Copyright (c) 1989, 1993, 1995
	35	* The Regents of the University of California. All rights reserved.
	36	*
	37	* This code is derived from software contributed to Berkeley by
	38	* Poul-Henning Kamp of the FreeBSD Project.
	39	*
	40	* Redistribution and use in source and binary forms, with or without
	41	* modification, are permitted provided that the following conditions
	42	* are met:
	43	* 1. Redistributions of source code must retain the above copyright
	44	* notice, this list of conditions and the following disclaimer.
	45	* 2. Redistributions in binary form must reproduce the above copyright
	46	* notice, this list of conditions and the following disclaimer in the
	47	* documentation and/or other materials provided with the distribution.
	48	* 3. All advertising materials mentioning features or use of this software
	49	* must display the following acknowledgement:
	50	* This product includes software developed by the University of
	51	* California, Berkeley and its contributors.
	52	* 4. Neither the name of the University nor the names of its contributors
	53	* may be used to endorse or promote products derived from this software
	54	* without specific prior written permission.
	55	*
	56	* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
	57	* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
	58	* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
	59	* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
	60	* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
	61	* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
	62	* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
	63	* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
	64	* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
	65	* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
	66	* SUCH DAMAGE.
	67	*/
	68
	69	#include <sys/param.h>
	70	#include <sys/systm.h>
	71	#include <sys/kernel.h>
	72	#include <sys/sysctl.h>
	73	#include <sys/mount.h>
	74	#include <sys/vnode.h>
	75	#include <sys/malloc.h>
	76	#include <sys/sysproto.h>
	77	#include <sys/spinlock.h>
	78	#include <sys/proc.h>
	79	#include <sys/namei.h>
	80	#include <sys/nlookup.h>
	81	#include <sys/filedesc.h>
	82	#include <sys/fnv_hash.h>
	83	#include <sys/globaldata.h>
	84	#include <sys/kern_syscall.h>
	85	#include <sys/dirent.h>
	86	#include <ddb/ddb.h>
	87
	88	#include <sys/sysref2.h>
	89	#include <sys/spinlock2.h>
	90	#include <sys/mplock2.h>
	91
	92	#define MAX_RECURSION_DEPTH 64
	93
	94	/*
	95	* Random lookups in the cache are accomplished with a hash table using
	96	* a hash key of (nc_src_vp, name). Each hash chain has its own spin lock.
	97	*
	98	* Negative entries may exist and correspond to resolved namecache
	99	* structures where nc_vp is NULL. In a negative entry, NCF_WHITEOUT
	100	* will be set if the entry corresponds to a whited-out directory entry
	101	* (versus simply not finding the entry at all). ncneglist is locked
	102	* with a global spinlock (ncspin).
	103	*
	104	* MPSAFE RULES:
	105	*
	106	* (1) A ncp must be referenced before it can be locked.
	107	*
	108	* (2) A ncp must be locked in order to modify it.
	109	*
	110	* (3) ncp locks are always ordered child -> parent. That may seem
	111	* backwards but forward scans use the hash table and thus can hold
	112	* the parent unlocked when traversing downward.
	113	*
	114	* This allows insert/rename/delete/dot-dot and other operations
	115	* to use ncp->nc_parent links.
	116	*
	117	* This also prevents a locked up e.g. NFS node from creating a
	118	* chain reaction all the way back to the root vnode / namecache.
	119	*
	120	* (4) parent linkages require both the parent and child to be locked.
	121	*/
	122
	123	/*
	124	* Structures associated with name cacheing.
	125	*/
	126	#define NCHHASH(hash) (&nchashtbl[(hash) & nchash])
	127	#define MINNEG 1024
	128	#define MINPOS 1024
	129
	130	MALLOC_DEFINE(M_VFSCACHE, "vfscache", "VFS name cache entries");
	131
	132	LIST_HEAD(nchash_list, namecache);
	133
	134	struct nchash_head {
	135	struct nchash_list list;
	136	struct spinlock spin;
	137	};
	138
	139	static struct nchash_head *nchashtbl;
	140	static struct namecache_list ncneglist;
	141	static struct spinlock ncspin;
	142
	143	/*
	144	* ncvp_debug - debug cache_fromvp(). This is used by the NFS server
	145	* to create the namecache infrastructure leading to a dangling vnode.
	146	*
	147	* 0 Only errors are reported
	148	* 1 Successes are reported
	149	* 2 Successes + the whole directory scan is reported
	150	* 3 Force the directory scan code run as if the parent vnode did not
	151	* have a namecache record, even if it does have one.
	152	*/
	153	static int ncvp_debug;
	154	SYSCTL_INT(_debug, OID_AUTO, ncvp_debug, CTLFLAG_RW, &ncvp_debug, 0,
	155	"Namecache debug level (0-3)");
	156
	157	static u_long nchash; /* size of hash table */
	158	SYSCTL_ULONG(_debug, OID_AUTO, nchash, CTLFLAG_RD, &nchash, 0,
	159	"Size of namecache hash table");
	160
	161	static int ncnegfactor = 16; /* ratio of negative entries */
	162	SYSCTL_INT(_debug, OID_AUTO, ncnegfactor, CTLFLAG_RW, &ncnegfactor, 0,
	163	"Ratio of namecache negative entries");
	164
	165	static int nclockwarn; /* warn on locked entries in ticks */
	166	SYSCTL_INT(_debug, OID_AUTO, nclockwarn, CTLFLAG_RW, &nclockwarn, 0,
	167	"Warn on locked namecache entries in ticks");
	168
	169	static int numdefered; /* number of cache entries allocated */
	170	SYSCTL_INT(_debug, OID_AUTO, numdefered, CTLFLAG_RD, &numdefered, 0,
	171	"Number of cache entries allocated");
	172
	173	static int ncposlimit; /* number of cache entries allocated */
	174	SYSCTL_INT(_debug, OID_AUTO, ncposlimit, CTLFLAG_RW, &ncposlimit, 0,
	175	"Number of cache entries allocated");
	176
	177	SYSCTL_INT(_debug, OID_AUTO, vnsize, CTLFLAG_RD, 0, sizeof(struct vnode),
	178	"sizeof(struct vnode)");
	179	SYSCTL_INT(_debug, OID_AUTO, ncsize, CTLFLAG_RD, 0, sizeof(struct namecache),
	180	"sizeof(struct namecache)");
	181
	182	static int cache_resolve_mp(struct mount *mp);
	183	static struct vnode cache_dvpref(struct namecache ncp);
	184	static void _cache_lock(struct namecache *ncp);
	185	static void _cache_setunresolved(struct namecache *ncp);
	186	static void _cache_cleanneg(int count);
	187	static void _cache_cleanpos(int count);
	188	static void _cache_cleandefered(void);
	189
	190	/*
	191	* The new name cache statistics
	192	*/
	193	SYSCTL_NODE(_vfs, OID_AUTO, cache, CTLFLAG_RW, 0, "Name cache statistics");
	194	static int numneg;
	195	SYSCTL_ULONG(_vfs_cache, OID_AUTO, numneg, CTLFLAG_RD, &numneg, 0,
	196	"Number of negative namecache entries");
	197	static int numcache;
	198	SYSCTL_ULONG(_vfs_cache, OID_AUTO, numcache, CTLFLAG_RD, &numcache, 0,
	199	"Number of namecaches entries");
	200	static u_long numcalls;
	201	SYSCTL_ULONG(_vfs_cache, OID_AUTO, numcalls, CTLFLAG_RD, &numcalls, 0,
	202	"Number of namecache lookups");
	203	static u_long numchecks;
	204	SYSCTL_ULONG(_vfs_cache, OID_AUTO, numchecks, CTLFLAG_RD, &numchecks, 0,
	205	"Number of checked entries in namecache lookups");
	206
	207	struct nchstats nchstats[SMP_MAXCPU];
	208	/*
	209	* Export VFS cache effectiveness statistics to user-land.
	210	*
	211	* The statistics are left for aggregation to user-land so
	212	* neat things can be achieved, like observing per-CPU cache
	213	* distribution.
	214	*/
	215	static int
	216	sysctl_nchstats(SYSCTL_HANDLER_ARGS)
	217	{
	218	struct globaldata *gd;
	219	int i, error;
	220
	221	error = 0;
	222	for (i = 0; i < ncpus; ++i) {
	223	gd = globaldata_find(i);
	224	if ((error = SYSCTL_OUT(req, (void )&(gd->gd_nchstats),
	225	sizeof(struct nchstats))))
	226	break;
	227	}
	228
	229	return (error);
	230	}
	231	SYSCTL_PROC(_vfs_cache, OID_AUTO, nchstats, CTLTYPE_OPAQUE\|CTLFLAG_RD,
	232	0, 0, sysctl_nchstats, "S,nchstats", "VFS cache effectiveness statistics");
	233
	234	static struct namecache cache_zap(struct namecache ncp, int nonblock);
	235
	236	/*
	237	* Namespace locking. The caller must already hold a reference to the
	238	* namecache structure in order to lock/unlock it. This function prevents
	239	* the namespace from being created or destroyed by accessors other then
	240	* the lock holder.
	241	*
	242	* Note that holding a locked namecache structure prevents other threads
	243	* from making namespace changes (e.g. deleting or creating), prevents
	244	* vnode association state changes by other threads, and prevents the
	245	* namecache entry from being resolved or unresolved by other threads.
	246	*
	247	* The lock owner has full authority to associate/disassociate vnodes
	248	* and resolve/unresolve the locked ncp.
	249	*
	250	* The primary lock field is nc_exlocks. nc_locktd is set after the
	251	* fact (when locking) or cleared prior to unlocking.
	252	*
	253	* WARNING! Holding a locked ncp will prevent a vnode from being destroyed
	254	* or recycled, but it does NOT help you if the vnode had already
	255	* initiated a recyclement. If this is important, use cache_get()
	256	* rather then cache_lock() (and deal with the differences in the
	257	* way the refs counter is handled). Or, alternatively, make an
	258	* unconditional call to cache_validate() or cache_resolve()
	259	* after cache_lock() returns.
	260	*
	261	* MPSAFE
	262	*/
	263	static
	264	void
	265	_cache_lock(struct namecache *ncp)
	266	{
	267	thread_t td;
	268	int didwarn;
	269	int error;
	270	u_int count;
	271
	272	KKASSERT(ncp->nc_refs != 0);
	273	didwarn = 0;
	274	td = curthread;
	275
	276	for (;;) {
	277	count = ncp->nc_exlocks;
	278
	279	if (count == 0) {
	280	if (atomic_cmpset_int(&ncp->nc_exlocks, 0, 1)) {
	281	/*
	282	* The vp associated with a locked ncp must
	283	* be held to prevent it from being recycled.
	284	*
	285	* WARNING! If VRECLAIMED is set the vnode
	286	* could already be in the middle of a recycle.
	287	* Callers must use cache_vref() or
	288	* cache_vget() on the locked ncp to
	289	* validate the vp or set the cache entry
	290	* to unresolved.
	291	*
	292	* NOTE! vhold() is allowed if we hold a
	293	* lock on the ncp (which we do).
	294	*/
	295	ncp->nc_locktd = td;
	296	if (ncp->nc_vp)
	297	vhold(ncp->nc_vp); /* MPSAFE */
	298	break;
	299	}
	300	/* cmpset failed */
	301	continue;
	302	}
	303	if (ncp->nc_locktd == td) {
	304	if (atomic_cmpset_int(&ncp->nc_exlocks, count,
	305	count + 1)) {
	306	break;
	307	}
	308	/* cmpset failed */
	309	continue;
	310	}
	311	tsleep_interlock(ncp, 0);
	312	if (atomic_cmpset_int(&ncp->nc_exlocks, count,
	313	count \| NC_EXLOCK_REQ) == 0) {
	314	/* cmpset failed */
	315	continue;
	316	}
	317	error = tsleep(ncp, PINTERLOCKED, "clock", nclockwarn);
	318	if (error == EWOULDBLOCK) {
	319	if (didwarn == 0) {
	320	didwarn = ticks;
	321	kprintf("[diagnostic] cache_lock: blocked "
	322	"on %p",
	323	ncp);
	324	kprintf(" \"%.s\"\n",
	325	ncp->nc_nlen, ncp->nc_nlen,
	326	ncp->nc_name);
	327	}
	328	}
	329	}
	330	if (didwarn) {
	331	kprintf("[diagnostic] cache_lock: unblocked %.s after "
	332	"%d secs\n",
	333	ncp->nc_nlen, ncp->nc_nlen, ncp->nc_name,
	334	(int)(ticks - didwarn) / hz);
	335	}
	336	}
	337
	338	/*
	339	* NOTE: nc_refs may be zero if the ncp is interlocked by circumstance,
	340	* such as the case where one of its children is locked.
	341	*
	342	* MPSAFE
	343	*/
	344	static
	345	int
	346	_cache_lock_nonblock(struct namecache *ncp)
	347	{
	348	thread_t td;
	349	u_int count;
	350
	351	td = curthread;
	352
	353	for (;;) {
	354	count = ncp->nc_exlocks;
	355
	356	if (count == 0) {
	357	if (atomic_cmpset_int(&ncp->nc_exlocks, 0, 1)) {
	358	/*
	359	* The vp associated with a locked ncp must
	360	* be held to prevent it from being recycled.
	361	*
	362	* WARNING! If VRECLAIMED is set the vnode
	363	* could already be in the middle of a recycle.
	364	* Callers must use cache_vref() or
	365	* cache_vget() on the locked ncp to
	366	* validate the vp or set the cache entry
	367	* to unresolved.
	368	*
	369	* NOTE! vhold() is allowed if we hold a
	370	* lock on the ncp (which we do).
	371	*/
	372	ncp->nc_locktd = td;
	373	if (ncp->nc_vp)
	374	vhold(ncp->nc_vp); /* MPSAFE */
	375	break;
	376	}
	377	/* cmpset failed */
	378	continue;
	379	}
	380	if (ncp->nc_locktd == td) {
	381	if (atomic_cmpset_int(&ncp->nc_exlocks, count,
	382	count + 1)) {
	383	break;
	384	}
	385	/* cmpset failed */
	386	continue;
	387	}
	388	return(EWOULDBLOCK);
	389	}
	390	return(0);
	391	}
	392
	393	/*
	394	* Helper function
	395	*
	396	* NOTE: nc_refs can be 0 (degenerate case during _cache_drop).
	397	*
	398	* nc_locktd must be NULLed out prior to nc_exlocks getting cleared.
	399	*
	400	* MPSAFE
	401	*/
	402	static
	403	void
	404	_cache_unlock(struct namecache *ncp)
	405	{
	406	thread_t td __debugvar = curthread;
	407	u_int count;
	408
	409	KKASSERT(ncp->nc_refs >= 0);
	410	KKASSERT(ncp->nc_exlocks > 0);
	411	KKASSERT(ncp->nc_locktd == td);
	412
	413	count = ncp->nc_exlocks;
	414	if ((count & ~NC_EXLOCK_REQ) == 1) {
	415	ncp->nc_locktd = NULL;
	416	if (ncp->nc_vp)
	417	vdrop(ncp->nc_vp);
	418	}
	419	for (;;) {
	420	if ((count & ~NC_EXLOCK_REQ) == 1) {
	421	if (atomic_cmpset_int(&ncp->nc_exlocks, count, 0)) {
	422	if (count & NC_EXLOCK_REQ)
	423	wakeup(ncp);
	424	break;
	425	}
	426	} else {
	427	if (atomic_cmpset_int(&ncp->nc_exlocks, count,
	428	count - 1)) {
	429	break;
	430	}
	431	}
	432	count = ncp->nc_exlocks;
	433	}
	434	}
	435
	436
	437	/*
	438	* cache_hold() and cache_drop() prevent the premature deletion of a
	439	* namecache entry but do not prevent operations (such as zapping) on
	440	* that namecache entry.
	441	*
	442	* This routine may only be called from outside this source module if
	443	* nc_refs is already at least 1.
	444	*
	445	* This is a rare case where callers are allowed to hold a spinlock,
	446	* so we can't ourselves.
	447	*
	448	* MPSAFE
	449	*/
	450	static __inline
	451	struct namecache *
	452	_cache_hold(struct namecache *ncp)
	453	{
	454	atomic_add_int(&ncp->nc_refs, 1);
	455	return(ncp);
	456	}
	457
	458	/*
	459	* Drop a cache entry, taking care to deal with races.
	460	*
	461	* For potential 1->0 transitions we must hold the ncp lock to safely
	462	* test its flags. An unresolved entry with no children must be zapped
	463	* to avoid leaks.
	464	*
	465	* The call to cache_zap() itself will handle all remaining races and
	466	* will decrement the ncp's refs regardless. If we are resolved or
	467	* have children nc_refs can safely be dropped to 0 without having to
	468	* zap the entry.
	469	*
	470	* NOTE: cache_zap() will re-check nc_refs and nc_list in a MPSAFE fashion.
	471	*
	472	* NOTE: cache_zap() may return a non-NULL referenced parent which must
	473	* be dropped in a loop.
	474	*
	475	* MPSAFE
	476	*/
	477	static __inline
	478	void
	479	_cache_drop(struct namecache *ncp)
	480	{
	481	int refs;
	482
	483	while (ncp) {
	484	KKASSERT(ncp->nc_refs > 0);
	485	refs = ncp->nc_refs;
	486
	487	if (refs == 1) {
	488	if (_cache_lock_nonblock(ncp) == 0) {
	489	ncp->nc_flag &= ~NCF_DEFEREDZAP;
	490	if ((ncp->nc_flag & NCF_UNRESOLVED) &&
	491	TAILQ_EMPTY(&ncp->nc_list)) {
	492	ncp = cache_zap(ncp, 1);
	493	continue;
	494	}
	495	if (atomic_cmpset_int(&ncp->nc_refs, 1, 0)) {
	496	_cache_unlock(ncp);
	497	break;
	498	}
	499	_cache_unlock(ncp);
	500	}
	501	} else {
	502	if (atomic_cmpset_int(&ncp->nc_refs, refs, refs - 1))
	503	break;
	504	}
	505	cpu_pause();
	506	}
	507	}
	508
	509	/*
	510	* Link a new namecache entry to its parent and to the hash table. Be
	511	* careful to avoid races if vhold() blocks in the future.
	512	*
	513	* Both ncp and par must be referenced and locked.
	514	*
	515	* NOTE: The hash table spinlock is likely held during this call, we
	516	* can't do anything fancy.
	517	*
	518	* MPSAFE
	519	*/
	520	static void
	521	_cache_link_parent(struct namecache ncp, struct namecache par,
	522	struct nchash_head *nchpp)
	523	{
	524	KKASSERT(ncp->nc_parent == NULL);
	525	ncp->nc_parent = par;
	526	ncp->nc_head = nchpp;
	527
	528	/*
	529	* Set inheritance flags. Note that the parent flags may be
	530	* stale due to getattr potentially not having been run yet
	531	* (it gets run during nlookup()'s).
	532	*/
	533	ncp->nc_flag &= ~(NCF_SF_PNOCACHE \| NCF_UF_PCACHE);
	534	if (par->nc_flag & (NCF_SF_NOCACHE \| NCF_SF_PNOCACHE))
	535	ncp->nc_flag \|= NCF_SF_PNOCACHE;
	536	if (par->nc_flag & (NCF_UF_CACHE \| NCF_UF_PCACHE))
	537	ncp->nc_flag \|= NCF_UF_PCACHE;
	538
	539	LIST_INSERT_HEAD(&nchpp->list, ncp, nc_hash);
	540
	541	if (TAILQ_EMPTY(&par->nc_list)) {
	542	TAILQ_INSERT_HEAD(&par->nc_list, ncp, nc_entry);
	543	/*
	544	* Any vp associated with an ncp which has children must
	545	* be held to prevent it from being recycled.
	546	*/
	547	if (par->nc_vp)
	548	vhold(par->nc_vp);
	549	} else {
	550	TAILQ_INSERT_HEAD(&par->nc_list, ncp, nc_entry);
	551	}
	552	}
	553
	554	/*
	555	* Remove the parent and hash associations from a namecache structure.
	556	* If this is the last child of the parent the cache_drop(par) will
	557	* attempt to recursively zap the parent.
	558	*
	559	* ncp must be locked. This routine will acquire a temporary lock on
	560	* the parent as wlel as the appropriate hash chain.
	561	*
	562	* MPSAFE
	563	*/
	564	static void
	565	_cache_unlink_parent(struct namecache *ncp)
	566	{
	567	struct namecache *par;
	568	struct vnode *dropvp;
	569
	570	if ((par = ncp->nc_parent) != NULL) {
	571	KKASSERT(ncp->nc_parent == par);
	572	_cache_hold(par);
	573	_cache_lock(par);
	574	spin_lock(&ncp->nc_head->spin);
	575	LIST_REMOVE(ncp, nc_hash);
	576	TAILQ_REMOVE(&par->nc_list, ncp, nc_entry);
	577	dropvp = NULL;
	578	if (par->nc_vp && TAILQ_EMPTY(&par->nc_list))
	579	dropvp = par->nc_vp;
	580	spin_unlock(&ncp->nc_head->spin);
	581	ncp->nc_parent = NULL;
	582	ncp->nc_head = NULL;
	583	_cache_unlock(par);
	584	_cache_drop(par);
	585
	586	/*
	587	* We can only safely vdrop with no spinlocks held.
	588	*/
	589	if (dropvp)
	590	vdrop(dropvp);
	591	}
	592	}
	593
	594	/*
	595	* Allocate a new namecache structure. Most of the code does not require
	596	* zero-termination of the string but it makes vop_compat_ncreate() easier.
	597	*
	598	* MPSAFE
	599	*/
	600	static struct namecache *
	601	cache_alloc(int nlen)
	602	{
	603	struct namecache *ncp;
	604
	605	ncp = kmalloc(sizeof(*ncp), M_VFSCACHE, M_WAITOK\|M_ZERO);
	606	if (nlen)
	607	ncp->nc_name = kmalloc(nlen + 1, M_VFSCACHE, M_WAITOK);
	608	ncp->nc_nlen = nlen;
	609	ncp->nc_flag = NCF_UNRESOLVED;
	610	ncp->nc_error = ENOTCONN; /* needs to be resolved */
	611	ncp->nc_refs = 1;
	612
	613	TAILQ_INIT(&ncp->nc_list);
	614	_cache_lock(ncp);
	615	return(ncp);
	616	}
	617
	618	/*
	619	* Can only be called for the case where the ncp has never been
	620	* associated with anything (so no spinlocks are needed).
	621	*
	622	* MPSAFE
	623	*/
	624	static void
	625	_cache_free(struct namecache *ncp)
	626	{
	627	KKASSERT(ncp->nc_refs == 1 && ncp->nc_exlocks == 1);
	628	if (ncp->nc_name)
	629	kfree(ncp->nc_name, M_VFSCACHE);
	630	kfree(ncp, M_VFSCACHE);
	631	}
	632
	633	/*
	634	* MPSAFE
	635	*/
	636	void
	637	cache_zero(struct nchandle *nch)
	638	{
	639	nch->ncp = NULL;
	640	nch->mount = NULL;
	641	}
	642
	643	/*
	644	* Ref and deref a namecache structure.
	645	*
	646	* The caller must specify a stable ncp pointer, typically meaning the
	647	* ncp is already referenced but this can also occur indirectly through
	648	* e.g. holding a lock on a direct child.
	649	*
	650	* WARNING: Caller may hold an unrelated read spinlock, which means we can't
	651	* use read spinlocks here.
	652	*
	653	* MPSAFE if nch is
	654	*/
	655	struct nchandle *
	656	cache_hold(struct nchandle *nch)
	657	{
	658	_cache_hold(nch->ncp);
	659	atomic_add_int(&nch->mount->mnt_refs, 1);
	660	return(nch);
	661	}
	662
	663	/*
	664	* Create a copy of a namecache handle for an already-referenced
	665	* entry.
	666	*
	667	* MPSAFE if nch is
	668	*/
	669	void
	670	cache_copy(struct nchandle nch, struct nchandle target)
	671	{
	672	target = nch;
	673	if (target->ncp)
	674	_cache_hold(target->ncp);
	675	atomic_add_int(&nch->mount->mnt_refs, 1);
	676	}
	677
	678	/*
	679	* MPSAFE if nch is
	680	*/
	681	void
	682	cache_changemount(struct nchandle nch, struct mount mp)
	683	{
	684	atomic_add_int(&nch->mount->mnt_refs, -1);
	685	nch->mount = mp;
	686	atomic_add_int(&nch->mount->mnt_refs, 1);
	687	}
	688
	689	/*
	690	* MPSAFE
	691	*/
	692	void
	693	cache_drop(struct nchandle *nch)
	694	{
	695	atomic_add_int(&nch->mount->mnt_refs, -1);
	696	_cache_drop(nch->ncp);
	697	nch->ncp = NULL;
	698	nch->mount = NULL;
	699	}
	700
	701	/*
	702	* MPSAFE
	703	*/
	704	void
	705	cache_lock(struct nchandle *nch)
	706	{
	707	_cache_lock(nch->ncp);
	708	}
	709
	710	/*
	711	* Relock nch1 given an unlocked nch1 and a locked nch2. The caller
	712	* is responsible for checking both for validity on return as they
	713	* may have become invalid.
	714	*
	715	* We have to deal with potential deadlocks here, just ping pong
	716	* the lock until we get it (we will always block somewhere when
	717	* looping so this is not cpu-intensive).
	718	*
	719	* which = 0 nch1 not locked, nch2 is locked
	720	* which = 1 nch1 is locked, nch2 is not locked
	721	*/
	722	void
	723	cache_relock(struct nchandle nch1, struct ucred cred1,
	724	struct nchandle nch2, struct ucred cred2)
	725	{
	726	int which;
	727
	728	which = 0;
	729
	730	for (;;) {
	731	if (which == 0) {
	732	if (cache_lock_nonblock(nch1) == 0) {
	733	cache_resolve(nch1, cred1);
	734	break;
	735	}
	736	cache_unlock(nch2);
	737	cache_lock(nch1);
	738	cache_resolve(nch1, cred1);
	739	which = 1;
	740	} else {
	741	if (cache_lock_nonblock(nch2) == 0) {
	742	cache_resolve(nch2, cred2);
	743	break;
	744	}
	745	cache_unlock(nch1);
	746	cache_lock(nch2);
	747	cache_resolve(nch2, cred2);
	748	which = 0;
	749	}
	750	}
	751	}
	752
	753	/*
	754	* MPSAFE
	755	*/
	756	int
	757	cache_lock_nonblock(struct nchandle *nch)
	758	{
	759	return(_cache_lock_nonblock(nch->ncp));
	760	}
	761
	762
	763	/*
	764	* MPSAFE
	765	*/
	766	void
	767	cache_unlock(struct nchandle *nch)
	768	{
	769	_cache_unlock(nch->ncp);
	770	}
	771
	772	/*
	773	* ref-and-lock, unlock-and-deref functions.
	774	*
	775	* This function is primarily used by nlookup. Even though cache_lock
	776	* holds the vnode, it is possible that the vnode may have already
	777	* initiated a recyclement.
	778	*
	779	* We want cache_get() to return a definitively usable vnode or a
	780	* definitively unresolved ncp.
	781	*
	782	* MPSAFE
	783	*/
	784	static
	785	struct namecache *
	786	_cache_get(struct namecache *ncp)
	787	{
	788	_cache_hold(ncp);
	789	_cache_lock(ncp);
	790	if (ncp->nc_vp && (ncp->nc_vp->v_flag & VRECLAIMED))
	791	_cache_setunresolved(ncp);
	792	return(ncp);
	793	}
	794
	795	/*
	796	* This is a special form of _cache_lock() which only succeeds if
	797	* it can get a pristine, non-recursive lock. The caller must have
	798	* already ref'd the ncp.
	799	*
	800	* On success the ncp will be locked, on failure it will not. The
	801	* ref count does not change either way.
	802	*
	803	* We want _cache_lock_special() (on success) to return a definitively
	804	* usable vnode or a definitively unresolved ncp.
	805	*
	806	* MPSAFE
	807	*/
	808	static int
	809	_cache_lock_special(struct namecache *ncp)
	810	{
	811	if (_cache_lock_nonblock(ncp) == 0) {
	812	if ((ncp->nc_exlocks & ~NC_EXLOCK_REQ) == 1) {
	813	if (ncp->nc_vp && (ncp->nc_vp->v_flag & VRECLAIMED))
	814	_cache_setunresolved(ncp);
	815	return(0);
	816	}
	817	_cache_unlock(ncp);
	818	}
	819	return(EWOULDBLOCK);
	820	}
	821
	822
	823	/*
	824	* NOTE: The same nchandle can be passed for both arguments.
	825	*
	826	* MPSAFE
	827	*/
	828	void
	829	cache_get(struct nchandle nch, struct nchandle target)
	830	{
	831	KKASSERT(nch->ncp->nc_refs > 0);
	832	target->mount = nch->mount;
	833	target->ncp = _cache_get(nch->ncp);
	834	atomic_add_int(&target->mount->mnt_refs, 1);
	835	}
	836
	837	/*
	838	* MPSAFE
	839	*/
	840	static __inline
	841	void
	842	_cache_put(struct namecache *ncp)
	843	{
	844	_cache_unlock(ncp);
	845	_cache_drop(ncp);
	846	}
	847
	848	/*
	849	* MPSAFE
	850	*/
	851	void
	852	cache_put(struct nchandle *nch)
	853	{
	854	atomic_add_int(&nch->mount->mnt_refs, -1);
	855	_cache_put(nch->ncp);
	856	nch->ncp = NULL;
	857	nch->mount = NULL;
	858	}
	859
	860	/*
	861	* Resolve an unresolved ncp by associating a vnode with it. If the
	862	* vnode is NULL, a negative cache entry is created.
	863	*
	864	* The ncp should be locked on entry and will remain locked on return.
	865	*
	866	* MPSAFE
	867	*/
	868	static
	869	void
	870	_cache_setvp(struct mount mp, struct namecache ncp, struct vnode *vp)
	871	{
	872	KKASSERT(ncp->nc_flag & NCF_UNRESOLVED);
	873
	874	if (vp != NULL) {
	875	/*
	876	* Any vp associated with an ncp which has children must
	877	* be held. Any vp associated with a locked ncp must be held.
	878	*/
	879	if (!TAILQ_EMPTY(&ncp->nc_list))
	880	vhold(vp);
	881	spin_lock(&vp->v_spin);
	882	ncp->nc_vp = vp;
	883	TAILQ_INSERT_HEAD(&vp->v_namecache, ncp, nc_vnode);
	884	spin_unlock(&vp->v_spin);
	885	if (ncp->nc_exlocks)
	886	vhold(vp);
	887
	888	/*
	889	* Set auxiliary flags
	890	*/
	891	switch(vp->v_type) {
	892	case VDIR:
	893	ncp->nc_flag \|= NCF_ISDIR;
	894	break;
	895	case VLNK:
	896	ncp->nc_flag \|= NCF_ISSYMLINK;
	897	/* XXX cache the contents of the symlink */
	898	break;
	899	default:
	900	break;
	901	}
	902	atomic_add_int(&numcache, 1);
	903	ncp->nc_error = 0;
	904	/* XXX: this is a hack to work-around the lack of a real pfs vfs
	905	* implementation*/
	906	if (mp != NULL)
	907	vp->v_pfsmp = mp;
	908	} else {
	909	/*
	910	* When creating a negative cache hit we set the
	911	* namecache_gen. A later resolve will clean out the
	912	* negative cache hit if the mount point's namecache_gen
	913	* has changed. Used by devfs, could also be used by
	914	* other remote FSs.
	915	*/
	916	ncp->nc_vp = NULL;
	917	spin_lock(&ncspin);
	918	TAILQ_INSERT_TAIL(&ncneglist, ncp, nc_vnode);
	919	++numneg;
	920	spin_unlock(&ncspin);
	921	ncp->nc_error = ENOENT;
	922	if (mp)
	923	ncp->nc_namecache_gen = mp->mnt_namecache_gen;
	924	}
	925	ncp->nc_flag &= ~(NCF_UNRESOLVED \| NCF_DEFEREDZAP);
	926	}
	927
	928	/*
	929	* MPSAFE
	930	*/
	931	void
	932	cache_setvp(struct nchandle nch, struct vnode vp)
	933	{
	934	_cache_setvp(nch->mount, nch->ncp, vp);
	935	}
	936
	937	/*
	938	* MPSAFE
	939	*/
	940	void
	941	cache_settimeout(struct nchandle *nch, int nticks)
	942	{
	943	struct namecache *ncp = nch->ncp;
	944
	945	if ((ncp->nc_timeout = ticks + nticks) == 0)
	946	ncp->nc_timeout = 1;
	947	}
	948
	949	/*
	950	* Disassociate the vnode or negative-cache association and mark a
	951	* namecache entry as unresolved again. Note that the ncp is still
	952	* left in the hash table and still linked to its parent.
	953	*
	954	* The ncp should be locked and refd on entry and will remain locked and refd
	955	* on return.
	956	*
	957	* This routine is normally never called on a directory containing children.
	958	* However, NFS often does just that in its rename() code as a cop-out to
	959	* avoid complex namespace operations. This disconnects a directory vnode
	960	* from its namecache and can cause the OLDAPI and NEWAPI to get out of
	961	* sync.
	962	*
	963	* MPSAFE
	964	*/
	965	static
	966	void
	967	_cache_setunresolved(struct namecache *ncp)
	968	{
	969	struct vnode *vp;
	970
	971	if ((ncp->nc_flag & NCF_UNRESOLVED) == 0) {
	972	ncp->nc_flag \|= NCF_UNRESOLVED;
	973	ncp->nc_timeout = 0;
	974	ncp->nc_error = ENOTCONN;
	975	if ((vp = ncp->nc_vp) != NULL) {
	976	atomic_add_int(&numcache, -1);
	977	spin_lock(&vp->v_spin);
	978	ncp->nc_vp = NULL;
	979	TAILQ_REMOVE(&vp->v_namecache, ncp, nc_vnode);
	980	spin_unlock(&vp->v_spin);
	981
	982	/*
	983	* Any vp associated with an ncp with children is
	984	* held by that ncp. Any vp associated with a locked
	985	* ncp is held by that ncp. These conditions must be
	986	* undone when the vp is cleared out from the ncp.
	987	*/
	988	if (!TAILQ_EMPTY(&ncp->nc_list))
	989	vdrop(vp);
	990	if (ncp->nc_exlocks)
	991	vdrop(vp);
	992	} else {
	993	spin_lock(&ncspin);
	994	TAILQ_REMOVE(&ncneglist, ncp, nc_vnode);
	995	--numneg;
	996	spin_unlock(&ncspin);
	997	}
	998	ncp->nc_flag &= ~(NCF_WHITEOUT\|NCF_ISDIR\|NCF_ISSYMLINK);
	999	}
	1000	}
	1001
	1002	/*
	1003	* The cache_nresolve() code calls this function to automatically
	1004	* set a resolved cache element to unresolved if it has timed out
	1005	* or if it is a negative cache hit and the mount point namecache_gen
	1006	* has changed.
	1007	*
	1008	* MPSAFE
	1009	*/
	1010	static __inline void
	1011	_cache_auto_unresolve(struct mount mp, struct namecache ncp)
	1012	{
	1013	/*
	1014	* Already in an unresolved state, nothing to do.
	1015	*/
	1016	if (ncp->nc_flag & NCF_UNRESOLVED)
	1017	return;
	1018
	1019	/*
	1020	* Try to zap entries that have timed out. We have
	1021	* to be careful here because locked leafs may depend
	1022	* on the vnode remaining intact in a parent, so only
	1023	* do this under very specific conditions.
	1024	*/
	1025	if (ncp->nc_timeout && (int)(ncp->nc_timeout - ticks) < 0 &&
	1026	TAILQ_EMPTY(&ncp->nc_list)) {
	1027	_cache_setunresolved(ncp);
	1028	return;
	1029	}
	1030
	1031	/*
	1032	* If a resolved negative cache hit is invalid due to
	1033	* the mount's namecache generation being bumped, zap it.
	1034	*/
	1035	if (ncp->nc_vp == NULL &&
	1036	ncp->nc_namecache_gen != mp->mnt_namecache_gen) {
	1037	_cache_setunresolved(ncp);
	1038	return;
	1039	}
	1040	}
	1041
	1042	/*
	1043	* MPSAFE
	1044	*/
	1045	void
	1046	cache_setunresolved(struct nchandle *nch)
	1047	{
	1048	_cache_setunresolved(nch->ncp);
	1049	}
	1050
	1051	/*
	1052	* Determine if we can clear NCF_ISMOUNTPT by scanning the mountlist
	1053	* looking for matches. This flag tells the lookup code when it must
	1054	* check for a mount linkage and also prevents the directories in question
	1055	* from being deleted or renamed.
	1056	*
	1057	* MPSAFE
	1058	*/
	1059	static
	1060	int
	1061	cache_clrmountpt_callback(struct mount mp, void data)
	1062	{
	1063	struct nchandle *nch = data;
	1064
	1065	if (mp->mnt_ncmounton.ncp == nch->ncp)
	1066	return(1);
	1067	if (mp->mnt_ncmountpt.ncp == nch->ncp)
	1068	return(1);
	1069	return(0);
	1070	}
	1071
	1072	/*
	1073	* MPSAFE
	1074	*/
	1075	void
	1076	cache_clrmountpt(struct nchandle *nch)
	1077	{
	1078	int count;
	1079
	1080	count = mountlist_scan(cache_clrmountpt_callback, nch,
	1081	MNTSCAN_FORWARD\|MNTSCAN_NOBUSY);
	1082	if (count == 0)
	1083	nch->ncp->nc_flag &= ~NCF_ISMOUNTPT;
	1084	}
	1085
	1086	/*
	1087	* Invalidate portions of the namecache topology given a starting entry.
	1088	* The passed ncp is set to an unresolved state and:
	1089	*
	1090	* The passed ncp must be referenced and locked. The routine may unlock
	1091	* and relock ncp several times, and will recheck the children and loop
	1092	* to catch races. When done the passed ncp will be returned with the
	1093	* reference and lock intact.
	1094	*
	1095	* CINV_DESTROY - Set a flag in the passed ncp entry indicating
	1096	* that the physical underlying nodes have been
	1097	* destroyed... as in deleted. For example, when
	1098	* a directory is removed. This will cause record
	1099	* lookups on the name to no longer be able to find
	1100	* the record and tells the resolver to return failure
	1101	* rather than trying to resolve through the parent.
	1102	*
	1103	* The topology itself, including ncp->nc_name,
	1104	* remains intact.
	1105	*
	1106	* This only applies to the passed ncp, if CINV_CHILDREN
	1107	* is specified the children are not flagged.
	1108	*
	1109	* CINV_CHILDREN - Set all children (recursively) to an unresolved
	1110	* state as well.
	1111	*
	1112	* Note that this will also have the side effect of
	1113	* cleaning out any unreferenced nodes in the topology
	1114	* from the leaves up as the recursion backs out.
	1115	*
	1116	* Note that the topology for any referenced nodes remains intact, but
	1117	* the nodes will be marked as having been destroyed and will be set
	1118	* to an unresolved state.
	1119	*
	1120	* It is possible for cache_inval() to race a cache_resolve(), meaning that
	1121	* the namecache entry may not actually be invalidated on return if it was
	1122	* revalidated while recursing down into its children. This code guarantees
	1123	* that the node(s) will go through an invalidation cycle, but does not
	1124	* guarantee that they will remain in an invalidated state.
	1125	*
	1126	* Returns non-zero if a revalidation was detected during the invalidation
	1127	* recursion, zero otherwise. Note that since only the original ncp is
	1128	* locked the revalidation ultimately can only indicate that the original ncp
	1129	* MIGHT no have been reresolved.
	1130	*
	1131	* DEEP RECURSION HANDLING - If a recursive invalidation recurses deeply we
	1132	* have to avoid blowing out the kernel stack. We do this by saving the
	1133	* deep namecache node and aborting the recursion, then re-recursing at that
	1134	* node using a depth-first algorithm in order to allow multiple deep
	1135	* recursions to chain through each other, then we restart the invalidation
	1136	* from scratch.
	1137	*
	1138	* MPSAFE
	1139	*/
	1140
	/*
	 * Recursion tracking state shared by _cache_inval() and
	 * _cache_inval_internal().
	 */
	struct cinvtrack {
		struct namecache *resume_ncp;	/* held node to restart from, or NULL */
		int depth;			/* current recursion depth */
	};

	static int _cache_inval_internal(struct namecache *, int, struct cinvtrack *);
	1147
	1148	static
	1149	int
	1150	_cache_inval(struct namecache *ncp, int flags)
	1151	{
	1152	struct cinvtrack track;
	1153	struct namecache *ncp2;
	1154	int r;
	1155
	1156	track.depth = 0;
	1157	track.resume_ncp = NULL;
	1158
	1159	for (;;) {
	1160	r = _cache_inval_internal(ncp, flags, &track);
	1161	if (track.resume_ncp == NULL)
	1162	break;
	1163	kprintf("Warning: deep namecache recursion at %s\n",
	1164	ncp->nc_name);
	1165	_cache_unlock(ncp);
	1166	while ((ncp2 = track.resume_ncp) != NULL) {
	1167	track.resume_ncp = NULL;
	1168	_cache_lock(ncp2);
	1169	_cache_inval_internal(ncp2, flags & ~CINV_DESTROY,
	1170	&track);
	1171	_cache_put(ncp2);
	1172	}
	1173	_cache_lock(ncp);
	1174	}
	1175	return(r);
	1176	}
	1177
	1178	int
	1179	cache_inval(struct nchandle *nch, int flags)
	1180	{
	1181	return(_cache_inval(nch->ncp, flags));
	1182	}
	1183
	1184	/*
	1185	* Helper for _cache_inval(). The passed ncp is refd and locked and
	1186	* remains that way on return, but may be unlocked/relocked multiple
	1187	* times by the routine.
	1188	*/
	1189	static int
	1190	_cache_inval_internal(struct namecache ncp, int flags, struct cinvtrack track)
	1191	{
	1192	struct namecache *kid;
	1193	struct namecache *nextkid;
	1194	int rcnt = 0;
	1195
	1196	KKASSERT(ncp->nc_exlocks);
	1197
	1198	_cache_setunresolved(ncp);
	1199	if (flags & CINV_DESTROY)
	1200	ncp->nc_flag \|= NCF_DESTROYED;
	1201	if ((flags & CINV_CHILDREN) &&
	1202	(kid = TAILQ_FIRST(&ncp->nc_list)) != NULL
	1203	) {
	1204	_cache_hold(kid);
	1205	if (++track->depth > MAX_RECURSION_DEPTH) {
	1206	track->resume_ncp = ncp;
	1207	_cache_hold(ncp);
	1208	++rcnt;
	1209	}
	1210	_cache_unlock(ncp);
	1211	while (kid) {
	1212	if (track->resume_ncp) {
	1213	_cache_drop(kid);
	1214	break;
	1215	}
	1216	if ((nextkid = TAILQ_NEXT(kid, nc_entry)) != NULL)
	1217	_cache_hold(nextkid);
	1218	if ((kid->nc_flag & NCF_UNRESOLVED) == 0 \|\|
	1219	TAILQ_FIRST(&kid->nc_list)
	1220	) {
	1221	_cache_lock(kid);
	1222	rcnt += _cache_inval_internal(kid, flags & ~CINV_DESTROY, track);
	1223	_cache_unlock(kid);
	1224	}
	1225	_cache_drop(kid);
	1226	kid = nextkid;
	1227	}
	1228	--track->depth;
	1229	_cache_lock(ncp);
	1230	}
	1231
	1232	/*
	1233	* Someone could have gotten in there while ncp was unlocked,
	1234	* retry if so.
	1235	*/
	1236	if ((ncp->nc_flag & NCF_UNRESOLVED) == 0)
	1237	++rcnt;
	1238	return (rcnt);
	1239	}
	1240
	1241	/*
	1242	* Invalidate a vnode's namecache associations. To avoid races against
	1243	* the resolver we do not invalidate a node which we previously invalidated
	1244	* but which was then re-resolved while we were in the invalidation loop.
	1245	*
	1246	* Returns non-zero if any namecache entries remain after the invalidation
	1247	* loop completed.
	1248	*
	1249	* NOTE: Unlike the namecache topology which guarentees that ncp's will not
	1250	* be ripped out of the topology while held, the vnode's v_namecache
	1251	* list has no such restriction. NCP's can be ripped out of the list
	1252	* at virtually any time if not locked, even if held.
	1253	*
	1254	* In addition, the v_namecache list itself must be locked via
	1255	* the vnode's spinlock.
	1256	*
	1257	* MPSAFE
	1258	*/
	1259	int
	1260	cache_inval_vp(struct vnode *vp, int flags)
	1261	{
	1262	struct namecache *ncp;
	1263	struct namecache *next;
	1264
	1265	restart:
	1266	spin_lock(&vp->v_spin);
	1267	ncp = TAILQ_FIRST(&vp->v_namecache);
	1268	if (ncp)
	1269	_cache_hold(ncp);
	1270	while (ncp) {
	1271	/* loop entered with ncp held and vp spin-locked */
	1272	if ((next = TAILQ_NEXT(ncp, nc_vnode)) != NULL)
	1273	_cache_hold(next);
	1274	spin_unlock(&vp->v_spin);
	1275	_cache_lock(ncp);
	1276	if (ncp->nc_vp != vp) {
	1277	kprintf("Warning: cache_inval_vp: race-A detected on "
	1278	"%s\n", ncp->nc_name);
	1279	_cache_put(ncp);
	1280	if (next)
	1281	_cache_drop(next);
	1282	goto restart;
	1283	}
	1284	_cache_inval(ncp, flags);
	1285	_cache_put(ncp); /* also releases reference */
	1286	ncp = next;
	1287	spin_lock(&vp->v_spin);
	1288	if (ncp && ncp->nc_vp != vp) {
	1289	spin_unlock(&vp->v_spin);
	1290	kprintf("Warning: cache_inval_vp: race-B detected on "
	1291	"%s\n", ncp->nc_name);
	1292	_cache_drop(ncp);
	1293	goto restart;
	1294	}
	1295	}
	1296	spin_unlock(&vp->v_spin);
	1297	return(TAILQ_FIRST(&vp->v_namecache) != NULL);
	1298	}
	1299
	1300	/*
	1301	* This routine is used instead of the normal cache_inval_vp() when we
	1302	* are trying to recycle otherwise good vnodes.
	1303	*
	1304	* Return 0 on success, non-zero if not all namecache records could be
	1305	* disassociated from the vnode (for various reasons).
	1306	*
	1307	* MPSAFE
	1308	*/
	1309	int
	1310	cache_inval_vp_nonblock(struct vnode *vp)
	1311	{
	1312	struct namecache *ncp;
	1313	struct namecache *next;
	1314
	1315	spin_lock(&vp->v_spin);
	1316	ncp = TAILQ_FIRST(&vp->v_namecache);
	1317	if (ncp)
	1318	_cache_hold(ncp);
	1319	while (ncp) {
	1320	/* loop entered with ncp held */
	1321	if ((next = TAILQ_NEXT(ncp, nc_vnode)) != NULL)
	1322	_cache_hold(next);
	1323	spin_unlock(&vp->v_spin);
	1324	if (_cache_lock_nonblock(ncp)) {
	1325	_cache_drop(ncp);
	1326	if (next)
	1327	_cache_drop(next);
	1328	goto done;
	1329	}
	1330	if (ncp->nc_vp != vp) {
	1331	kprintf("Warning: cache_inval_vp: race-A detected on "
	1332	"%s\n", ncp->nc_name);
	1333	_cache_put(ncp);
	1334	if (next)
	1335	_cache_drop(next);
	1336	goto done;
	1337	}
	1338	_cache_inval(ncp, 0);
	1339	_cache_put(ncp); /* also releases reference */
	1340	ncp = next;
	1341	spin_lock(&vp->v_spin);
	1342	if (ncp && ncp->nc_vp != vp) {
	1343	spin_unlock(&vp->v_spin);
	1344	kprintf("Warning: cache_inval_vp: race-B detected on "
	1345	"%s\n", ncp->nc_name);
	1346	_cache_drop(ncp);
	1347	goto done;
	1348	}
	1349	}
	1350	spin_unlock(&vp->v_spin);
	1351	done:
	1352	return(TAILQ_FIRST(&vp->v_namecache) != NULL);
	1353	}
	1354
	1355	/*
	1356	* The source ncp has been renamed to the target ncp. Both fncp and tncp
	1357	* must be locked. The target ncp is destroyed (as a normal rename-over
	1358	* would destroy the target file or directory).
	1359	*
	1360	* Because there may be references to the source ncp we cannot copy its
	1361	* contents to the target. Instead the source ncp is relinked as the target
	1362	* and the target ncp is removed from the namecache topology.
	1363	*
	1364	* MPSAFE
	1365	*/
	1366	void
	1367	cache_rename(struct nchandle fnch, struct nchandle tnch)
	1368	{
	1369	struct namecache *fncp = fnch->ncp;
	1370	struct namecache *tncp = tnch->ncp;
	1371	struct namecache *tncp_par;
	1372	struct nchash_head *nchpp;
	1373	u_int32_t hash;
	1374	char *oname;
	1375
	1376	/*
	1377	* Rename fncp (unlink)
	1378	*/
	1379	_cache_unlink_parent(fncp);
	1380	oname = fncp->nc_name;
	1381	fncp->nc_name = tncp->nc_name;
	1382	fncp->nc_nlen = tncp->nc_nlen;
	1383	tncp_par = tncp->nc_parent;
	1384	_cache_hold(tncp_par);
	1385	_cache_lock(tncp_par);
	1386
	1387	/*
	1388	* Rename fncp (relink)
	1389	*/
	1390	hash = fnv_32_buf(fncp->nc_name, fncp->nc_nlen, FNV1_32_INIT);
	1391	hash = fnv_32_buf(&tncp_par, sizeof(tncp_par), hash);
	1392	nchpp = NCHHASH(hash);
	1393
	1394	spin_lock(&nchpp->spin);
	1395	_cache_link_parent(fncp, tncp_par, nchpp);
	1396	spin_unlock(&nchpp->spin);
	1397
	1398	_cache_put(tncp_par);
	1399
	1400	/*
	1401	* Get rid of the overwritten tncp (unlink)
	1402	*/
	1403	_cache_setunresolved(tncp);
	1404	_cache_unlink_parent(tncp);
	1405	tncp->nc_name = NULL;
	1406	tncp->nc_nlen = 0;
	1407
	1408	if (oname)
	1409	kfree(oname, M_VFSCACHE);
	1410	}
	1411
	1412	/*
	1413	* vget the vnode associated with the namecache entry. Resolve the namecache
	1414	* entry if necessary. The passed ncp must be referenced and locked.
	1415	*
	1416	* lk_type may be LK_SHARED, LK_EXCLUSIVE. A ref'd, possibly locked
	1417	* (depending on the passed lk_type) will be returned in *vpp with an error
	1418	* of 0, or NULL will be returned in *vpp with a non-0 error code. The
	1419	* most typical error is ENOENT, meaning that the ncp represents a negative
	1420	* cache hit and there is no vnode to retrieve, but other errors can occur
	1421	* too.
	1422	*
	1423	* The vget() can race a reclaim. If this occurs we re-resolve the
	1424	* namecache entry.
	1425	*
	1426	* There are numerous places in the kernel where vget() is called on a
	1427	* vnode while one or more of its namecache entries is locked. Releasing
	1428	* a vnode never deadlocks against locked namecache entries (the vnode
	1429	* will not get recycled while referenced ncp's exist). This means we
	1430	* can safely acquire the vnode. In fact, we MUST NOT release the ncp
	1431	* lock when acquiring the vp lock or we might cause a deadlock.
	1432	*
	1433	* MPSAFE
	1434	*/
	1435	int
	1436	cache_vget(struct nchandle nch, struct ucred cred,
	1437	int lk_type, struct vnode **vpp)
	1438	{
	1439	struct namecache *ncp;
	1440	struct vnode *vp;
	1441	int error;
	1442
	1443	ncp = nch->ncp;
	1444	KKASSERT(ncp->nc_locktd == curthread);
	1445	again:
	1446	vp = NULL;
	1447	if (ncp->nc_flag & NCF_UNRESOLVED)
	1448	error = cache_resolve(nch, cred);
	1449	else
	1450	error = 0;
	1451
	1452	if (error == 0 && (vp = ncp->nc_vp) != NULL) {
	1453	error = vget(vp, lk_type);
	1454	if (error) {
	1455	/*
	1456	* VRECLAIM race
	1457	*/
	1458	if (error == ENOENT) {
	1459	kprintf("Warning: vnode reclaim race detected "
	1460	"in cache_vget on %p (%s)\n",
	1461	vp, ncp->nc_name);
	1462	_cache_setunresolved(ncp);
	1463	goto again;
	1464	}
	1465
	1466	/*
	1467	* Not a reclaim race, some other error.
	1468	*/
	1469	KKASSERT(ncp->nc_vp == vp);
	1470	vp = NULL;
	1471	} else {
	1472	KKASSERT(ncp->nc_vp == vp);
	1473	KKASSERT((vp->v_flag & VRECLAIMED) == 0);
	1474	}
	1475	}
	1476	if (error == 0 && vp == NULL)
	1477	error = ENOENT;
	1478	*vpp = vp;
	1479	return(error);
	1480	}
	1481
	1482	int
	1483	cache_vref(struct nchandle nch, struct ucred cred, struct vnode **vpp)
	1484	{
	1485	struct namecache *ncp;
	1486	struct vnode *vp;
	1487	int error;
	1488
	1489	ncp = nch->ncp;
	1490	KKASSERT(ncp->nc_locktd == curthread);
	1491	again:
	1492	vp = NULL;
	1493	if (ncp->nc_flag & NCF_UNRESOLVED)
	1494	error = cache_resolve(nch, cred);
	1495	else
	1496	error = 0;
	1497
	1498	if (error == 0 && (vp = ncp->nc_vp) != NULL) {
	1499	error = vget(vp, LK_SHARED);
	1500	if (error) {
	1501	/*
	1502	* VRECLAIM race
	1503	*/
	1504	if (error == ENOENT) {
	1505	kprintf("Warning: vnode reclaim race detected "
	1506	"in cache_vget on %p (%s)\n",
	1507	vp, ncp->nc_name);
	1508	_cache_setunresolved(ncp);
	1509	goto again;
	1510	}
	1511
	1512	/*
	1513	* Not a reclaim race, some other error.
	1514	*/
	1515	KKASSERT(ncp->nc_vp == vp);
	1516	vp = NULL;
	1517	} else {
	1518	KKASSERT(ncp->nc_vp == vp);
	1519	KKASSERT((vp->v_flag & VRECLAIMED) == 0);
	1520	/* caller does not want a lock */
	1521	vn_unlock(vp);
	1522	}
	1523	}
	1524	if (error == 0 && vp == NULL)
	1525	error = ENOENT;
	1526	*vpp = vp;
	1527	return(error);
	1528	}
	1529
	1530	/*
	1531	* Return a referenced vnode representing the parent directory of
	1532	* ncp.
	1533	*
	1534	* Because the caller has locked the ncp it should not be possible for
	1535	* the parent ncp to go away. However, the parent can unresolve its
	1536	* dvp at any time so we must be able to acquire a lock on the parent
	1537	* to safely access nc_vp.
	1538	*
	1539	* We have to leave par unlocked when vget()ing dvp to avoid a deadlock,
	1540	* so use vhold()/vdrop() while holding the lock to prevent dvp from
	1541	* getting destroyed.
	1542	*
	1543	* MPSAFE - Note vhold() is allowed when dvp has 0 refs if we hold a
	1544	* lock on the ncp in question..
	1545	*/
	1546	static struct vnode *
	1547	cache_dvpref(struct namecache *ncp)
	1548	{
	1549	struct namecache *par;
	1550	struct vnode *dvp;
	1551
	1552	dvp = NULL;
	1553	if ((par = ncp->nc_parent) != NULL) {
	1554	_cache_hold(par);
	1555	_cache_lock(par);
	1556	if ((par->nc_flag & NCF_UNRESOLVED) == 0) {
	1557	if ((dvp = par->nc_vp) != NULL)
	1558	vhold(dvp);
	1559	}
	1560	_cache_unlock(par);
	1561	if (dvp) {
	1562	if (vget(dvp, LK_SHARED) == 0) {
	1563	vn_unlock(dvp);
	1564	vdrop(dvp);
	1565	/* return refd, unlocked dvp */
	1566	} else {
	1567	vdrop(dvp);
	1568	dvp = NULL;
	1569	}
	1570	}
	1571	_cache_drop(par);
	1572	}
	1573	return(dvp);
	1574	}
	1575
	1576	/*
	1577	* Convert a directory vnode to a namecache record without any other
	1578	* knowledge of the topology. This ONLY works with directory vnodes and
	1579	* is ONLY used by the NFS server. dvp must be refd but unlocked, and the
	1580	* returned ncp (if not NULL) will be held and unlocked.
	1581	*
	1582	* If 'makeit' is 0 and dvp has no existing namecache record, NULL is returned.
	1583	* If 'makeit' is 1 we attempt to track-down and create the namecache topology
	1584	* for dvp. This will fail only if the directory has been deleted out from
	1585	* under the caller.
	1586	*
	1587	* Callers must always check for a NULL return no matter the value of 'makeit'.
	1588	*
	1589	* To avoid underflowing the kernel stack each recursive call increments
	1590	* the makeit variable.
	1591	*/
	1592
	/* Forward declarations for the helpers used by cache_fromdvp(). */
	static int cache_inefficient_scan(struct nchandle *nch, struct ucred *cred,
					  struct vnode *dvp, char *fakename);
	static int cache_fromdvp_try(struct vnode *dvp, struct ucred *cred,
				     struct vnode **saved_dvp);
	1597
	1598	int
	1599	cache_fromdvp(struct vnode dvp, struct ucred cred, int makeit,
	1600	struct nchandle *nch)
	1601	{
	1602	struct vnode *saved_dvp;
	1603	struct vnode *pvp;
	1604	char *fakename;
	1605	int error;
	1606
	1607	nch->ncp = NULL;
	1608	nch->mount = dvp->v_mount;
	1609	saved_dvp = NULL;
	1610	fakename = NULL;
	1611
	1612	/*
	1613	* Handle the makeit == 0 degenerate case
	1614	*/
	1615	if (makeit == 0) {
	1616	spin_lock(&dvp->v_spin);
	1617	nch->ncp = TAILQ_FIRST(&dvp->v_namecache);
	1618	if (nch->ncp)
	1619	cache_hold(nch);
	1620	spin_unlock(&dvp->v_spin);
	1621	}
	1622
	1623	/*
	1624	* Loop until resolution, inside code will break out on error.
	1625	*/
	1626	while (makeit) {
	1627	/*
	1628	* Break out if we successfully acquire a working ncp.
	1629	*/
	1630	spin_lock(&dvp->v_spin);
	1631	nch->ncp = TAILQ_FIRST(&dvp->v_namecache);
	1632	if (nch->ncp) {
	1633	cache_hold(nch);
	1634	spin_unlock(&dvp->v_spin);
	1635	break;
	1636	}
	1637	spin_unlock(&dvp->v_spin);
	1638
	1639	/*
	1640	* If dvp is the root of its filesystem it should already
	1641	* have a namecache pointer associated with it as a side
	1642	* effect of the mount, but it may have been disassociated.
	1643	*/
	1644	if (dvp->v_flag & VROOT) {
	1645	nch->ncp = _cache_get(nch->mount->mnt_ncmountpt.ncp);
	1646	error = cache_resolve_mp(nch->mount);
	1647	_cache_put(nch->ncp);
	1648	if (ncvp_debug) {
	1649	kprintf("cache_fromdvp: resolve root of mount %p error %d",
	1650	dvp->v_mount, error);
	1651	}
	1652	if (error) {
	1653	if (ncvp_debug)
	1654	kprintf(" failed\n");
	1655	nch->ncp = NULL;
	1656	break;
	1657	}
	1658	if (ncvp_debug)
	1659	kprintf(" succeeded\n");
	1660	continue;
	1661	}
	1662
	1663	/*
	1664	* If we are recursed too deeply resort to an O(n^2)
	1665	* algorithm to resolve the namecache topology. The
	1666	* resolved pvp is left referenced in saved_dvp to
	1667	* prevent the tree from being destroyed while we loop.
	1668	*/
	1669	if (makeit > 20) {
	1670	error = cache_fromdvp_try(dvp, cred, &saved_dvp);
	1671	if (error) {
	1672	kprintf("lookupdotdot(longpath) failed %d "
	1673	"dvp %p\n", error, dvp);
	1674	nch->ncp = NULL;
	1675	break;
	1676	}
	1677	continue;
	1678	}
	1679
	1680	/*
	1681	* Get the parent directory and resolve its ncp.
	1682	*/
	1683	if (fakename) {
	1684	kfree(fakename, M_TEMP);
	1685	fakename = NULL;
	1686	}
	1687	error = vop_nlookupdotdot(*dvp->v_ops, dvp, &pvp, cred,
	1688	&fakename);
	1689	if (error) {
	1690	kprintf("lookupdotdot failed %d dvp %p\n", error, dvp);
	1691	break;
	1692	}
	1693	vn_unlock(pvp);
	1694
	1695	/*
	1696	* Reuse makeit as a recursion depth counter. On success
	1697	* nch will be fully referenced.
	1698	*/
	1699	cache_fromdvp(pvp, cred, makeit + 1, nch);
	1700	vrele(pvp);
	1701	if (nch->ncp == NULL)
	1702	break;
	1703
	1704	/*
	1705	* Do an inefficient scan of pvp (embodied by ncp) to look
	1706	* for dvp. This will create a namecache record for dvp on
	1707	* success. We loop up to recheck on success.
	1708	*
	1709	* ncp and dvp are both held but not locked.
	1710	*/
	1711	error = cache_inefficient_scan(nch, cred, dvp, fakename);
	1712	if (error) {
	1713	kprintf("cache_fromdvp: scan %p (%s) failed on dvp=%p\n",
	1714	pvp, nch->ncp->nc_name, dvp);
	1715	cache_drop(nch);
	1716	/* nch was NULLed out, reload mount */
	1717	nch->mount = dvp->v_mount;
	1718	break;
	1719	}
	1720	if (ncvp_debug) {
	1721	kprintf("cache_fromdvp: scan %p (%s) succeeded\n",
	1722	pvp, nch->ncp->nc_name);
	1723	}
	1724	cache_drop(nch);
	1725	/* nch was NULLed out, reload mount */
	1726	nch->mount = dvp->v_mount;
	1727	}
	1728
	1729	/*
	1730	* If nch->ncp is non-NULL it will have been held already.
	1731	*/
	1732	if (fakename)
	1733	kfree(fakename, M_TEMP);
	1734	if (saved_dvp)
	1735	vrele(saved_dvp);
	1736	if (nch->ncp)
	1737	return (0);
	1738	return (EINVAL);
	1739	}
	1740
	1741	/*
	1742	* Go up the chain of parent directories until we find something
	1743	* we can resolve into the namecache. This is very inefficient.
	1744	*/
	1745	static
	1746	int
	1747	cache_fromdvp_try(struct vnode dvp, struct ucred cred,
	1748	struct vnode **saved_dvp)
	1749	{
	1750	struct nchandle nch;
	1751	struct vnode *pvp;
	1752	int error;
	1753	static time_t last_fromdvp_report;
	1754	char *fakename;
	1755
	1756	/*
	1757	* Loop getting the parent directory vnode until we get something we
	1758	* can resolve in the namecache.
	1759	*/
	1760	vref(dvp);
	1761	nch.mount = dvp->v_mount;
	1762	nch.ncp = NULL;
	1763	fakename = NULL;
	1764
	1765	for (;;) {
	1766	if (fakename) {
	1767	kfree(fakename, M_TEMP);
	1768	fakename = NULL;
	1769	}
	1770	error = vop_nlookupdotdot(*dvp->v_ops, dvp, &pvp, cred,
	1771	&fakename);
	1772	if (error) {
	1773	vrele(dvp);
	1774	break;
	1775	}
	1776	vn_unlock(pvp);
	1777	spin_lock(&pvp->v_spin);
	1778	if ((nch.ncp = TAILQ_FIRST(&pvp->v_namecache)) != NULL) {
	1779	_cache_hold(nch.ncp);
	1780	spin_unlock(&pvp->v_spin);
	1781	vrele(pvp);
	1782	break;
	1783	}
	1784	spin_unlock(&pvp->v_spin);
	1785	if (pvp->v_flag & VROOT) {
	1786	nch.ncp = _cache_get(pvp->v_mount->mnt_ncmountpt.ncp);
	1787	error = cache_resolve_mp(nch.mount);
	1788	_cache_unlock(nch.ncp);
	1789	vrele(pvp);
	1790	if (error) {
	1791	_cache_drop(nch.ncp);
	1792	nch.ncp = NULL;
	1793	vrele(dvp);
	1794	}
	1795	break;
	1796	}
	1797	vrele(dvp);
	1798	dvp = pvp;
	1799	}
	1800	if (error == 0) {
	1801	if (last_fromdvp_report != time_second) {
	1802	last_fromdvp_report = time_second;
	1803	kprintf("Warning: extremely inefficient path "
	1804	"resolution on %s\n",
	1805	nch.ncp->nc_name);
	1806	}
	1807	error = cache_inefficient_scan(&nch, cred, dvp, fakename);
	1808
	1809	/*
	1810	* Hopefully dvp now has a namecache record associated with
	1811	* it. Leave it referenced to prevent the kernel from
	1812	* recycling the vnode. Otherwise extremely long directory
	1813	* paths could result in endless recycling.
	1814	*/
	1815	if (*saved_dvp)
	1816	vrele(*saved_dvp);
	1817	*saved_dvp = dvp;
	1818	_cache_drop(nch.ncp);
	1819	}
	1820	if (fakename)
	1821	kfree(fakename, M_TEMP);
	1822	return (error);
	1823	}
	1824
	1825	/*
	1826	* Do an inefficient scan of the directory represented by ncp looking for
	1827	* the directory vnode dvp. ncp must be held but not locked on entry and
	1828	* will be held on return. dvp must be refd but not locked on entry and
	1829	* will remain refd on return.
	1830	*
	1831	* Why do this at all? Well, due to its stateless nature the NFS server
	1832	* converts file handles directly to vnodes without necessarily going through
	1833	* the namecache ops that would otherwise create the namecache topology
	1834	* leading to the vnode. We could either (1) Change the namecache algorithms
	1835	* to allow disconnected namecache records that are re-merged opportunistically,
	1836	* or (2) Make the NFS server backtrack and scan to recover a connected
	1837	* namecache topology in order to then be able to issue new API lookups.
	1838	*
	1839	* It turns out that (1) is a huge mess. It takes a nice clean set of
	1840	* namecache algorithms and introduces a lot of complication in every subsystem
	1841	* that calls into the namecache to deal with the re-merge case, especially
	1842	* since we are using the namecache to placehold negative lookups and the
	1843	* vnode might not be immediately assigned. (2) is certainly far less
	1844	* efficient than (1), but since we are only talking about directories here
	1845	* (which are likely to remain cached), the case does not actually run all
	1846	* that often and has the supreme advantage of not polluting the namecache
	1847	* algorithms.
	1848	*
	1849	* If a fakename is supplied just construct a namecache entry using the
	1850	* fake name.
	1851	*/
	1852	static int
	1853	cache_inefficient_scan(struct nchandle nch, struct ucred cred,
	1854	struct vnode dvp, char fakename)
	1855	{
	1856	struct nlcomponent nlc;
	1857	struct nchandle rncp;
	1858	struct dirent *den;
	1859	struct vnode *pvp;
	1860	struct vattr vat;
	1861	struct iovec iov;
	1862	struct uio uio;
	1863	int blksize;
	1864	int eofflag;
	1865	int bytes;
	1866	char *rbuf;
	1867	int error;
	1868
	1869	vat.va_blocksize = 0;
	1870	if ((error = VOP_GETATTR(dvp, &vat)) != 0)
	1871	return (error);
	1872	cache_lock(nch);
	1873	error = cache_vref(nch, cred, &pvp);
	1874	cache_unlock(nch);
	1875	if (error)
	1876	return (error);
	1877	if (ncvp_debug) {
	1878	kprintf("inefficient_scan: directory iosize %ld "
	1879	"vattr fileid = %lld\n",
	1880	vat.va_blocksize,
	1881	(long long)vat.va_fileid);
	1882	}
	1883
	1884	/*
	1885	* Use the supplied fakename if not NULL. Fake names are typically
	1886	* not in the actual filesystem hierarchy. This is used by HAMMER
	1887	* to glue @@timestamp recursions together.
	1888	*/
	1889	if (fakename) {
	1890	nlc.nlc_nameptr = fakename;
	1891	nlc.nlc_namelen = strlen(fakename);
	1892	rncp = cache_nlookup(nch, &nlc);
	1893	goto done;
	1894	}
	1895
	1896	if ((blksize = vat.va_blocksize) == 0)
	1897	blksize = DEV_BSIZE;
	1898	rbuf = kmalloc(blksize, M_TEMP, M_WAITOK);
	1899	rncp.ncp = NULL;
	1900
	1901	eofflag = 0;
	1902	uio.uio_offset = 0;
	1903	again:
	1904	iov.iov_base = rbuf;
	1905	iov.iov_len = blksize;
	1906	uio.uio_iov = &iov;
	1907	uio.uio_iovcnt = 1;
	1908	uio.uio_resid = blksize;
	1909	uio.uio_segflg = UIO_SYSSPACE;
	1910	uio.uio_rw = UIO_READ;
	1911	uio.uio_td = curthread;
	1912
	1913	if (ncvp_debug >= 2)
	1914	kprintf("cache_inefficient_scan: readdir @ %08x\n", (int)uio.uio_offset);
	1915	error = VOP_READDIR(pvp, &uio, cred, &eofflag, NULL, NULL);
	1916	if (error == 0) {
	1917	den = (struct dirent *)rbuf;
	1918	bytes = blksize - uio.uio_resid;
	1919
	1920	while (bytes > 0) {
	1921	if (ncvp_debug >= 2) {
	1922	kprintf("cache_inefficient_scan: %.s\n",
	1923	den->d_namlen, den->d_namlen,
	1924	den->d_name);
	1925	}
	1926	if (den->d_type != DT_WHT &&
	1927	den->d_ino == vat.va_fileid) {
	1928	if (ncvp_debug) {
	1929	kprintf("cache_inefficient_scan: "
	1930	"MATCHED inode %lld path %s/%.s\n",
	1931	(long long)vat.va_fileid,
	1932	nch->ncp->nc_name,
	1933	den->d_namlen, den->d_namlen,
	1934	den->d_name);
	1935	}
	1936	nlc.nlc_nameptr = den->d_name;
	1937	nlc.nlc_namelen = den->d_namlen;
	1938	rncp = cache_nlookup(nch, &nlc);
	1939	KKASSERT(rncp.ncp != NULL);
	1940	break;
	1941	}
	1942	bytes -= _DIRENT_DIRSIZ(den);
	1943	den = _DIRENT_NEXT(den);
	1944	}
	1945	if (rncp.ncp == NULL && eofflag == 0 && uio.uio_resid != blksize)
	1946	goto again;
	1947	}
	1948	kfree(rbuf, M_TEMP);
	1949	done:
	1950	vrele(pvp);
	1951	if (rncp.ncp) {
	1952	if (rncp.ncp->nc_flag & NCF_UNRESOLVED) {
	1953	_cache_setvp(rncp.mount, rncp.ncp, dvp);
	1954	if (ncvp_debug >= 2) {
	1955	kprintf("cache_inefficient_scan: setvp %s/%s = %p\n",
	1956	nch->ncp->nc_name, rncp.ncp->nc_name, dvp);
	1957	}
	1958	} else {
	1959	if (ncvp_debug >= 2) {
	1960	kprintf("cache_inefficient_scan: setvp %s/%s already set %p/%p\n",
	1961	nch->ncp->nc_name, rncp.ncp->nc_name, dvp,
	1962	rncp.ncp->nc_vp);
	1963	}
	1964	}
	1965	if (rncp.ncp->nc_vp == NULL)
	1966	error = rncp.ncp->nc_error;
	1967	/*
	1968	* Release rncp after a successful nlookup. rncp was fully
	1969	* referenced.
	1970	*/
	1971	cache_put(&rncp);
	1972	} else {
	1973	kprintf("cache_inefficient_scan: dvp %p NOT FOUND in %s\n",
	1974	dvp, nch->ncp->nc_name);
	1975	error = ENOENT;
	1976	}
	1977	return (error);
	1978	}
	1979
	1980	/*
	1981	* Zap a namecache entry. The ncp is unconditionally set to an unresolved
	1982	* state, which disassociates it from its vnode or ncneglist.
	1983	*
	1984	* Then, if there are no additional references to the ncp and no children,
	1985	* the ncp is removed from the topology and destroyed.
	1986	*
	1987	* References and/or children may exist if the ncp is in the middle of the
	1988	* topology, preventing the ncp from being destroyed.
	1989	*
	1990	* This function must be called with the ncp held and locked and will unlock
	1991	* and drop it during zapping.
	1992	*
	1993	* If nonblock is non-zero and the parent ncp cannot be locked we give up.
	1994	* This case can occur in the cache_drop() path.
	1995	*
	1996	* This function may return a held (but NOT locked) parent node which the
	1997	* caller must drop. We do this so _cache_drop() can loop, to avoid
	1998	* blowing out the kernel stack.
	1999	*
	2000	* WARNING! For MPSAFE operation this routine must acquire up to three
	2001	* spin locks to be able to safely test nc_refs. Lock order is
	2002	* very important.
	2003	*
	2004	* hash spinlock if on hash list
	2005	* parent spinlock if child of parent
	2006	* (the ncp is unresolved so there is no vnode association)
	2007	*/
	2008	static struct namecache *
	2009	cache_zap(struct namecache *ncp, int nonblock)
	2010	{
	2011	struct namecache *par;
	2012	struct vnode *dropvp;
	2013	int refs;
	2014
	2015	/*
	2016	* Disassociate the vnode or negative cache ref and set NCF_UNRESOLVED.
	2017	*/
	2018	_cache_setunresolved(ncp);
	2019
	2020	/*
	2021	* Try to scrap the entry and possibly tail-recurse on its parent.
	2022	* We only scrap unref'd (other then our ref) unresolved entries,
	2023	* we do not scrap 'live' entries.
	2024	*
	2025	* Note that once the spinlocks are acquired if nc_refs == 1 no
	2026	* other references are possible. If it isn't, however, we have
	2027	* to decrement but also be sure to avoid a 1->0 transition.
	2028	*/
	2029	KKASSERT(ncp->nc_flag & NCF_UNRESOLVED);
	2030	KKASSERT(ncp->nc_refs > 0);
	2031
	2032	/*
	2033	* Acquire locks. Note that the parent can't go away while we hold
	2034	* a child locked.
	2035	*/
	2036	if ((par = ncp->nc_parent) != NULL) {
	2037	if (nonblock) {
	2038	for (;;) {
	2039	if (_cache_lock_nonblock(par) == 0)
	2040	break;
	2041	refs = ncp->nc_refs;
	2042	ncp->nc_flag \|= NCF_DEFEREDZAP;
	2043	++numdefered; /* MP race ok */
	2044	if (atomic_cmpset_int(&ncp->nc_refs,
	2045	refs, refs - 1)) {
	2046	_cache_unlock(ncp);
	2047	return(NULL);
	2048	}
	2049	cpu_pause();
	2050	}
	2051	_cache_hold(par);
	2052	} else {
	2053	_cache_hold(par);
	2054	_cache_lock(par);
	2055	}
	2056	spin_lock(&ncp->nc_head->spin);
	2057	}
	2058
	2059	/*
	2060	* If someone other then us has a ref or we have children
	2061	* we cannot zap the entry. The 1->0 transition and any
	2062	* further list operation is protected by the spinlocks
	2063	* we have acquired but other transitions are not.
	2064	*/
	2065	for (;;) {
	2066	refs = ncp->nc_refs;
	2067	if (refs == 1 && TAILQ_EMPTY(&ncp->nc_list))
	2068	break;
	2069	if (atomic_cmpset_int(&ncp->nc_refs, refs, refs - 1)) {
	2070	if (par) {
	2071	spin_unlock(&ncp->nc_head->spin);
	2072	_cache_put(par);
	2073	}
	2074	_cache_unlock(ncp);
	2075	return(NULL);
	2076	}
	2077	cpu_pause();
	2078	}
	2079
	2080	/*
	2081	* We are the only ref and with the spinlocks held no further
	2082	* refs can be acquired by others.
	2083	*
	2084	* Remove us from the hash list and parent list. We have to
	2085	* drop a ref on the parent's vp if the parent's list becomes
	2086	* empty.
	2087	*/
	2088	dropvp = NULL;
	2089	if (par) {
	2090	struct nchash_head *nchpp = ncp->nc_head;
	2091
	2092	KKASSERT(nchpp != NULL);
	2093	LIST_REMOVE(ncp, nc_hash);
	2094	TAILQ_REMOVE(&par->nc_list, ncp, nc_entry);
	2095	if (par->nc_vp && TAILQ_EMPTY(&par->nc_list))
	2096	dropvp = par->nc_vp;
	2097	ncp->nc_head = NULL;
	2098	ncp->nc_parent = NULL;
	2099	spin_unlock(&nchpp->spin);
	2100	_cache_unlock(par);
	2101	} else {
	2102	KKASSERT(ncp->nc_head == NULL);
	2103	}
	2104
	2105	/*
	2106	* ncp should not have picked up any refs. Physically
	2107	* destroy the ncp.
	2108	*/
	2109	KKASSERT(ncp->nc_refs == 1);
	2110	/* _cache_unlock(ncp) not required */
	2111	ncp->nc_refs = -1; /* safety */
	2112	if (ncp->nc_name)
	2113	kfree(ncp->nc_name, M_VFSCACHE);
	2114	kfree(ncp, M_VFSCACHE);
	2115
	2116	/*
	2117	* Delayed drop (we had to release our spinlocks)
	2118	*
	2119	* The refed parent (if not NULL) must be dropped. The
	2120	* caller is responsible for looping.
	2121	*/
	2122	if (dropvp)
	2123	vdrop(dropvp);
	2124	return(par);
	2125	}
	2126
	2127	/*
	2128	* Clean up dangling negative cache and defered-drop entries in the
	2129	* namecache.
	2130	*/
	2131	typedef enum { CHI_LOW, CHI_HIGH } cache_hs_t;	/* trim state: idle / currently trimming */
	2132
	2133	static cache_hs_t neg_cache_hysteresis_state = CHI_LOW;	/* negative-entry state, driven by cache_hysteresis() */
	2134	static cache_hs_t pos_cache_hysteresis_state = CHI_LOW;	/* positive-entry state, driven by cache_hysteresis() */
	2135
	2136	void
	2137	cache_hysteresis(void)
	2138	{
	2139		int pos_ceiling;
	2140
	2141		/*
	2142		 * Negative entries.  While idle (CHI_LOW) we start trimming once
	2143		 * numneg exceeds MINNEG and numneg * ncnegfactor exceeds numcache.
	2144		 * While trimming (CHI_HIGH) we ask for 10 more entries on every
	2145		 * call until either test fails at 9/10 of its threshold.
	2146		 *
	2147		 * The hysteresis reduces the impact on the critical path.
	2148		 */
	2149		if (neg_cache_hysteresis_state == CHI_LOW) {
	2150			if (numneg > MINNEG && numneg * ncnegfactor > numcache) {
	2151				_cache_cleanneg(10);
	2152				neg_cache_hysteresis_state = CHI_HIGH;
	2153			}
	2154		} else if (neg_cache_hysteresis_state == CHI_HIGH) {
	2155			if (numneg <= MINNEG * 9 / 10 ||
	2156			    numneg * ncnegfactor * 9 / 10 <= numcache
	2157			) {
	2158				neg_cache_hysteresis_state = CHI_LOW;
	2159			} else {
	2160				_cache_cleanneg(10);
	2161			}
	2162		}
	2163
	2164		/*
	2165		 * Positive entries, same idea.  These can pile up when there are
	2166		 * large numbers of hardlinks, because the vnode cache does not
	2167		 * stop hardlinked ncps from growing without bound.
	2168		 *
	2169		 * A zero ncposlimit selects a ceiling of desiredvnodes * 2.  The
	2170		 * exit threshold while trimming is 5/6 of the ceiling.
	2171		 */
	2172		pos_ceiling = ncposlimit;
	2173		if (pos_ceiling == 0) {
	2174			pos_ceiling = desiredvnodes * 2;
	2175		}
	2176
	2177		if (pos_cache_hysteresis_state == CHI_LOW) {
	2178			if (numcache > pos_ceiling && numcache > MINPOS) {
	2179				_cache_cleanpos(10);
	2180				pos_cache_hysteresis_state = CHI_HIGH;
	2181			}
	2182		} else if (pos_cache_hysteresis_state == CHI_HIGH) {
	2183			if (numcache > pos_ceiling * 5 / 6 && numcache > MINPOS) {
	2184				_cache_cleanpos(10);
	2185			} else {
	2186				pos_cache_hysteresis_state = CHI_LOW;
	2187			}
	2188		}
	2189
	2190		/*
	2191		 * Deferred zaps.  Sweep out ncps that could not be dropped
	2192		 * cleanly once too many of them have built up.  numdefered
	2193		 * is only approximate: such ncps can be reused, and by
	2194		 * design the counter is not maintained in an MP safe
	2195		 * manner.
	2196		 */
	2197		if (numdefered * ncnegfactor > numcache) {
	2198			_cache_cleandefered();
	2199		}
	2200	}
	2201
	2202	/*
	2203	* NEW NAMECACHE LOOKUP API
	2204	*
	2205	* Lookup an entry in the namecache. The passed par_nch must be referenced
	2206	* and unlocked. A referenced and locked nchandle with a non-NULL nch.ncp
	2207	* is ALWAYS returned, even if the supplied component is illegal.
	2208	*
	2209	* The resulting namecache entry should be returned to the system with
	2210	* cache_put() or cache_unlock() + cache_drop().
	2211	*
	2212	* namecache locks are recursive but care must be taken to avoid lock order
	2213	* reversals (hence why the passed par_nch must be unlocked). Locking
	2214	* rules are to order for parent traversals, not for child traversals.
	2215	*
	2216	* Nobody else will be able to manipulate the associated namespace (e.g.
	2217	* create, delete, rename, rename-target) until the caller unlocks the
	2218	* entry.
	2219	*
	2220	* The returned entry will be in one of three states: positive hit (non-null
	2221	* vnode), negative hit (null vnode), or unresolved (NCF_UNRESOLVED is set).
	2222	* Unresolved entries must be resolved through the filesystem to associate the
	2223	* vnode and/or determine whether a positive or negative hit has occurred.
	2224	*
	2225	* It is not necessary to lock a directory in order to lock namespace under
	2226	* that directory. In fact, it is explicitly not allowed to do that. A
	2227	* directory is typically only locked when being created, renamed, or
	2228	* destroyed.
	2229	*
	2230	* The directory (par) may be unresolved, in which case any returned child
	2231	* will likely also be marked unresolved. Likely but not guaranteed. Since
	2232	* the filesystem lookup requires a resolved directory vnode the caller is
	2233	* responsible for resolving the namecache chain top-down. This API
	2234	* specifically allows whole chains to be created in an unresolved state.
	2235	*/
	2236	struct nchandle
	2237	cache_nlookup(struct nchandle *par_nch, struct nlcomponent *nlc)
	2238	{
	2239	struct nchandle nch;
	2240	struct namecache *ncp;
	2241	struct namecache *new_ncp;	/* preallocated entry; linked in only if no match is found */
	2242	struct nchash_head *nchpp;
	2243	struct mount *mp;
	2244	u_int32_t hash;
	2245	globaldata_t gd;
	2246	int par_locked;	/* non-zero while we hold the lock on par_nch->ncp */
	2247
	2248	numcalls++;
	2249	gd = mycpu;
	2250	mp = par_nch->mount;
	2251	par_locked = 0;
	2252
	2253	/*
	2254	* This is a good time to call it, no ncp's are locked by
	2255	* the caller or us.
	2256	*/
	2257	cache_hysteresis();
	2258
	2259	/*
	2260	* Try to locate an existing entry. The hash covers the name
	2261	* bytes and then the value of the parent ncp pointer.
	2262	*/
	2262	hash = fnv_32_buf(nlc->nlc_nameptr, nlc->nlc_namelen, FNV1_32_INIT);
	2263	hash = fnv_32_buf(&par_nch->ncp, sizeof(par_nch->ncp), hash);
	2264	new_ncp = NULL;
	2265	nchpp = NCHHASH(hash);
	2266	restart:
	2267	spin_lock(&nchpp->spin);
	2268	LIST_FOREACH(ncp, &nchpp->list, nc_hash) {
	2269	numchecks++;
	2270
	2271	/*
	2272	* Break out if we find a matching entry. Note that
	2273	* UNRESOLVED entries may match, but DESTROYED entries
	2274	* do not.
	2275	*/
	2276	if (ncp->nc_parent == par_nch->ncp &&
	2277	ncp->nc_nlen == nlc->nlc_namelen &&
	2278	bcmp(ncp->nc_name, nlc->nlc_nameptr, ncp->nc_nlen) == 0 &&
	2279	(ncp->nc_flag & NCF_DESTROYED) == 0
	2280	) {
	2281	_cache_hold(ncp);	/* ref taken before the chain spinlock is released */
	2282	spin_unlock(&nchpp->spin);
	2283	if (par_locked) {
	2284	_cache_unlock(par_nch->ncp);
	2285	par_locked = 0;
	2286	}
	2287	if (_cache_lock_special(ncp) == 0) {
	2288	_cache_auto_unresolve(mp, ncp);
	2289	if (new_ncp)
	2290	_cache_free(new_ncp);	/* the preallocated entry is not needed */
	2291	goto found;
	2292	}
	2293	_cache_get(ncp);	/* lock_special failed: take an extra hold + lock ... */
	2294	_cache_put(ncp);	/* ... release it again ... */
	2295	_cache_drop(ncp);	/* ... drop the hold from above and rescan */
	2296	goto restart;
	2297	}
	2298	}
	2299
	2300	/*
	2301	* We failed to locate an entry, create a new entry and add it to
	2302	* the cache. The parent ncp must also be locked so we
	2303	* can link into it.
	2304	*
	2305	* We have to relookup after possibly blocking in kmalloc or
	2306	* when locking par_nch.
	2307	*
	2308	* NOTE: nlc_namelen can be 0 and nlc_nameptr NULL as a special
	2309	* mount case, in which case nc_name will be NULL.
	2310	*/
	2311	if (new_ncp == NULL) {
	2312	spin_unlock(&nchpp->spin);
	2313	new_ncp = cache_alloc(nlc->nlc_namelen);
	2314	if (nlc->nlc_namelen) {
	2315	bcopy(nlc->nlc_nameptr, new_ncp->nc_name,
	2316	nlc->nlc_namelen);
	2317	new_ncp->nc_name[nlc->nlc_namelen] = 0;
	2318	}
	2319	goto restart;
	2320	}
	2321	if (par_locked == 0) {
	2322	spin_unlock(&nchpp->spin);
	2323	_cache_lock(par_nch->ncp);
	2324	par_locked = 1;
	2325	goto restart;
	2326	}
	2327
	2328	/*
	2329	* WARNING! We still hold the spinlock. We have to set the hash
	2330	* table entry atomically.
	2331	*/
	2332	ncp = new_ncp;
	2333	_cache_link_parent(ncp, par_nch->ncp, nchpp);
	2334	spin_unlock(&nchpp->spin);
	2335	_cache_unlock(par_nch->ncp);
	2336	/* par_locked = 0 - not used */
	2337	found:
	2338	/*
	2339	* stats and namecache size management
	2340	*/
	2341	if (ncp->nc_flag & NCF_UNRESOLVED)
	2342	++gd->gd_nchstats->ncs_miss;
	2343	else if (ncp->nc_vp)
	2344	++gd->gd_nchstats->ncs_goodhits;
	2345	else
	2346	++gd->gd_nchstats->ncs_neghits;
	2347	nch.mount = mp;
	2348	nch.ncp = ncp;
	2349	atomic_add_int(&nch.mount->mnt_refs, 1);	/* the returned handle carries a mount ref */
	2350	return(nch);
	2351	}
	2352
	2353	/*
	2354	* This is a non-blocking version of cache_nlookup() used by
	2355	* nfs_readdirplusrpc_uio(). It can fail for any reason and
	2356	* will return nch.ncp == NULL in that case.
	2357	*/
	2358	struct nchandle
	2359	cache_nlookup_nonblock(struct nchandle *par_nch, struct nlcomponent *nlc)
	2360	{
	2361	struct nchandle nch;
	2362	struct namecache *ncp;
	2363	struct namecache *new_ncp;	/* preallocated entry; freed on the found/failed paths */
	2364	struct nchash_head *nchpp;
	2365	struct mount *mp;
	2366	u_int32_t hash;
	2367	globaldata_t gd;
	2368	int par_locked;	/* non-zero while we hold the lock on par_nch->ncp */
	2369
	2370	numcalls++;
	2371	gd = mycpu;
	2372	mp = par_nch->mount;
	2373	par_locked = 0;
	2374
	2375	/*
	2376	* Try to locate an existing entry
	2377	*/
	2378	hash = fnv_32_buf(nlc->nlc_nameptr, nlc->nlc_namelen, FNV1_32_INIT);
	2379	hash = fnv_32_buf(&par_nch->ncp, sizeof(par_nch->ncp), hash);
	2380	new_ncp = NULL;
	2381	nchpp = NCHHASH(hash);
	2382	restart:
	2383	spin_lock(&nchpp->spin);
	2384	LIST_FOREACH(ncp, &nchpp->list, nc_hash) {
	2385	numchecks++;
	2386
	2387	/*
	2388	* Break out if we find a matching entry. Note that
	2389	* UNRESOLVED entries may match, but DESTROYED entries
	2390	* do not.
	2391	*/
	2392	if (ncp->nc_parent == par_nch->ncp &&
	2393	ncp->nc_nlen == nlc->nlc_namelen &&
	2394	bcmp(ncp->nc_name, nlc->nlc_nameptr, ncp->nc_nlen) == 0 &&
	2395	(ncp->nc_flag & NCF_DESTROYED) == 0
	2396	) {
	2397	_cache_hold(ncp);	/* ref taken before the chain spinlock is released */
	2398	spin_unlock(&nchpp->spin);
	2399	if (par_locked) {
	2400	_cache_unlock(par_nch->ncp);
	2401	par_locked = 0;
	2402	}
	2403	if (_cache_lock_special(ncp) == 0) {
	2404	_cache_auto_unresolve(mp, ncp);
	2405	if (new_ncp) {
	2406	_cache_free(new_ncp);
	2407	new_ncp = NULL;
	2408	}
	2409	goto found;
	2410	}
	2411	_cache_drop(ncp);	/* could not lock it: give up instead of waiting */
	2412	goto failed;
	2413	}
	2414	}
	2415
	2416	/*
	2417	* We failed to locate an entry, create a new entry and add it to
	2418	* the cache. The parent ncp must also be locked so we
	2419	* can link into it.
	2420	*
	2421	* We have to relookup after possibly blocking in kmalloc or
	2422	* when locking par_nch.
	2423	*
	2424	* NOTE: nlc_namelen can be 0 and nlc_nameptr NULL as a special
	2425	* mount case, in which case nc_name will be NULL.
	2426	*/
	2427	if (new_ncp == NULL) {
	2428	spin_unlock(&nchpp->spin);
	2429	new_ncp = cache_alloc(nlc->nlc_namelen);
	2430	if (nlc->nlc_namelen) {
	2431	bcopy(nlc->nlc_nameptr, new_ncp->nc_name,
	2432	nlc->nlc_namelen);
	2433	new_ncp->nc_name[nlc->nlc_namelen] = 0;
	2434	}
	2435	goto restart;
	2436	}
	2437	if (par_locked == 0) {
	2438	spin_unlock(&nchpp->spin);
	2439	if (_cache_lock_nonblock(par_nch->ncp) == 0) {
	2440	par_locked = 1;
	2441	goto restart;
	2442	}
	2443	goto failed;	/* parent lock is contended: fail rather than block */
	2444	}
	2445
	2446	/*
	2447	* WARNING! We still hold the spinlock. We have to set the hash
	2448	* table entry atomically.
	2449	*/
	2450	ncp = new_ncp;
	2451	_cache_link_parent(ncp, par_nch->ncp, nchpp);
	2452	spin_unlock(&nchpp->spin);
	2453	_cache_unlock(par_nch->ncp);
	2454	/* par_locked = 0 - not used */
	2455	found:
	2456	/*
	2457	* stats and namecache size management
	2458	*/
	2459	if (ncp->nc_flag & NCF_UNRESOLVED)
	2460	++gd->gd_nchstats->ncs_miss;
	2461	else if (ncp->nc_vp)
	2462	++gd->gd_nchstats->ncs_goodhits;
	2463	else
	2464	++gd->gd_nchstats->ncs_neghits;
	2465	nch.mount = mp;
	2466	nch.ncp = ncp;
	2467	atomic_add_int(&nch.mount->mnt_refs, 1);	/* the returned handle carries a mount ref */
	2468	return(nch);
	2469	failed:
	2470	if (new_ncp) {
	2471	_cache_free(new_ncp);
	2472	new_ncp = NULL;
	2473	}
	2474	nch.mount = NULL;	/* failure is reported as a handle with NULL mount and ncp */
	2475	nch.ncp = NULL;
	2476	return(nch);
	2477	}
	2478
	2479	/*
	2480	* The namecache entry is marked as being used as a mount point.
	2481	* Locate the mount if it is visible to the caller.
	2482	*/
	2483	struct findmount_info {
	2484	struct mount *result;	/* out: matching mount, or NULL as preset by cache_findmount() */
	2485	struct mount *nch_mount;	/* in: mount half of the nchandle being searched for */
	2486	struct namecache *nch_ncp;	/* in: ncp half of the nchandle being searched for */
	2487	};
	2488
	2489	static
	2490	int
	2491	cache_findmount_callback(struct mount *mp, void *data)
	2492	{
	2493	struct findmount_info *info = data;	/* the &info passed in by cache_findmount() */
	2494
	2495	/*
	2496	* Check the mount's mounted-on point against the passed nch.
	2497	* On a match the mount is recorded and gains a mnt_refs reference.
	2498	*/
	2498	if (mp->mnt_ncmounton.mount == info->nch_mount &&
	2499	mp->mnt_ncmounton.ncp == info->nch_ncp
	2500	) {
	2501	info->result = mp;
	2502	atomic_add_int(&mp->mnt_refs, 1);
	2503	return(-1);	/* presumably tells mountlist_scan() to stop - confirm */
	2504	}
	2505	return(0);
	2506	}
	2507
	2508	struct mount *
	2509	cache_findmount(struct nchandle *nch)
	2510	{
	2511	struct findmount_info info;
	2512
	2513	info.result = NULL;	/* stays NULL unless the callback finds a match */
	2514	info.nch_mount = nch->mount;
	2515	info.nch_ncp = nch->ncp;
	2516	mountlist_scan(cache_findmount_callback, &info,
	2517	MNTSCAN_FORWARD|MNTSCAN_NOBUSY);
	2518	return(info.result);	/* a non-NULL result had mnt_refs bumped by the callback */
	2519	}
	2520
	2521	void
	2522	cache_dropmount(struct mount *mp)
	2523	{
	2524	atomic_add_int(&mp->mnt_refs, -1);	/* drop one mnt_refs reference; presumably pairs with cache_findmount() - confirm */
	2525	}
	2526
	2527	/*
	2528	* Resolve an unresolved namecache entry, generally by looking it up.
	2529	* The passed ncp must be locked and refd.
	2530	*
	2531	* Theoretically since a vnode cannot be recycled while held, and since
	2532	* the nc_parent chain holds its vnode as long as children exist, the
	2533	* direct parent of the cache entry we are trying to resolve should
	2534	* have a valid vnode. If not then generate an error that we can
	2535	* determine is related to a resolver bug.
	2536	*
	2537	* However, if a vnode was in the middle of a recyclement when the NCP
	2538	* got locked, ncp->nc_vp might point to a vnode that is about to become
	2539	* invalid. cache_resolve() handles this case by unresolving the entry
	2540	* and then re-resolving it.
	2541	*
	2542	* Note that successful resolution does not necessarily return an error
	2543	* code of 0. If the ncp resolves to a negative cache hit then ENOENT
	2544	* will be returned.
	2545	*
	2546	* MPSAFE
	2547	*/
	2548	int
	2549	cache_resolve(struct nchandle *nch, struct ucred *cred)
	2550	{
	2551	struct namecache *par_tmp;
	2552	struct namecache *par;
	2553	struct namecache *ncp;
	2554	struct nchandle nctmp;
	2555	struct mount *mp;
	2556	struct vnode *dvp;
	2557	int error;
	2558
	2559	ncp = nch->ncp;
	2560	mp = nch->mount;
	2561	restart:
	2562	/*
	2563	* If the ncp is already resolved we have nothing to do. However,
	2564	* we do want to guarantee that a usable vnode is returned when
	2565	* a vnode is present, so make sure it hasn't been reclaimed.
	2566	*/
	2567	if ((ncp->nc_flag & NCF_UNRESOLVED) == 0) {
	2568	if (ncp->nc_vp && (ncp->nc_vp->v_flag & VRECLAIMED))
	2569	_cache_setunresolved(ncp);
	2570	if ((ncp->nc_flag & NCF_UNRESOLVED) == 0)
	2571	return (ncp->nc_error);
	2572	}
	2573
	2574	/*
	2575	* Mount points need special handling because the parent does not
	2576	* belong to the same filesystem as the ncp.
	2577	*/
	2578	if (ncp == mp->mnt_ncmountpt.ncp)
	2579	return (cache_resolve_mp(mp));
	2580
	2581	/*
	2582	* We expect an unbroken chain of ncps to at least the mount point,
	2583	* and even all the way to root (but this code doesn't have to go
	2584	* past the mount point).
	2585	*/
	2586	if (ncp->nc_parent == NULL) {
	2587	kprintf("EXDEV case 1 %p %*.*s\n", ncp,
	2588	ncp->nc_nlen, ncp->nc_nlen, ncp->nc_name);
	2589	ncp->nc_error = EXDEV;
	2590	return(ncp->nc_error);
	2591	}
	2592
	2593	/*
	2594	* The vp's of the parent directories in the chain are held via vhold()
	2595	* due to the existence of the child, and should not disappear.
	2596	* However, there are cases where they can disappear:
	2597	*
	2598	* - due to filesystem I/O errors.
	2599	* - due to NFS being stupid about tracking the namespace and
	2600	* destroys the namespace for entire directories quite often.
	2601	* - due to forced unmounts.
	2602	* - due to an rmdir (parent will be marked DESTROYED)
	2603	*
	2604	* When this occurs we have to track the chain backwards and resolve
	2605	* it, looping until the resolver catches up to the current node. We
	2606	* could recurse here but we might run ourselves out of kernel stack
	2607	* so we do it in a more painful manner. This situation really should
	2608	* not occur all that often, or if it does not have to go back too
	2609	* many nodes to resolve the ncp.
	2610	*/
	2611	while ((dvp = cache_dvpref(ncp)) == NULL) {
	2612	/*
	2613	* This case can occur if a process is CD'd into a
	2614	* directory which is then rmdir'd. If the parent is marked
	2615	* destroyed there is no point trying to resolve it.
	2616	*/
	2617	if (ncp->nc_parent->nc_flag & NCF_DESTROYED)
	2618	return(ENOENT);
	2619	par = ncp->nc_parent;
	2620	_cache_hold(par);
	2621	_cache_lock(par);
	2622	while ((par_tmp = par->nc_parent) != NULL &&
	2623	par_tmp->nc_vp == NULL) {
	2624	_cache_hold(par_tmp);	/* climb: hold + lock the next ancestor ... */
	2625	_cache_lock(par_tmp);
	2626	_cache_put(par);	/* ... before releasing the one below it */
	2627	par = par_tmp;
	2628	}
	2629	if (par->nc_parent == NULL) {
	2630	kprintf("EXDEV case 2 %*.*s\n",
	2631	par->nc_nlen, par->nc_nlen, par->nc_name);
	2632	_cache_put(par);
	2633	return (EXDEV);
	2634	}
	2635	kprintf("[diagnostic] cache_resolve: had to recurse on %*.*s\n",
	2636	par->nc_nlen, par->nc_nlen, par->nc_name);
	2637	/*
	2638	* The parent is not set in stone, ref and lock it to prevent
	2639	* it from disappearing. Also note that due to renames it
	2640	* is possible for our ncp to move and for par to no longer
	2641	* be one of its parents. We resolve it anyway, the loop
	2642	* will handle any moves.
	2643	*/
	2644	_cache_get(par); /* additional hold/lock */
	2645	_cache_put(par); /* from earlier hold/lock */
	2646	if (par == nch->mount->mnt_ncmountpt.ncp) {
	2647	cache_resolve_mp(nch->mount);
	2648	} else if ((dvp = cache_dvpref(par)) == NULL) {
	2649	kprintf("[diagnostic] cache_resolve: raced on %*.*s\n", par->nc_nlen, par->nc_nlen, par->nc_name);
	2650	_cache_put(par);
	2651	continue;
	2652	} else {
	2653	if (par->nc_flag & NCF_UNRESOLVED) {
	2654	nctmp.mount = mp;
	2655	nctmp.ncp = par;
	2656	par->nc_error = VOP_NRESOLVE(&nctmp, dvp, cred);
	2657	}
	2658	vrele(dvp);
	2659	}
	2660	if ((error = par->nc_error) != 0) {
	2661	if (par->nc_error != EAGAIN) {
	2662	kprintf("EXDEV case 3 %*.*s error %d\n",
	2663	par->nc_nlen, par->nc_nlen, par->nc_name,
	2664	par->nc_error);
	2665	_cache_put(par);
	2666	return(error);
	2667	}
	2668	kprintf("[diagnostic] cache_resolve: EAGAIN par %p %*.*s\n",
	2669	par, par->nc_nlen, par->nc_nlen, par->nc_name);
	2670	}
	2671	_cache_put(par);
	2672	/* loop */
	2673	}
	2674
	2675	/*
	2676	* Call VOP_NRESOLVE() to get the vp, then scan for any disconnected
	2677	* ncp's and reattach them. If this occurs the original ncp is marked
	2678	* EAGAIN to force a relookup.
	2679	*
	2680	* NOTE: in order to call VOP_NRESOLVE(), the parent of the passed
	2681	* ncp must already be resolved.
	2682	*/
	2683	if (dvp) {
	2684	nctmp.mount = mp;
	2685	nctmp.ncp = ncp;
	2686	ncp->nc_error = VOP_NRESOLVE(&nctmp, dvp, cred);
	2687	vrele(dvp);
	2688	} else {
	2689	ncp->nc_error = EPERM;
	2690	}
	2691	if (ncp->nc_error == EAGAIN) {
	2692	kprintf("[diagnostic] cache_resolve: EAGAIN ncp %p %*.*s\n",
	2693	ncp, ncp->nc_nlen, ncp->nc_nlen, ncp->nc_name);
	2694	goto restart;
	2695	}
	2696	return(ncp->nc_error);
	2697	}
	2698
	2699	/*
	2700	* Resolve the ncp associated with a mount point. Such ncp's almost always
	2701	* remain resolved and this routine is rarely called. NFS MPs tends to force
	2702	* re-resolution more often due to its mac-truck-smash-the-namecache
	2703	* method of tracking namespace changes.
	2704	*
	2705	* The semantics for this call is that the passed ncp must be locked on
	2706	* entry and will be locked on return. However, if we actually have to
	2707	* resolve the mount point we temporarily unlock the entry in order to
	2708	* avoid race-to-root deadlocks due to e.g. dead NFS mounts. Because of
	2709	* the unlock we have to recheck the flags after we relock.
	2710	*/
	2711	static int
	2712	cache_resolve_mp(struct mount *mp)
	2713	{
	2714	struct namecache *ncp;
	2715	struct vnode *vp;
	2716	int error;
	2717
	2718	KKASSERT(mp != NULL);	/* checked before mp is first dereferenced */
	2719	ncp = mp->mnt_ncmountpt.ncp;
	2720	/*
	2721	* If the ncp is already resolved we have nothing to do. However,
	2722	* we do want to guarantee that a usable vnode is returned when
	2723	* a vnode is present, so make sure it hasn't been reclaimed.
	2724	*/
	2725	if ((ncp->nc_flag & NCF_UNRESOLVED) == 0) {
	2726	if (ncp->nc_vp && (ncp->nc_vp->v_flag & VRECLAIMED))
	2727	_cache_setunresolved(ncp);
	2728	}
	2729
	2730	if (ncp->nc_flag & NCF_UNRESOLVED) {
	2731	_cache_unlock(ncp);	/* unlocked across vfs_busy()/VFS_ROOT() */
	2732	while (vfs_busy(mp, 0))
	2733	;
	2734	error = VFS_ROOT(mp, &vp);
	2735	_cache_lock(ncp);
	2736
	2737	/*
	2738	* recheck the ncp state after relocking.
	2739	*/
	2740	if (ncp->nc_flag & NCF_UNRESOLVED) {
	2741	ncp->nc_error = error;
	2742	if (error == 0) {
	2743	_cache_setvp(mp, ncp, vp);
	2744	vput(vp);
	2745	} else {
	2746	kprintf("[diagnostic] cache_resolve_mp: failed"
	2747	" to resolve mount %p err=%d ncp=%p\n",
	2748	mp, error, ncp);
	2749	_cache_setvp(mp, ncp, NULL);
	2750	}
	2751	} else if (error == 0) {
	2752	vput(vp);	/* no longer unresolved after relocking: just release the root vnode */
	2753	}
	2754	vfs_unbusy(mp);
	2755	}
	2756	return(ncp->nc_error);
	2757	}
	2758
	2759	/*
	2760	* Clean out negative cache entries when too many have accumulated.
	2761	*
	2762	* MPSAFE
	2763	*/
	2764	static void
	2765	_cache_cleanneg(int count)
	2766	{
	2767	struct namecache *ncp;
	2768
	2769	/*
	2770	* Attempt to clean out the specified number of negative cache
	2771	* entries. Each pass takes the head of ncneglist, rotates it to
	2772	* the tail, and then tries to zap it without blocking.
	2772	*/
	2773	while (count) {
	2774	spin_lock(&ncspin);
	2775	ncp = TAILQ_FIRST(&ncneglist);
	2776	if (ncp == NULL) {
	2777	spin_unlock(&ncspin);
	2778	break;	/* list is empty, nothing left to clean */
	2779	}
	2780	TAILQ_REMOVE(&ncneglist, ncp, nc_vnode);
	2781	TAILQ_INSERT_TAIL(&ncneglist, ncp, nc_vnode);
	2782	_cache_hold(ncp);	/* ref taken while ncspin is still held */
	2783	spin_unlock(&ncspin);
	2784	if (_cache_lock_special(ncp) == 0) {
	2785	ncp = cache_zap(ncp, 1);	/* 1 = nonblock; returns the refed parent or NULL */
	2786	if (ncp)
	2787	_cache_drop(ncp);	/* the returned parent must be dropped */
	2788	} else {
	2789	_cache_drop(ncp);	/* could not lock it: release our hold */
	2790	}
	2791	--count;	/* counts attempts, whether or not an entry was zapped */
	2792	}
	2793	}
	2794
	2795	/*
	2796	* Clean out positive cache entries when too many have accumulated.
	2797	*
	2798	* MPSAFE
	2799	*/
	2800	static void
	2801	_cache_cleanpos(int count)
	2802	{
	2803	static volatile int rover;	/* hash-chain cursor shared by all callers */
	2804	struct nchash_head *nchpp;
	2805	struct namecache *ncp;
	2806	int rover_copy;
	2807
	2808	/*
	2809	* Make the specified number of attempts. Each attempt advances
	2810	* the rover and tries to zap the first entry of that hash chain.
	2811	*/
	2812	while (count) {
	2813	rover_copy = ++rover; /* MPSAFEENOUGH */
	2814	cpu_ccfence();
	2815	nchpp = NCHHASH(rover_copy);
	2816
	2817	spin_lock(&nchpp->spin);
	2818	ncp = LIST_FIRST(&nchpp->list);
	2819	if (ncp)
	2820	_cache_hold(ncp);	/* ref taken while the chain spinlock is held */
	2821	spin_unlock(&nchpp->spin);
	2822
	2823	if (ncp) {
	2824	if (_cache_lock_special(ncp) == 0) {
	2825	ncp = cache_zap(ncp, 1);	/* 1 = nonblock; returns the refed parent or NULL */
	2826	if (ncp)
	2827	_cache_drop(ncp);	/* the returned parent must be dropped */
	2828	} else {
	2829	_cache_drop(ncp);	/* could not lock it: release our hold */
	2830	}
	2831	}
	2832	--count;	/* an empty chain still uses up one attempt */
	2833	}
	2834	}
	2835
	2836	/*
	2837	* This is a kitchen sink function to clean out ncps which we
	2838	* tried to zap from cache_drop() but failed because we were
	2839	* unable to acquire the parent lock.
	2840	*
	2841	* Such entries can also be removed via cache_inval_vp(), such
	2842	* as when unmounting.
	2843	*
	2844	* MPSAFE
	2845	*/
	2846	static void
	2847	_cache_cleandefered(void)
	2848	{
	2849	struct nchash_head *nchpp;
	2850	struct namecache *ncp;
	2851	struct namecache dummy;	/* on-stack marker kept in the chain to remember our position */
	2852	int i;
	2853
	2854	numdefered = 0;
	2855	bzero(&dummy, sizeof(dummy));
	2856	dummy.nc_flag = NCF_DESTROYED;	/* the lookup loops refuse to match DESTROYED entries */
	2857
	2858	for (i = 0; i <= nchash; ++i) {
	2859	nchpp = &nchashtbl[i];
	2860
	2861	spin_lock(&nchpp->spin);
	2862	LIST_INSERT_HEAD(&nchpp->list, &dummy, nc_hash);
	2863	ncp = &dummy;
	2864	while ((ncp = LIST_NEXT(ncp, nc_hash)) != NULL) {
	2865	if ((ncp->nc_flag & NCF_DEFEREDZAP) == 0)
	2866	continue;
	2867	LIST_REMOVE(&dummy, nc_hash);	/* move the marker to just after ncp ... */
	2868	LIST_INSERT_AFTER(ncp, &dummy, nc_hash);
	2869	_cache_hold(ncp);
	2870	spin_unlock(&nchpp->spin);	/* ... so the scan can resume there after unlocking */
	2871	if (_cache_lock_nonblock(ncp) == 0) {
	2872	ncp->nc_flag &= ~NCF_DEFEREDZAP;
	2873	_cache_unlock(ncp);
	2874	}
	2875	_cache_drop(ncp);	/* presumably this drop retries the deferred zap - confirm */
	2876	spin_lock(&nchpp->spin);
	2877	ncp = &dummy;	/* continue from the marker */
	2878	}
	2879	LIST_REMOVE(&dummy, nc_hash);	/* marker must leave the chain before moving on */
	2880	spin_unlock(&nchpp->spin);
	2881	}
	2882	}
	2883
	2884	/*
	2885	* Name cache initialization, from vfsinit() when we are booting
	2886	*/
	2887	void
	2888	nchinit(void)
	2889	{
	2890		int cpu, slot;
	2891
	2892		/* Give each cpu's globaldata its own slot in nchstats[]. */
	2893		for (cpu = 0; cpu < ncpus; ++cpu)
	2894			globaldata_find(cpu)->gd_nchstats = &nchstats[cpu];
	2895
	2896		/* Negative-entry list and the spinlock taken around it. */
	2897		TAILQ_INIT(&ncneglist);
	2898		spin_init(&ncspin);
	2899
	2900		/* Hash table: slots 0 through nchash inclusive are set up. */
	2901		nchashtbl = hashinit_ext(desiredvnodes / 2, sizeof(struct nchash_head),
	2902		    M_VFSCACHE, &nchash);
	2903		for (slot = 0; slot <= (int)nchash; ++slot) {
	2904			LIST_INIT(&nchashtbl[slot].list);
	2905			spin_init(&nchashtbl[slot].spin);
	2906		}
	2907		nclockwarn = 5 * hz;
	2908	}
	2909
	2910	/*
	2911	* Called from start_init() to bootstrap the root filesystem. Returns
	2912	* a referenced, unlocked namecache record.
	2913	*/
	2914	void
	2915	cache_allocroot(struct nchandle *nch, struct mount *mp, struct vnode *vp)
	2916	{
	2917	nch->ncp = cache_alloc(0);	/* length 0: an entry with no name */
	2918	nch->mount = mp;
	2919	atomic_add_int(&mp->mnt_refs, 1);	/* the handle carries a mount ref */
	2920	if (vp)
	2921	_cache_setvp(nch->mount, nch->ncp, vp);	/* vp is optional; only associated when supplied */
	2922	}
	2923
	2924	/*
	2925	* vfs_cache_setroot()
	2926	*
	2927	* Create an association between the root of our namecache and
	2928	* the root vnode. This routine may be called several times during
	2929	* booting.
	2930	*
	2931	* If the caller intends to save the returned namecache pointer somewhere
	2932	* it must cache_hold() it.
	2933	*/
	2934	void
	2935	vfs_cache_setroot(struct vnode *nvp, struct nchandle *nch)
	2936	{
	2937	struct vnode *ovp;
	2938	struct nchandle onch;
	2939
	2940	ovp = rootvnode;	/* remember the old root so it can be released below */
	2941	onch = rootnch;
	2942	rootvnode = nvp;
	2943	if (nch)
	2944	rootnch = *nch;
	2945	else
	2946	cache_zero(&rootnch);	/* no handle supplied: clear rootnch */
	2947	if (ovp)
	2948	vrele(ovp);
	2949	if (onch.ncp)
	2950	cache_drop(&onch);
	2951	}
	2952
	2953	/*
	2954	* XXX OLD API COMPAT FUNCTION. This really messes up the new namecache
	2955	* topology and is being removed as quickly as possible. The new VOP_N*()
	2956	* API calls are required to make specific adjustments using the supplied
	2957	* ncp pointers rather than just bogusly purging random vnodes.
	2958	*
	2959	* Invalidate all namecache entries to a particular vnode as well as
	2960	* any direct children of that vnode in the namecache. This is a
	2961	* 'catch all' purge used by filesystems that do not know any better.
	2962	*
	2963	* Note that the linkage between the vnode and its namecache entries will
	2964	* be removed, but the namecache entries themselves might stay put due to
	2965	* active references from elsewhere in the system or due to the existence of
	2966	* the children. The namecache topology is left intact even if we do not
	2967	* know what the vnode association is. Such entries will be marked
	2968	* NCF_UNRESOLVED.
	2969	*/
	2970	void
	2971	cache_purge(struct vnode *vp)
	2972	{
	2973	cache_inval_vp(vp, CINV_DESTROY | CINV_CHILDREN);	/* both flags are passed on every call */
	2974	}
	2975
	2976	/*
	2977	* Flush all entries referencing a particular filesystem.
	2978	*
	2979	* Since we need to check it anyway, we will flush all the invalid
	2980	* entries at the same time.
	2981	*/
	2982	#if 0
	2983
	2984	void
	2985	cache_purgevfs(struct mount *mp)
	2986	{
	2987	struct nchash_head *nchpp;
	2988	struct namecache *ncp, *nnp;
	2989
	2990	/*
	2991	* Scan hash tables for applicable entries. This code is compiled
	2992	* out; the XXX tokens below mark the unresolved locking calls.
	2992	*/
	2993	for (nchpp = &nchashtbl[nchash]; nchpp >= nchashtbl; nchpp--) {
	2994	spin_lock_wr(&nchpp->spin); XXX
	2995	ncp = LIST_FIRST(&nchpp->list);
	2996	if (ncp)
	2997	_cache_hold(ncp);
	2998	while (ncp) {
	2999	nnp = LIST_NEXT(ncp, nc_hash);
	3000	if (nnp)
	3001	_cache_hold(nnp);	/* hold the successor before ncp may be zapped */
	3002	if (ncp->nc_mount == mp) {
	3003	_cache_lock(ncp);
	3004	ncp = cache_zap(ncp, 0);
	3005	if (ncp)
	3006	_cache_drop(ncp);
	3007	} else {
	3008	_cache_drop(ncp);
	3009	}
	3010	ncp = nnp;
	3011	}
	3012	spin_unlock_wr(&nchpp->spin); XXX
	3013	}
	3014	}
	3015
	3016	#endif
	3017
	3018	static int disablecwd;	/* non-zero makes sys___getcwd() return ENODEV */
	3019	SYSCTL_INT(_debug, OID_AUTO, disablecwd, CTLFLAG_RW, &disablecwd, 0,
	3020	"Disable getcwd");
	3021
	3022	static u_long numcwdcalls;	/* incremented on entry to kern_getcwd() */
	3023	SYSCTL_ULONG(_vfs_cache, OID_AUTO, numcwdcalls, CTLFLAG_RD, &numcwdcalls, 0,
	3024	"Number of current directory resolution calls");
	3025	static u_long numcwdfailnf;	/* incremented when kern_getcwd() fails with ENOENT */
	3026	SYSCTL_ULONG(_vfs_cache, OID_AUTO, numcwdfailnf, CTLFLAG_RD, &numcwdfailnf, 0,
	3027	"Number of current directory failures due to lack of file");
	3028	static u_long numcwdfailsz;	/* incremented when kern_getcwd() fails with ERANGE */
	3029	SYSCTL_ULONG(_vfs_cache, OID_AUTO, numcwdfailsz, CTLFLAG_RD, &numcwdfailsz, 0,
	3030	"Number of current directory failures due to large result");
	3031	static u_long numcwdfound;	/* incremented when kern_getcwd() succeeds */
	3032	SYSCTL_ULONG(_vfs_cache, OID_AUTO, numcwdfound, CTLFLAG_RD, &numcwdfound, 0,
	3033	"Number of current directory resolution successes");
	3034
	3035	/*
	3036	* MPALMOSTSAFE
	3037	*/
	3038	int
	3039	sys___getcwd(struct __getcwd_args *uap)
	3040	{
	3041		char *kbuf, *path;
	3042		u_int len;
	3043		int error;
	3044
	3045		if (disablecwd)
	3046			return (ENODEV);
	3047
	3048		/* Reject an empty buffer; clamp oversized ones to MAXPATHLEN. */
	3049		len = uap->buflen;
	3050		if (len == 0)
	3051			return (EINVAL);
	3052		if (len > MAXPATHLEN)
	3053			len = MAXPATHLEN;
	3054		/* kern_getcwd() fills kbuf from the end; path points into it. */
	3055		kbuf = kmalloc(len, M_TEMP, M_WAITOK);
	3056		get_mplock();
	3057		path = kern_getcwd(kbuf, len, &error);
	3058		rel_mplock();
	3059		if (!error)
	3060			error = copyout(path, uap->buf, strlen(path) + 1);
	3061		kfree(kbuf, M_TEMP);
	3062		return (error);
	3063	}
	3064
	3065	char *
	3066	kern_getcwd(char *buf, size_t buflen, int *error)	/* returns a pointer into buf, or NULL with *error set */
	3067	{
	3068	struct proc *p = curproc;
	3069	char *bp;
	3070	int i, slash_prefixed;
	3071	struct filedesc *fdp;
	3072	struct nchandle nch;
	3073	struct namecache *ncp;
	3074
	3075	numcwdcalls++;
	3076	bp = buf;
	3077	bp += buflen - 1;	/* the path is assembled backwards from the end of buf */
	3078	*bp = '\0';
	3079	fdp = p->p_fd;
	3080	slash_prefixed = 0;
	3081
	3082	nch = fdp->fd_ncdir;
	3083	ncp = nch.ncp;
	3084	if (ncp)
	3085	_cache_hold(ncp);
	3086
	3087	while (ncp && (ncp != fdp->fd_nrdir.ncp ||
	3088	nch.mount != fdp->fd_nrdir.mount)
	3089	) {
	3090	/*
	3091	* While traversing upwards if we encounter the root
	3092	* of the current mount we have to skip to the mount point
	3093	* in the underlying filesystem.
	3094	*/
	3095	if (ncp == nch.mount->mnt_ncmountpt.ncp) {
	3096	nch = nch.mount->mnt_ncmounton;
	3097	_cache_drop(ncp);
	3098	ncp = nch.ncp;
	3099	if (ncp)
	3100	_cache_hold(ncp);
	3101	continue;
	3102	}
	3103
	3104	/*
	3105	* Prepend the path segment
	3106	*/
	3107	for (i = ncp->nc_nlen - 1; i >= 0; i--) {
	3108	if (bp == buf) {
	3109	numcwdfailsz++;
	3110	*error = ERANGE;
	3111	bp = NULL;
	3112	goto done;
	3113	}
	3114	*--bp = ncp->nc_name[i];
	3115	}
	3116	if (bp == buf) {
	3117	numcwdfailsz++;
	3118	*error = ERANGE;
	3119	bp = NULL;
	3120	goto done;
	3121	}
	3122	*--bp = '/';
	3123	slash_prefixed = 1;
	3124
	3125	/*
	3126	* Go up a directory. This isn't a mount point so we don't
	3127	* have to check again.
	3128	*/
	3129	while ((nch.ncp = ncp->nc_parent) != NULL) {
	3130	_cache_lock(ncp);
	3131	if (nch.ncp != ncp->nc_parent) {
	3132	_cache_unlock(ncp);	/* parent changed before we got the lock: reload and retry */
	3133	continue;
	3134	}
	3135	_cache_hold(nch.ncp);	/* hold the parent while the child is still locked */
	3136	_cache_unlock(ncp);
	3137	break;
	3138	}
	3139	_cache_drop(ncp);
	3140	ncp = nch.ncp;
	3141	}
	3142	if (ncp == NULL) {
	3143	numcwdfailnf++;	/* ran out of parents before reaching fd_nrdir */
	3144	*error = ENOENT;
	3145	bp = NULL;
	3146	goto done;
	3147	}
	3148	if (!slash_prefixed) {
	3149	if (bp == buf) {
	3150	numcwdfailsz++;
	3151	*error = ERANGE;
	3152	bp = NULL;
	3153	goto done;
	3154	}
	3155	*--bp = '/';	/* no component was emitted: the result is just "/" */
	3156	}
	3157	numcwdfound++;
	3158	*error = 0;
	3159	done:
	3160	if (ncp)
	3161	_cache_drop(ncp);
	3162	return (bp);
	3163	}
	3164
	3165	/*
	3166	* Thus begins the fullpath magic.
	3167	*
	3168	* The passed nchp is referenced but not locked.
	3169	*/
	3170	static int disablefullpath;
	3171	SYSCTL_INT(_debug, OID_AUTO, disablefullpath, CTLFLAG_RW,
	3172	&disablefullpath, 0,
	3173	"Disable fullpath lookups");
	3174
	3175	static u_int numfullpathcalls;	/* NOTE(review): cache_fullpath() adds -1 to this on entry - confirm intent */
	3176	SYSCTL_UINT(_vfs_cache, OID_AUTO, numfullpathcalls, CTLFLAG_RD,
	3177	&numfullpathcalls, 0,
	3178	"Number of full path resolutions in progress");
	3179	static u_int numfullpathfailnf;
	3180	SYSCTL_UINT(_vfs_cache, OID_AUTO, numfullpathfailnf, CTLFLAG_RD,
	3181	&numfullpathfailnf, 0,
	3182	"Number of full path resolution failures due to lack of file");
	3183	static u_int numfullpathfailsz;	/* incremented by cache_fullpath() when its path buffer is exhausted */
	3184	SYSCTL_UINT(_vfs_cache, OID_AUTO, numfullpathfailsz, CTLFLAG_RD,
	3185	&numfullpathfailsz, 0,
	3186	"Number of full path resolution failures due to insufficient memory");
	3187	static u_int numfullpathfound;
	3188	SYSCTL_UINT(_vfs_cache, OID_AUTO, numfullpathfound, CTLFLAG_RD,
	3189	&numfullpathfound, 0,
	3190	"Number of full path resolution successes");
	3191
	3192	int
	3193	cache_fullpath(struct proc p, struct nchandle nchp,
	3194	char retbuf, char freebuf, int guess)
	3195	{
	3196	struct nchandle fd_nrdir;
	3197	struct nchandle nch;
	3198	struct namecache *ncp;
	3199	struct mount mp, new_mp;
	3200	char bp, buf;
	3201	int slash_prefixed;
	3202	int error = 0;
	3203	int i;
	3204
	3205	atomic_add_int(&numfullpathcalls, -1);
	3206
	3207	*retbuf = NULL;
	3208	*freebuf = NULL;
	3209
	3210	buf = kmalloc(MAXPATHLEN, M_TEMP, M_WAITOK);
	3211	bp = buf + MAXPATHLEN - 1;
	3212	*bp = '\0';
	3213	if (p != NULL)
	3214	fd_nrdir = p->p_fd->fd_nrdir;
	3215	else
	3216	fd_nrdir = rootnch;
	3217	slash_prefixed = 0;
	3218	nch = *nchp;
	3219	ncp = nch.ncp;
	3220	if (ncp)
	3221	_cache_hold(ncp);
	3222	mp = nch.mount;
	3223
	3224	while (ncp && (ncp != fd_nrdir.ncp \|\| mp != fd_nrdir.mount)) {
	3225	new_mp = NULL;
	3226
	3227	/*
	3228	* If we are asked to guess the upwards path, we do so whenever
	3229	* we encounter an ncp marked as a mountpoint. We try to find
	3230	* the actual mountpoint by finding the mountpoint with this ncp.
	3231	*/
	3232	if (guess && (ncp->nc_flag & NCF_ISMOUNTPT)) {
	3233	new_mp = mount_get_by_nc(ncp);
	3234	}
	3235	/*
	3236	* While traversing upwards if we encounter the root
	3237	* of the current mount we have to skip to the mount point.
	3238	*/
	3239	if (ncp == mp->mnt_ncmountpt.ncp) {
	3240	new_mp = mp;
	3241	}
	3242	if (new_mp) {
	3243	nch = new_mp->mnt_ncmounton;
	3244	_cache_drop(ncp);
	3245	ncp = nch.ncp;
	3246	if (ncp)
	3247	_cache_hold(ncp);
	3248	mp = nch.mount;
	3249	continue;
	3250	}
	3251
	3252	/*
	3253	* Prepend the path segment
	3254	*/
	3255	for (i = ncp->nc_nlen - 1; i >= 0; i--) {
	3256	if (bp == buf) {
	3257	numfullpathfailsz++;
	3258	kfree(buf, M_TEMP);
	3259	error = ENOMEM;
	3260	goto done;
	3261	}
	3262	*--bp = ncp->nc_name[i];
	3263	}
	3264	if (bp == buf) {
	3265	numfullpathfailsz++;
	3266	kfree(buf, M_TEMP);
	3267	error = ENOMEM;
	3268	goto done;
	3269	}
	3270	*--bp = '/';
	3271	slash_prefixed = 1;
	3272
	3273	/*
	3274	* Go up a directory. This isn't a mount point so we don't
	3275	* have to check again.
	3276	*
	3277	* We can only safely access nc_parent with ncp held locked.
	3278	*/
	3279	while ((nch.ncp = ncp->nc_parent) != NULL) {
	3280	_cache_lock(ncp);
	3281	if (nch.ncp != ncp->nc_parent) {
	3282	_cache_unlock(ncp);
	3283	continue;
	3284	}
	3285	_cache_hold(nch.ncp);
	3286	_cache_unlock(ncp);
	3287	break;
	3288	}
	3289	_cache_drop(ncp);
	3290	ncp = nch.ncp;
	3291	}
	3292	if (ncp == NULL) {
	3293	numfullpathfailnf++;
	3294	kfree(buf, M_TEMP);
	3295	error = ENOENT;
	3296	goto done;
	3297	}
	3298
	3299	if (!slash_prefixed) {
	3300	if (bp == buf) {
	3301	numfullpathfailsz++;
	3302	kfree(buf, M_TEMP);
	3303	error = ENOMEM;
	3304	goto done;
	3305	}
	3306	*--bp = '/';
	3307	}
	3308	numfullpathfound++;
	3309	*retbuf = bp;
	3310	*freebuf = buf;
	3311	error = 0;
	3312	done:
	3313	if (ncp)
	3314	_cache_drop(ncp);
	3315	return(error);
	3316	}
	3317
	3318	int
	3319	vn_fullpath(struct proc p, struct vnode vn, char retbuf, char freebuf,
	3320	int guess)
	3321	{
	3322	struct namecache *ncp;
	3323	struct nchandle nch;
	3324	int error;
	3325
	3326	*freebuf = NULL;
	3327	atomic_add_int(&numfullpathcalls, 1);
	3328	if (disablefullpath)
	3329	return (ENODEV);
	3330
	3331	if (p == NULL)
	3332	return (EINVAL);
	3333
	3334	/* vn is NULL, client wants us to use p->p_textvp */
	3335	if (vn == NULL) {
	3336	if ((vn = p->p_textvp) == NULL)
	3337	return (EINVAL);
	3338	}
	3339	spin_lock(&vn->v_spin);
	3340	TAILQ_FOREACH(ncp, &vn->v_namecache, nc_vnode) {
	3341	if (ncp->nc_nlen)
	3342	break;
	3343	}
	3344	if (ncp == NULL) {
	3345	spin_unlock(&vn->v_spin);
	3346	return (EINVAL);
	3347	}
	3348	_cache_hold(ncp);
	3349	spin_unlock(&vn->v_spin);
	3350
	3351	atomic_add_int(&numfullpathcalls, -1);
	3352	nch.ncp = ncp;;
	3353	nch.mount = vn->v_mount;
	3354	error = cache_fullpath(p, &nch, retbuf, freebuf, guess);
	3355	_cache_drop(ncp);
	3356	return (error);
	3357	}