kernel - Add many sysctl definitions, sysv, vfs, nfs, etc.
[dragonfly.git] / sys / kern / vfs_cache.c
/*
 * Copyright (c) 2003,2004,2009 The DragonFly Project.  All rights reserved.
 *
 * This code is derived from software contributed to The DragonFly Project
 * by Matthew Dillon <dillon@backplane.com>
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 * 3. Neither the name of The DragonFly Project nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific, prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * Copyright (c) 1989, 1993, 1995
 *	The Regents of the University of California.  All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * Poul-Henning Kamp of the FreeBSD Project.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/sysctl.h>
#include <sys/mount.h>
#include <sys/vnode.h>
#include <sys/malloc.h>
#include <sys/sysproto.h>
#include <sys/spinlock.h>
#include <sys/proc.h>
#include <sys/namei.h>
#include <sys/nlookup.h>
#include <sys/filedesc.h>
#include <sys/fnv_hash.h>
#include <sys/globaldata.h>
#include <sys/kern_syscall.h>
#include <sys/dirent.h>
#include <ddb/ddb.h>

#include <sys/sysref2.h>
#include <sys/spinlock2.h>
#include <sys/mplock2.h>

#define MAX_RECURSION_DEPTH	64

/*
 * Random lookups in the cache are accomplished with a hash table using
 * a hash key of (nc_src_vp, name).  Each hash chain has its own spin lock.
 *
 * Negative entries may exist and correspond to resolved namecache
 * structures where nc_vp is NULL.  In a negative entry, NCF_WHITEOUT
 * will be set if the entry corresponds to a whited-out directory entry
 * (versus simply not finding the entry at all).  ncneglist is locked
 * with a global spinlock (ncspin).
 *
 * MPSAFE RULES:
 *
 * (1) A ncp must be referenced before it can be locked.
 *
 * (2) A ncp must be locked in order to modify it.
 *
 * (3) ncp locks are always ordered child -> parent.  That may seem
 *     backwards but forward scans use the hash table and thus can hold
 *     the parent unlocked when traversing downward.
 *
 *     This allows insert/rename/delete/dot-dot and other operations
 *     to use ncp->nc_parent links.
 *
 *     This also prevents a locked up e.g. NFS node from creating a
 *     chain reaction all the way back to the root vnode / namecache.
 *
 * (4) parent linkages require both the parent and child to be locked.
 */

/*
 * Structures associated with name caching.
 */
#define NCHHASH(hash)		(&nchashtbl[(hash) & nchash])
#define MINNEG			1024
#define MINPOS			1024

MALLOC_DEFINE(M_VFSCACHE, "vfscache", "VFS name cache entries");

LIST_HEAD(nchash_list, namecache);

struct nchash_head {
        struct nchash_list list;
        struct spinlock    spin;
};

static struct nchash_head	*nchashtbl;
static struct namecache_list	ncneglist;
static struct spinlock		ncspin;

/*
 * ncvp_debug - debug cache_fromvp().  This is used by the NFS server
 * to create the namecache infrastructure leading to a dangling vnode.
 *
 * 0	Only errors are reported
 * 1	Successes are reported
 * 2	Successes + the whole directory scan is reported
 * 3	Force the directory scan code to run as if the parent vnode did
 *	not have a namecache record, even if it does have one.
 */
static int	ncvp_debug;
SYSCTL_INT(_debug, OID_AUTO, ncvp_debug, CTLFLAG_RW, &ncvp_debug, 0,
    "Namecache debug level (0-3)");

static u_long	nchash;			/* size of hash table */
SYSCTL_ULONG(_debug, OID_AUTO, nchash, CTLFLAG_RD, &nchash, 0,
    "Size of namecache hash table");

static int	ncnegfactor = 16;	/* ratio of negative entries */
SYSCTL_INT(_debug, OID_AUTO, ncnegfactor, CTLFLAG_RW, &ncnegfactor, 0,
    "Ratio of namecache negative entries");

static int	nclockwarn;		/* warn on locked entries in ticks */
SYSCTL_INT(_debug, OID_AUTO, nclockwarn, CTLFLAG_RW, &nclockwarn, 0,
    "Warn on locked namecache entries in ticks");

static int	numdefered;		/* number of deferred zap operations */
SYSCTL_INT(_debug, OID_AUTO, numdefered, CTLFLAG_RD, &numdefered, 0,
    "Number of deferred zap operations");

static int	ncposlimit;		/* limit on positive entries */
SYSCTL_INT(_debug, OID_AUTO, ncposlimit, CTLFLAG_RW, &ncposlimit, 0,
    "Limit on the number of positive namecache entries");

SYSCTL_INT(_debug, OID_AUTO, vnsize, CTLFLAG_RD, 0, sizeof(struct vnode),
    "sizeof(struct vnode)");
SYSCTL_INT(_debug, OID_AUTO, ncsize, CTLFLAG_RD, 0, sizeof(struct namecache),
    "sizeof(struct namecache)");

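/*
 * Example (illustrative only, not part of this file): the knobs above
 * live under the "debug" sysctl tree and can be inspected or tuned from
 * userland, e.g.:
 *
 *	sysctl debug.nchash debug.ncnegfactor
 *	sysctl debug.nclockwarn=500
 */
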
static int cache_resolve_mp(struct mount *mp);
static struct vnode *cache_dvpref(struct namecache *ncp);
static void _cache_lock(struct namecache *ncp);
static void _cache_setunresolved(struct namecache *ncp);
static void _cache_cleanneg(int count);
static void _cache_cleanpos(int count);
static void _cache_cleandefered(void);

/*
 * The new name cache statistics
 */
SYSCTL_NODE(_vfs, OID_AUTO, cache, CTLFLAG_RW, 0, "Name cache statistics");
static int numneg;
SYSCTL_ULONG(_vfs_cache, OID_AUTO, numneg, CTLFLAG_RD, &numneg, 0,
    "Number of negative namecache entries");
static int numcache;
SYSCTL_ULONG(_vfs_cache, OID_AUTO, numcache, CTLFLAG_RD, &numcache, 0,
    "Number of namecache entries");
static u_long numcalls;
SYSCTL_ULONG(_vfs_cache, OID_AUTO, numcalls, CTLFLAG_RD, &numcalls, 0,
    "Number of namecache lookups");
static u_long numchecks;
SYSCTL_ULONG(_vfs_cache, OID_AUTO, numchecks, CTLFLAG_RD, &numchecks, 0,
    "Number of checked entries in namecache lookups");

struct nchstats nchstats[SMP_MAXCPU];
/*
 * Export VFS cache effectiveness statistics to user-land.
 *
 * The statistics are left for aggregation to user-land so
 * neat things can be achieved, like observing per-CPU cache
 * distribution.
 */
static int
sysctl_nchstats(SYSCTL_HANDLER_ARGS)
{
        struct globaldata *gd;
        int i, error;

        error = 0;
        for (i = 0; i < ncpus; ++i) {
                gd = globaldata_find(i);
                if ((error = SYSCTL_OUT(req, (void *)&(*gd->gd_nchstats),
                        sizeof(struct nchstats))))
                        break;
        }

        return (error);
}
SYSCTL_PROC(_vfs_cache, OID_AUTO, nchstats, CTLTYPE_OPAQUE|CTLFLAG_RD,
  0, 0, sysctl_nchstats, "S,nchstats", "VFS cache effectiveness statistics");
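
/*
 * Example (hypothetical userland consumer, not part of this kernel file):
 * the handler above emits one struct nchstats per cpu, so a reader
 * aggregates them, roughly:
 *
 *	struct nchstats stats[SMP_MAXCPU];
 *	size_t len = sizeof(stats);
 *	u_long total = 0;
 *
 *	if (sysctlbyname("vfs.cache.nchstats", stats, &len, NULL, 0) == 0) {
 *		int n = len / sizeof(struct nchstats);
 *		for (int i = 0; i < n; ++i)
 *			total += stats[i].ncs_goodhits;
 *	}
 *
 * The field name ncs_goodhits is from the traditional nchstats layout
 * and is shown for illustration only.
 */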

static struct namecache *cache_zap(struct namecache *ncp, int nonblock);

/*
 * Namespace locking.  The caller must already hold a reference to the
 * namecache structure in order to lock/unlock it.  This function prevents
 * the namespace from being created or destroyed by accessors other than
 * the lock holder.
 *
 * Note that holding a locked namecache structure prevents other threads
 * from making namespace changes (e.g. deleting or creating), prevents
 * vnode association state changes by other threads, and prevents the
 * namecache entry from being resolved or unresolved by other threads.
 *
 * The lock owner has full authority to associate/disassociate vnodes
 * and resolve/unresolve the locked ncp.
 *
 * The primary lock field is nc_exlocks.  nc_locktd is set after the
 * fact (when locking) or cleared prior to unlocking.
 *
 * WARNING!  Holding a locked ncp will prevent a vnode from being destroyed
 *	     or recycled, but it does NOT help you if the vnode had already
 *	     initiated a recyclement.  If this is important, use cache_get()
 *	     rather than cache_lock() (and deal with the differences in the
 *	     way the refs counter is handled).  Or, alternatively, make an
 *	     unconditional call to cache_validate() or cache_resolve()
 *	     after cache_lock() returns.
 *
 * MPSAFE
 */
static
void
_cache_lock(struct namecache *ncp)
{
        thread_t td;
        int didwarn;
        int error;
        u_int count;

        KKASSERT(ncp->nc_refs != 0);
        didwarn = 0;
        td = curthread;

        for (;;) {
                count = ncp->nc_exlocks;

                if (count == 0) {
                        if (atomic_cmpset_int(&ncp->nc_exlocks, 0, 1)) {
                                /*
                                 * The vp associated with a locked ncp must
                                 * be held to prevent it from being recycled.
                                 *
                                 * WARNING!  If VRECLAIMED is set the vnode
                                 * could already be in the middle of a recycle.
                                 * Callers must use cache_vref() or
                                 * cache_vget() on the locked ncp to
                                 * validate the vp or set the cache entry
                                 * to unresolved.
                                 *
                                 * NOTE! vhold() is allowed if we hold a
                                 *	 lock on the ncp (which we do).
                                 */
                                ncp->nc_locktd = td;
                                if (ncp->nc_vp)
                                        vhold(ncp->nc_vp);	/* MPSAFE */
                                break;
                        }
                        /* cmpset failed */
                        continue;
                }
                if (ncp->nc_locktd == td) {
                        if (atomic_cmpset_int(&ncp->nc_exlocks, count,
                                              count + 1)) {
                                break;
                        }
                        /* cmpset failed */
                        continue;
                }
                tsleep_interlock(ncp, 0);
                if (atomic_cmpset_int(&ncp->nc_exlocks, count,
                                      count | NC_EXLOCK_REQ) == 0) {
                        /* cmpset failed */
                        continue;
                }
                error = tsleep(ncp, PINTERLOCKED, "clock", nclockwarn);
                if (error == EWOULDBLOCK) {
                        if (didwarn == 0) {
                                didwarn = ticks;
                                kprintf("[diagnostic] cache_lock: blocked "
                                        "on %p",
                                        ncp);
                                kprintf(" \"%*.*s\"\n",
                                        ncp->nc_nlen, ncp->nc_nlen,
                                        ncp->nc_name);
                        }
                }
        }
        if (didwarn) {
                kprintf("[diagnostic] cache_lock: unblocked %*.*s after "
                        "%d secs\n",
                        ncp->nc_nlen, ncp->nc_nlen, ncp->nc_name,
                        (int)(ticks - didwarn) / hz);
        }
}
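
/*
 * Illustrative sketch (not part of this file): per the WARNING above, a
 * plain lock does not guarantee a usable vnode.  The canonical pattern,
 * which _cache_get() below encapsulates, is:
 *
 *	_cache_hold(ncp);		(rule 1: ref before lock)
 *	_cache_lock(ncp);
 *	if (ncp->nc_vp && (ncp->nc_vp->v_flag & VRECLAIMED))
 *		_cache_setunresolved(ncp);	(force re-resolution)
 */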

/*
 * NOTE: nc_refs may be zero if the ncp is interlocked by circumstance,
 *	 such as the case where one of its children is locked.
 *
 * MPSAFE
 */
static
int
_cache_lock_nonblock(struct namecache *ncp)
{
        thread_t td;
        u_int count;

        td = curthread;

        for (;;) {
                count = ncp->nc_exlocks;

                if (count == 0) {
                        if (atomic_cmpset_int(&ncp->nc_exlocks, 0, 1)) {
                                /*
                                 * The vp associated with a locked ncp must
                                 * be held to prevent it from being recycled.
                                 *
                                 * WARNING!  If VRECLAIMED is set the vnode
                                 * could already be in the middle of a recycle.
                                 * Callers must use cache_vref() or
                                 * cache_vget() on the locked ncp to
                                 * validate the vp or set the cache entry
                                 * to unresolved.
                                 *
                                 * NOTE! vhold() is allowed if we hold a
                                 *	 lock on the ncp (which we do).
                                 */
                                ncp->nc_locktd = td;
                                if (ncp->nc_vp)
                                        vhold(ncp->nc_vp);	/* MPSAFE */
                                break;
                        }
                        /* cmpset failed */
                        continue;
                }
                if (ncp->nc_locktd == td) {
                        if (atomic_cmpset_int(&ncp->nc_exlocks, count,
                                              count + 1)) {
                                break;
                        }
                        /* cmpset failed */
                        continue;
                }
                return(EWOULDBLOCK);
        }
        return(0);
}

/*
 * Helper function
 *
 * NOTE: nc_refs can be 0 (degenerate case during _cache_drop).
 *
 *	 nc_locktd must be NULLed out prior to nc_exlocks getting cleared.
 *
 * MPSAFE
 */
static
void
_cache_unlock(struct namecache *ncp)
{
        thread_t td __debugvar = curthread;
        u_int count;

        KKASSERT(ncp->nc_refs >= 0);
        KKASSERT(ncp->nc_exlocks > 0);
        KKASSERT(ncp->nc_locktd == td);

        count = ncp->nc_exlocks;
        if ((count & ~NC_EXLOCK_REQ) == 1) {
                ncp->nc_locktd = NULL;
                if (ncp->nc_vp)
                        vdrop(ncp->nc_vp);
        }
        for (;;) {
                if ((count & ~NC_EXLOCK_REQ) == 1) {
                        if (atomic_cmpset_int(&ncp->nc_exlocks, count, 0)) {
                                if (count & NC_EXLOCK_REQ)
                                        wakeup(ncp);
                                break;
                        }
                } else {
                        if (atomic_cmpset_int(&ncp->nc_exlocks, count,
                                              count - 1)) {
                                break;
                        }
                }
                count = ncp->nc_exlocks;
        }
}


/*
 * cache_hold() and cache_drop() prevent the premature deletion of a
 * namecache entry but do not prevent operations (such as zapping) on
 * that namecache entry.
 *
 * This routine may only be called from outside this source module if
 * nc_refs is already at least 1.
 *
 * This is a rare case where callers are allowed to hold a spinlock,
 * so we can't acquire one ourselves.
 *
 * MPSAFE
 */
static __inline
struct namecache *
_cache_hold(struct namecache *ncp)
{
        atomic_add_int(&ncp->nc_refs, 1);
        return(ncp);
}

/*
 * Drop a cache entry, taking care to deal with races.
 *
 * For potential 1->0 transitions we must hold the ncp lock to safely
 * test its flags.  An unresolved entry with no children must be zapped
 * to avoid leaks.
 *
 * The call to cache_zap() itself will handle all remaining races and
 * will decrement the ncp's refs regardless.  If we are resolved or
 * have children nc_refs can safely be dropped to 0 without having to
 * zap the entry.
 *
 * NOTE: cache_zap() will re-check nc_refs and nc_list in a MPSAFE fashion.
 *
 * NOTE: cache_zap() may return a non-NULL referenced parent which must
 *	 be dropped in a loop.
 *
 * MPSAFE
 */
static __inline
void
_cache_drop(struct namecache *ncp)
{
        int refs;

        while (ncp) {
                KKASSERT(ncp->nc_refs > 0);
                refs = ncp->nc_refs;

                if (refs == 1) {
                        if (_cache_lock_nonblock(ncp) == 0) {
                                ncp->nc_flag &= ~NCF_DEFEREDZAP;
                                if ((ncp->nc_flag & NCF_UNRESOLVED) &&
                                    TAILQ_EMPTY(&ncp->nc_list)) {
                                        ncp = cache_zap(ncp, 1);
                                        continue;
                                }
                                if (atomic_cmpset_int(&ncp->nc_refs, 1, 0)) {
                                        _cache_unlock(ncp);
                                        break;
                                }
                                _cache_unlock(ncp);
                        }
                } else {
                        if (atomic_cmpset_int(&ncp->nc_refs, refs, refs - 1))
                                break;
                }
                cpu_pause();
        }
}
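
/*
 * Illustrative sketch (not part of this file): the hold/drop pair only
 * governs lifetime, not content.  A caller that merely wants to keep an
 * entry from being freed while peeking at it can do:
 *
 *	struct namecache *ncp = _cache_hold(some_ncp);
 *	(read-only inspection, no modifications)
 *	_cache_drop(ncp);
 *
 * Any modification additionally requires the lock, per rule (2) above.
 */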

/*
 * Link a new namecache entry to its parent and to the hash table.  Be
 * careful to avoid races if vhold() blocks in the future.
 *
 * Both ncp and par must be referenced and locked.
 *
 * NOTE: The hash table spinlock is likely held during this call, we
 *	 can't do anything fancy.
 *
 * MPSAFE
 */
static void
_cache_link_parent(struct namecache *ncp, struct namecache *par,
                   struct nchash_head *nchpp)
{
        KKASSERT(ncp->nc_parent == NULL);
        ncp->nc_parent = par;
        ncp->nc_head = nchpp;

        /*
         * Set inheritance flags.  Note that the parent flags may be
         * stale due to getattr potentially not having been run yet
         * (it gets run during nlookup()'s).
         */
        ncp->nc_flag &= ~(NCF_SF_PNOCACHE | NCF_UF_PCACHE);
        if (par->nc_flag & (NCF_SF_NOCACHE | NCF_SF_PNOCACHE))
                ncp->nc_flag |= NCF_SF_PNOCACHE;
        if (par->nc_flag & (NCF_UF_CACHE | NCF_UF_PCACHE))
                ncp->nc_flag |= NCF_UF_PCACHE;

        LIST_INSERT_HEAD(&nchpp->list, ncp, nc_hash);

        if (TAILQ_EMPTY(&par->nc_list)) {
                TAILQ_INSERT_HEAD(&par->nc_list, ncp, nc_entry);
                /*
                 * Any vp associated with an ncp which has children must
                 * be held to prevent it from being recycled.
                 */
                if (par->nc_vp)
                        vhold(par->nc_vp);
        } else {
                TAILQ_INSERT_HEAD(&par->nc_list, ncp, nc_entry);
        }
}

/*
 * Remove the parent and hash associations from a namecache structure.
 * If this is the last child of the parent the cache_drop(par) will
 * attempt to recursively zap the parent.
 *
 * ncp must be locked.  This routine will acquire a temporary lock on
 * the parent as well as the appropriate hash chain.
 *
 * MPSAFE
 */
static void
_cache_unlink_parent(struct namecache *ncp)
{
        struct namecache *par;
        struct vnode *dropvp;

        if ((par = ncp->nc_parent) != NULL) {
                KKASSERT(ncp->nc_parent == par);
                _cache_hold(par);
                _cache_lock(par);
                spin_lock(&ncp->nc_head->spin);
                LIST_REMOVE(ncp, nc_hash);
                TAILQ_REMOVE(&par->nc_list, ncp, nc_entry);
                dropvp = NULL;
                if (par->nc_vp && TAILQ_EMPTY(&par->nc_list))
                        dropvp = par->nc_vp;
                spin_unlock(&ncp->nc_head->spin);
                ncp->nc_parent = NULL;
                ncp->nc_head = NULL;
                _cache_unlock(par);
                _cache_drop(par);

                /*
                 * We can only safely vdrop with no spinlocks held.
                 */
                if (dropvp)
                        vdrop(dropvp);
        }
}

/*
 * Allocate a new namecache structure.  Most of the code does not require
 * zero-termination of the string but it makes vop_compat_ncreate() easier.
 *
 * MPSAFE
 */
static struct namecache *
cache_alloc(int nlen)
{
        struct namecache *ncp;

        ncp = kmalloc(sizeof(*ncp), M_VFSCACHE, M_WAITOK|M_ZERO);
        if (nlen)
                ncp->nc_name = kmalloc(nlen + 1, M_VFSCACHE, M_WAITOK);
        ncp->nc_nlen = nlen;
        ncp->nc_flag = NCF_UNRESOLVED;
        ncp->nc_error = ENOTCONN;	/* needs to be resolved */
        ncp->nc_refs = 1;

        TAILQ_INIT(&ncp->nc_list);
        _cache_lock(ncp);
        return(ncp);
}

/*
 * Can only be called for the case where the ncp has never been
 * associated with anything (so no spinlocks are needed).
 *
 * MPSAFE
 */
static void
_cache_free(struct namecache *ncp)
{
        KKASSERT(ncp->nc_refs == 1 && ncp->nc_exlocks == 1);
        if (ncp->nc_name)
                kfree(ncp->nc_name, M_VFSCACHE);
        kfree(ncp, M_VFSCACHE);
}

/*
 * MPSAFE
 */
void
cache_zero(struct nchandle *nch)
{
        nch->ncp = NULL;
        nch->mount = NULL;
}

/*
 * Ref and deref a namecache structure.
 *
 * The caller must specify a stable ncp pointer, typically meaning the
 * ncp is already referenced but this can also occur indirectly through
 * e.g. holding a lock on a direct child.
 *
 * WARNING: Caller may hold an unrelated read spinlock, which means we can't
 *	    use read spinlocks here.
 *
 * MPSAFE if nch is
 */
struct nchandle *
cache_hold(struct nchandle *nch)
{
        _cache_hold(nch->ncp);
        atomic_add_int(&nch->mount->mnt_refs, 1);
        return(nch);
}

/*
 * Create a copy of a namecache handle for an already-referenced
 * entry.
 *
 * MPSAFE if nch is
 */
void
cache_copy(struct nchandle *nch, struct nchandle *target)
{
        *target = *nch;
        if (target->ncp)
                _cache_hold(target->ncp);
        atomic_add_int(&nch->mount->mnt_refs, 1);
}

/*
 * MPSAFE if nch is
 */
void
cache_changemount(struct nchandle *nch, struct mount *mp)
{
        atomic_add_int(&nch->mount->mnt_refs, -1);
        nch->mount = mp;
        atomic_add_int(&nch->mount->mnt_refs, 1);
}

/*
 * MPSAFE
 */
void
cache_drop(struct nchandle *nch)
{
        atomic_add_int(&nch->mount->mnt_refs, -1);
        _cache_drop(nch->ncp);
        nch->ncp = NULL;
        nch->mount = NULL;
}

/*
 * MPSAFE
 */
void
cache_lock(struct nchandle *nch)
{
        _cache_lock(nch->ncp);
}

/*
 * Relock nch1 given an unlocked nch1 and a locked nch2.  The caller
 * is responsible for checking both for validity on return as they
 * may have become invalid.
 *
 * We have to deal with potential deadlocks here, just ping pong
 * the lock until we get it (we will always block somewhere when
 * looping so this is not cpu-intensive).
 *
 * which = 0	nch1 not locked, nch2 is locked
 * which = 1	nch1 is locked, nch2 is not locked
 */
void
cache_relock(struct nchandle *nch1, struct ucred *cred1,
             struct nchandle *nch2, struct ucred *cred2)
{
        int which;

        which = 0;

        for (;;) {
                if (which == 0) {
                        if (cache_lock_nonblock(nch1) == 0) {
                                cache_resolve(nch1, cred1);
                                break;
                        }
                        cache_unlock(nch2);
                        cache_lock(nch1);
                        cache_resolve(nch1, cred1);
                        which = 1;
                } else {
                        if (cache_lock_nonblock(nch2) == 0) {
                                cache_resolve(nch2, cred2);
                                break;
                        }
                        cache_unlock(nch1);
                        cache_lock(nch2);
                        cache_resolve(nch2, cred2);
                        which = 0;
                }
        }
}

/*
 * MPSAFE
 */
int
cache_lock_nonblock(struct nchandle *nch)
{
        return(_cache_lock_nonblock(nch->ncp));
}


/*
 * MPSAFE
 */
void
cache_unlock(struct nchandle *nch)
{
        _cache_unlock(nch->ncp);
}

/*
 * ref-and-lock, unlock-and-deref functions.
 *
 * This function is primarily used by nlookup.  Even though cache_lock
 * holds the vnode, it is possible that the vnode may have already
 * initiated a recyclement.
 *
 * We want cache_get() to return a definitively usable vnode or a
 * definitively unresolved ncp.
 *
 * MPSAFE
 */
static
struct namecache *
_cache_get(struct namecache *ncp)
{
        _cache_hold(ncp);
        _cache_lock(ncp);
        if (ncp->nc_vp && (ncp->nc_vp->v_flag & VRECLAIMED))
                _cache_setunresolved(ncp);
        return(ncp);
}

/*
 * This is a special form of _cache_lock() which only succeeds if
 * it can get a pristine, non-recursive lock.  The caller must have
 * already ref'd the ncp.
 *
 * On success the ncp will be locked, on failure it will not.  The
 * ref count does not change either way.
 *
 * We want _cache_lock_special() (on success) to return a definitively
 * usable vnode or a definitively unresolved ncp.
 *
 * MPSAFE
 */
static int
_cache_lock_special(struct namecache *ncp)
{
        if (_cache_lock_nonblock(ncp) == 0) {
                if ((ncp->nc_exlocks & ~NC_EXLOCK_REQ) == 1) {
                        if (ncp->nc_vp && (ncp->nc_vp->v_flag & VRECLAIMED))
                                _cache_setunresolved(ncp);
                        return(0);
                }
                _cache_unlock(ncp);
        }
        return(EWOULDBLOCK);
}


/*
 * NOTE: The same nchandle can be passed for both arguments.
 *
 * MPSAFE
 */
void
cache_get(struct nchandle *nch, struct nchandle *target)
{
        KKASSERT(nch->ncp->nc_refs > 0);
        target->mount = nch->mount;
        target->ncp = _cache_get(nch->ncp);
        atomic_add_int(&target->mount->mnt_refs, 1);
}

/*
 * MPSAFE
 */
static __inline
void
_cache_put(struct namecache *ncp)
{
        _cache_unlock(ncp);
        _cache_drop(ncp);
}

/*
 * MPSAFE
 */
void
cache_put(struct nchandle *nch)
{
        atomic_add_int(&nch->mount->mnt_refs, -1);
        _cache_put(nch->ncp);
        nch->ncp = NULL;
        nch->mount = NULL;
}
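
/*
 * Illustrative pairing (not part of this file): cache_get() returns a
 * referenced, locked, validated entry and cache_put() undoes both side
 * effects, so the canonical bracketing is:
 *
 *	struct nchandle nch2;
 *
 *	cache_get(&nch, &nch2);
 *	(operate on nch2.ncp)
 *	cache_put(&nch2);
 */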

/*
 * Resolve an unresolved ncp by associating a vnode with it.  If the
 * vnode is NULL, a negative cache entry is created.
 *
 * The ncp should be locked on entry and will remain locked on return.
 *
 * MPSAFE
 */
static
void
_cache_setvp(struct mount *mp, struct namecache *ncp, struct vnode *vp)
{
        KKASSERT(ncp->nc_flag & NCF_UNRESOLVED);

        if (vp != NULL) {
                /*
                 * Any vp associated with an ncp which has children must
                 * be held.  Any vp associated with a locked ncp must be held.
                 */
                if (!TAILQ_EMPTY(&ncp->nc_list))
                        vhold(vp);
                spin_lock(&vp->v_spinlock);
                ncp->nc_vp = vp;
                TAILQ_INSERT_HEAD(&vp->v_namecache, ncp, nc_vnode);
                spin_unlock(&vp->v_spinlock);
                if (ncp->nc_exlocks)
                        vhold(vp);

                /*
                 * Set auxiliary flags
                 */
                switch(vp->v_type) {
                case VDIR:
                        ncp->nc_flag |= NCF_ISDIR;
                        break;
                case VLNK:
                        ncp->nc_flag |= NCF_ISSYMLINK;
                        /* XXX cache the contents of the symlink */
                        break;
                default:
                        break;
                }
                atomic_add_int(&numcache, 1);
                ncp->nc_error = 0;
        } else {
                /*
                 * When creating a negative cache hit we set the
                 * namecache_gen.  A later resolve will clean out the
                 * negative cache hit if the mount point's namecache_gen
                 * has changed.  Used by devfs, could also be used by
                 * other remote FSs.
                 */
                ncp->nc_vp = NULL;
                spin_lock(&ncspin);
                TAILQ_INSERT_TAIL(&ncneglist, ncp, nc_vnode);
                ++numneg;
                spin_unlock(&ncspin);
                ncp->nc_error = ENOENT;
                if (mp)
                        ncp->nc_namecache_gen = mp->mnt_namecache_gen;
        }
        ncp->nc_flag &= ~(NCF_UNRESOLVED | NCF_DEFEREDZAP);
}

/*
 * MPSAFE
 */
void
cache_setvp(struct nchandle *nch, struct vnode *vp)
{
        _cache_setvp(nch->mount, nch->ncp, vp);
}

/*
 * MPSAFE
 */
void
cache_settimeout(struct nchandle *nch, int nticks)
{
        struct namecache *ncp = nch->ncp;

        if ((ncp->nc_timeout = ticks + nticks) == 0)
                ncp->nc_timeout = 1;
}
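
/*
 * Illustrative sketch (hypothetical, not part of this file): a network
 * filesystem that wants resolved entries re-validated after roughly
 * three seconds could call, from its resolve path:
 *
 *	cache_settimeout(nch, 3 * hz);
 *
 * The entry is then kicked back to the unresolved state by
 * _cache_auto_unresolve() once the ticks deadline passes (see below).
 */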

/*
 * Disassociate the vnode or negative-cache association and mark a
 * namecache entry as unresolved again.  Note that the ncp is still
 * left in the hash table and still linked to its parent.
 *
 * The ncp should be locked and refd on entry and will remain locked and refd
 * on return.
 *
 * This routine is normally never called on a directory containing children.
 * However, NFS often does just that in its rename() code as a cop-out to
 * avoid complex namespace operations.  This disconnects a directory vnode
 * from its namecache and can cause the OLDAPI and NEWAPI to get out of
 * sync.
 *
 * MPSAFE
 */
static
void
_cache_setunresolved(struct namecache *ncp)
{
        struct vnode *vp;

        if ((ncp->nc_flag & NCF_UNRESOLVED) == 0) {
                ncp->nc_flag |= NCF_UNRESOLVED;
                ncp->nc_timeout = 0;
                ncp->nc_error = ENOTCONN;
                if ((vp = ncp->nc_vp) != NULL) {
                        atomic_add_int(&numcache, -1);
                        spin_lock(&vp->v_spinlock);
                        ncp->nc_vp = NULL;
                        TAILQ_REMOVE(&vp->v_namecache, ncp, nc_vnode);
                        spin_unlock(&vp->v_spinlock);

                        /*
                         * Any vp associated with an ncp with children is
                         * held by that ncp.  Any vp associated with a locked
                         * ncp is held by that ncp.  These conditions must be
                         * undone when the vp is cleared out from the ncp.
                         */
                        if (!TAILQ_EMPTY(&ncp->nc_list))
                                vdrop(vp);
                        if (ncp->nc_exlocks)
                                vdrop(vp);
                } else {
                        spin_lock(&ncspin);
                        TAILQ_REMOVE(&ncneglist, ncp, nc_vnode);
                        --numneg;
                        spin_unlock(&ncspin);
                }
                ncp->nc_flag &= ~(NCF_WHITEOUT|NCF_ISDIR|NCF_ISSYMLINK);
        }
}

/*
 * The cache_nresolve() code calls this function to automatically
 * set a resolved cache element to unresolved if it has timed out
 * or if it is a negative cache hit and the mount point namecache_gen
 * has changed.
 *
 * MPSAFE
 */
static __inline void
_cache_auto_unresolve(struct mount *mp, struct namecache *ncp)
{
        /*
         * Already in an unresolved state, nothing to do.
         */
        if (ncp->nc_flag & NCF_UNRESOLVED)
                return;

        /*
         * Try to zap entries that have timed out.  We have
         * to be careful here because locked leafs may depend
         * on the vnode remaining intact in a parent, so only
         * do this under very specific conditions.
         */
        if (ncp->nc_timeout && (int)(ncp->nc_timeout - ticks) < 0 &&
            TAILQ_EMPTY(&ncp->nc_list)) {
                _cache_setunresolved(ncp);
                return;
        }

        /*
         * If a resolved negative cache hit is invalid due to
         * the mount's namecache generation being bumped, zap it.
         */
        if (ncp->nc_vp == NULL &&
            ncp->nc_namecache_gen != mp->mnt_namecache_gen) {
                _cache_setunresolved(ncp);
                return;
        }
}

/*
 * MPSAFE
 */
void
cache_setunresolved(struct nchandle *nch)
{
        _cache_setunresolved(nch->ncp);
}

/*
 * Determine if we can clear NCF_ISMOUNTPT by scanning the mountlist
 * looking for matches.  This flag tells the lookup code when it must
 * check for a mount linkage and also prevents the directories in question
 * from being deleted or renamed.
 *
 * MPSAFE
 */
static
int
cache_clrmountpt_callback(struct mount *mp, void *data)
{
        struct nchandle *nch = data;

        if (mp->mnt_ncmounton.ncp == nch->ncp)
                return(1);
        if (mp->mnt_ncmountpt.ncp == nch->ncp)
                return(1);
        return(0);
}

/*
 * MPSAFE
 */
void
cache_clrmountpt(struct nchandle *nch)
{
        int count;

        count = mountlist_scan(cache_clrmountpt_callback, nch,
                               MNTSCAN_FORWARD|MNTSCAN_NOBUSY);
        if (count == 0)
                nch->ncp->nc_flag &= ~NCF_ISMOUNTPT;
}

/*
 * Invalidate portions of the namecache topology given a starting entry.
 * The passed ncp is set to an unresolved state and:
 *
 * The passed ncp must be referenced and locked.  The routine may unlock
 * and relock ncp several times, and will recheck the children and loop
 * to catch races.  When done the passed ncp will be returned with the
 * reference and lock intact.
 *
 * CINV_DESTROY		- Set a flag in the passed ncp entry indicating
 *			  that the physical underlying nodes have been
 *			  destroyed... as in deleted.  For example, when
 *			  a directory is removed.  This will cause record
 *			  lookups on the name to no longer be able to find
 *			  the record and tells the resolver to return failure
 *			  rather than trying to resolve through the parent.
 *
 *			  The topology itself, including ncp->nc_name,
 *			  remains intact.
 *
 *			  This only applies to the passed ncp, if CINV_CHILDREN
 *			  is specified the children are not flagged.
 *
 * CINV_CHILDREN	- Set all children (recursively) to an unresolved
 *			  state as well.
 *
 *			  Note that this will also have the side effect of
 *			  cleaning out any unreferenced nodes in the topology
 *			  from the leaves up as the recursion backs out.
 *
 * Note that the topology for any referenced nodes remains intact, but
 * the nodes will be marked as having been destroyed and will be set
 * to an unresolved state.
 *
 * It is possible for cache_inval() to race a cache_resolve(), meaning that
 * the namecache entry may not actually be invalidated on return if it was
 * revalidated while recursing down into its children.  This code guarantees
 * that the node(s) will go through an invalidation cycle, but does not
 * guarantee that they will remain in an invalidated state.
 *
 * Returns non-zero if a revalidation was detected during the invalidation
 * recursion, zero otherwise.  Note that since only the original ncp is
 * locked the revalidation ultimately can only indicate that the original ncp
 * *MIGHT* not have been re-resolved.
 *
 * DEEP RECURSION HANDLING - If a recursive invalidation recurses deeply we
 * have to avoid blowing out the kernel stack.  We do this by saving the
 * deep namecache node and aborting the recursion, then re-recursing at that
 * node using a depth-first algorithm in order to allow multiple deep
 * recursions to chain through each other, then we restart the invalidation
 * from scratch.
 *
 * MPSAFE
 */

struct cinvtrack {
        struct namecache *resume_ncp;
        int depth;
};

static int _cache_inval_internal(struct namecache *, int, struct cinvtrack *);

static
int
_cache_inval(struct namecache *ncp, int flags)
{
        struct cinvtrack track;
        struct namecache *ncp2;
        int r;

        track.depth = 0;
        track.resume_ncp = NULL;

        for (;;) {
                r = _cache_inval_internal(ncp, flags, &track);
                if (track.resume_ncp == NULL)
                        break;
                kprintf("Warning: deep namecache recursion at %s\n",
                        ncp->nc_name);
                _cache_unlock(ncp);
                while ((ncp2 = track.resume_ncp) != NULL) {
                        track.resume_ncp = NULL;
                        _cache_lock(ncp2);
                        _cache_inval_internal(ncp2, flags & ~CINV_DESTROY,
                                              &track);
                        _cache_put(ncp2);
                }
                _cache_lock(ncp);
        }
        return(r);
}

int
cache_inval(struct nchandle *nch, int flags)
{
        return(_cache_inval(nch->ncp, flags));
}
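
/*
 * Illustrative usage (hypothetical, not part of this file): an
 * rmdir-style operation that has physically destroyed a directory
 * would typically invalidate the whole subtree and flag the node:
 *
 *	cache_inval(&nd->nl_nch, CINV_DESTROY | CINV_CHILDREN);
 */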

/*
 * Helper for _cache_inval().  The passed ncp is refd and locked and
 * remains that way on return, but may be unlocked/relocked multiple
 * times by the routine.
 */
static int
_cache_inval_internal(struct namecache *ncp, int flags, struct cinvtrack *track)
{
        struct namecache *kid;
        struct namecache *nextkid;
        int rcnt = 0;

        KKASSERT(ncp->nc_exlocks);

        _cache_setunresolved(ncp);
        if (flags & CINV_DESTROY)
                ncp->nc_flag |= NCF_DESTROYED;
        if ((flags & CINV_CHILDREN) &&
            (kid = TAILQ_FIRST(&ncp->nc_list)) != NULL
        ) {
                _cache_hold(kid);
                if (++track->depth > MAX_RECURSION_DEPTH) {
                        track->resume_ncp = ncp;
                        _cache_hold(ncp);
                        ++rcnt;
                }
                _cache_unlock(ncp);
                while (kid) {
                        if (track->resume_ncp) {
                                _cache_drop(kid);
                                break;
                        }
                        if ((nextkid = TAILQ_NEXT(kid, nc_entry)) != NULL)
                                _cache_hold(nextkid);
                        if ((kid->nc_flag & NCF_UNRESOLVED) == 0 ||
                            TAILQ_FIRST(&kid->nc_list)
                        ) {
                                _cache_lock(kid);
                                rcnt += _cache_inval_internal(kid, flags & ~CINV_DESTROY, track);
                                _cache_unlock(kid);
                        }
                        _cache_drop(kid);
                        kid = nextkid;
                }
                --track->depth;
                _cache_lock(ncp);
        }

        /*
         * Someone could have gotten in there while ncp was unlocked,
         * retry if so.
         */
        if ((ncp->nc_flag & NCF_UNRESOLVED) == 0)
                ++rcnt;
        return (rcnt);
}

/*
 * Invalidate a vnode's namecache associations.  To avoid races against
 * the resolver we do not invalidate a node which we previously invalidated
 * but which was then re-resolved while we were in the invalidation loop.
 *
 * Returns non-zero if any namecache entries remain after the invalidation
 * loop completed.
 *
 * NOTE: Unlike the namecache topology which guarantees that ncp's will not
 *	 be ripped out of the topology while held, the vnode's v_namecache
 *	 list has no such restriction.  NCP's can be ripped out of the list
 *	 at virtually any time if not locked, even if held.
 *
 *	 In addition, the v_namecache list itself must be locked via
 *	 the vnode's spinlock.
 *
 * MPSAFE
 */
int
cache_inval_vp(struct vnode *vp, int flags)
{
        struct namecache *ncp;
        struct namecache *next;

restart:
        spin_lock(&vp->v_spinlock);
        ncp = TAILQ_FIRST(&vp->v_namecache);
        if (ncp)
                _cache_hold(ncp);
        while (ncp) {
                /* loop entered with ncp held and vp spin-locked */
                if ((next = TAILQ_NEXT(ncp, nc_vnode)) != NULL)
                        _cache_hold(next);
                spin_unlock(&vp->v_spinlock);
                _cache_lock(ncp);
                if (ncp->nc_vp != vp) {
                        kprintf("Warning: cache_inval_vp: race-A detected on "
                                "%s\n", ncp->nc_name);
                        _cache_put(ncp);
                        if (next)
                                _cache_drop(next);
                        goto restart;
                }
                _cache_inval(ncp, flags);
                _cache_put(ncp);		/* also releases reference */
                ncp = next;
                spin_lock(&vp->v_spinlock);
                if (ncp && ncp->nc_vp != vp) {
                        spin_unlock(&vp->v_spinlock);
                        kprintf("Warning: cache_inval_vp: race-B detected on "
                                "%s\n", ncp->nc_name);
                        _cache_drop(ncp);
                        goto restart;
                }
        }
        spin_unlock(&vp->v_spinlock);
        return(TAILQ_FIRST(&vp->v_namecache) != NULL);
}

/*
 * This routine is used instead of the normal cache_inval_vp() when we
 * are trying to recycle otherwise good vnodes.
 *
 * Return 0 on success, non-zero if not all namecache records could be
 * disassociated from the vnode (for various reasons).
 *
 * MPSAFE
 */
int
cache_inval_vp_nonblock(struct vnode *vp)
{
        struct namecache *ncp;
        struct namecache *next;

        spin_lock(&vp->v_spinlock);
        ncp = TAILQ_FIRST(&vp->v_namecache);
        if (ncp)
                _cache_hold(ncp);
        while (ncp) {
                /* loop entered with ncp held */
                if ((next = TAILQ_NEXT(ncp, nc_vnode)) != NULL)
                        _cache_hold(next);
                spin_unlock(&vp->v_spinlock);
                if (_cache_lock_nonblock(ncp)) {
                        _cache_drop(ncp);
                        if (next)
                                _cache_drop(next);
                        goto done;
                }
                if (ncp->nc_vp != vp) {
                        kprintf("Warning: cache_inval_vp: race-A detected on "
                                "%s\n", ncp->nc_name);
                        _cache_put(ncp);
                        if (next)
                                _cache_drop(next);
                        goto done;
                }
                _cache_inval(ncp, 0);
                _cache_put(ncp);		/* also releases reference */
                ncp = next;
                spin_lock(&vp->v_spinlock);
                if (ncp && ncp->nc_vp != vp) {
                        spin_unlock(&vp->v_spinlock);
                        kprintf("Warning: cache_inval_vp: race-B detected on "
                                "%s\n", ncp->nc_name);
                        _cache_drop(ncp);
                        goto done;
                }
        }
        spin_unlock(&vp->v_spinlock);
done:
        return(TAILQ_FIRST(&vp->v_namecache) != NULL);
}

/*
 * The source ncp has been renamed to the target ncp.  Both fncp and tncp
 * must be locked.  The target ncp is destroyed (as a normal rename-over
 * would destroy the target file or directory).
 *
 * Because there may be references to the source ncp we cannot copy its
 * contents to the target.  Instead the source ncp is relinked as the target
 * and the target ncp is removed from the namecache topology.
 *
 * MPSAFE
 */
void
cache_rename(struct nchandle *fnch, struct nchandle *tnch)
{
        struct namecache *fncp = fnch->ncp;
        struct namecache *tncp = tnch->ncp;
        struct namecache *tncp_par;
        struct nchash_head *nchpp;
        u_int32_t hash;
        char *oname;

        /*
         * Rename fncp (unlink)
         */
        _cache_unlink_parent(fncp);
        oname = fncp->nc_name;
        fncp->nc_name = tncp->nc_name;
        fncp->nc_nlen = tncp->nc_nlen;
        tncp_par = tncp->nc_parent;
        _cache_hold(tncp_par);
        _cache_lock(tncp_par);

        /*
         * Rename fncp (relink)
         */
        hash = fnv_32_buf(fncp->nc_name, fncp->nc_nlen, FNV1_32_INIT);
        hash = fnv_32_buf(&tncp_par, sizeof(tncp_par), hash);
        nchpp = NCHHASH(hash);

        spin_lock(&nchpp->spin);
        _cache_link_parent(fncp, tncp_par, nchpp);
        spin_unlock(&nchpp->spin);

        _cache_put(tncp_par);

        /*
         * Get rid of the overwritten tncp (unlink)
         */
        _cache_setunresolved(tncp);
        _cache_unlink_parent(tncp);
        tncp->nc_name = NULL;
        tncp->nc_nlen = 0;

        if (oname)
                kfree(oname, M_VFSCACHE);
}

/*
 * vget the vnode associated with the namecache entry.  Resolve the namecache
 * entry if necessary.  The passed ncp must be referenced and locked.
 *
 * lk_type may be LK_SHARED, LK_EXCLUSIVE.  A ref'd, possibly locked
 * (depending on the passed lk_type) vnode will be returned in *vpp with an
 * error of 0, or NULL will be returned in *vpp with a non-0 error code.  The
 * most typical error is ENOENT, meaning that the ncp represents a negative
 * cache hit and there is no vnode to retrieve, but other errors can occur
 * too.
 *
 * The vget() can race a reclaim.  If this occurs we re-resolve the
 * namecache entry.
 *
 * There are numerous places in the kernel where vget() is called on a
 * vnode while one or more of its namecache entries is locked.  Releasing
 * a vnode never deadlocks against locked namecache entries (the vnode
 * will not get recycled while referenced ncp's exist).  This means we
 * can safely acquire the vnode.  In fact, we MUST NOT release the ncp
 * lock when acquiring the vp lock or we might cause a deadlock.
 *
 * MPSAFE
 */
int
cache_vget(struct nchandle *nch, struct ucred *cred,
           int lk_type, struct vnode **vpp)
{
        struct namecache *ncp;
        struct vnode *vp;
        int error;

        ncp = nch->ncp;
        KKASSERT(ncp->nc_locktd == curthread);
again:
        vp = NULL;
        if (ncp->nc_flag & NCF_UNRESOLVED)
                error = cache_resolve(nch, cred);
        else
                error = 0;

        if (error == 0 && (vp = ncp->nc_vp) != NULL) {
                error = vget(vp, lk_type);
                if (error) {
                        /*
                         * VRECLAIM race
                         */
                        if (error == ENOENT) {
                                kprintf("Warning: vnode reclaim race detected "
                                        "in cache_vget on %p (%s)\n",
                                        vp, ncp->nc_name);
                                _cache_setunresolved(ncp);
                                goto again;
                        }

                        /*
                         * Not a reclaim race, some other error.
                         */
                        KKASSERT(ncp->nc_vp == vp);
                        vp = NULL;
                } else {
                        KKASSERT(ncp->nc_vp == vp);
                        KKASSERT((vp->v_flag & VRECLAIMED) == 0);
                }
        }
        if (error == 0 && vp == NULL)
                error = ENOENT;
        *vpp = vp;
        return(error);
}
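
/*
 * Illustrative usage (hypothetical, not part of this file): given a
 * referenced and locked nchandle, a caller wanting an exclusively
 * locked vnode would do:
 *
 *	struct vnode *vp;
 *	int error;
 *
 *	error = cache_vget(&nch, cred, LK_EXCLUSIVE, &vp);
 *	if (error == 0) {
 *		(use vp)
 *		vput(vp);
 *	}
 *
 * ENOENT indicates a negative hit or some other resolution failure.
 */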

int
cache_vref(struct nchandle *nch, struct ucred *cred, struct vnode **vpp)
{
        struct namecache *ncp;
        struct vnode *vp;
        int error;

        ncp = nch->ncp;
        KKASSERT(ncp->nc_locktd == curthread);
again:
        vp = NULL;
        if (ncp->nc_flag & NCF_UNRESOLVED)
                error = cache_resolve(nch, cred);
        else
                error = 0;

        if (error == 0 && (vp = ncp->nc_vp) != NULL) {
                error = vget(vp, LK_SHARED);
                if (error) {
                        /*
                         * VRECLAIM race
                         */
                        if (error == ENOENT) {
                                kprintf("Warning: vnode reclaim race detected "
                                        "in cache_vget on %p (%s)\n",
                                        vp, ncp->nc_name);
                                _cache_setunresolved(ncp);
                                goto again;
                        }

                        /*
                         * Not a reclaim race, some other error.
                         */
                        KKASSERT(ncp->nc_vp == vp);
                        vp = NULL;
                } else {
                        KKASSERT(ncp->nc_vp == vp);
                        KKASSERT((vp->v_flag & VRECLAIMED) == 0);
                        /* caller does not want a lock */
                        vn_unlock(vp);
                }
        }
        if (error == 0 && vp == NULL)
                error = ENOENT;
        *vpp = vp;
        return(error);
}

/*
 * Return a referenced vnode representing the parent directory of
 * ncp.
 *
 * Because the caller has locked the ncp it should not be possible for
 * the parent ncp to go away.  However, the parent can unresolve its
 * dvp at any time so we must be able to acquire a lock on the parent
 * to safely access nc_vp.
 *
 * We have to leave par unlocked when vget()ing dvp to avoid a deadlock,
 * so use vhold()/vdrop() while holding the lock to prevent dvp from
 * getting destroyed.
 *
 * MPSAFE - Note vhold() is allowed when dvp has 0 refs if we hold a
 *	    lock on the ncp in question.
 */
static struct vnode *
cache_dvpref(struct namecache *ncp)
{
        struct namecache *par;
        struct vnode *dvp;

        dvp = NULL;
        if ((par = ncp->nc_parent) != NULL) {
                _cache_hold(par);
                _cache_lock(par);
                if ((par->nc_flag & NCF_UNRESOLVED) == 0) {
                        if ((dvp = par->nc_vp) != NULL)
                                vhold(dvp);
                }
                _cache_unlock(par);
                if (dvp) {
                        if (vget(dvp, LK_SHARED) == 0) {
                                vn_unlock(dvp);
                                vdrop(dvp);
                                /* return refd, unlocked dvp */
                        } else {
                                vdrop(dvp);
                                dvp = NULL;
                        }
                }
                _cache_drop(par);
        }
        return(dvp);
}

/*
 * Convert a directory vnode to a namecache record without any other
 * knowledge of the topology.  This ONLY works with directory vnodes and
 * is ONLY used by the NFS server.  dvp must be refd but unlocked, and the
 * returned ncp (if not NULL) will be held and unlocked.
 *
 * If 'makeit' is 0 and dvp has no existing namecache record, NULL is returned.
 * If 'makeit' is 1 we attempt to track down and create the namecache topology
 * for dvp.  This will fail only if the directory has been deleted out from
 * under the caller.
 *
 * Callers must always check for a NULL return no matter the value of 'makeit'.
 *
 * To avoid underflowing the kernel stack each recursive call increments
 * the makeit variable.
 */

static int cache_inefficient_scan(struct nchandle *nch, struct ucred *cred,
                                  struct vnode *dvp, char *fakename);
static int cache_fromdvp_try(struct vnode *dvp, struct ucred *cred,
                             struct vnode **saved_dvp);

28623bf9
MD
1594int
1595cache_fromdvp(struct vnode *dvp, struct ucred *cred, int makeit,
1596 struct nchandle *nch)
fad57d0e 1597{
cc4c3b52 1598 struct vnode *saved_dvp;
fad57d0e 1599 struct vnode *pvp;
33387738 1600 char *fakename;
fad57d0e
MD
1601 int error;
1602
28623bf9
MD
1603 nch->ncp = NULL;
1604 nch->mount = dvp->v_mount;
cc4c3b52 1605 saved_dvp = NULL;
33387738 1606 fakename = NULL;
a0d57516 1607
fad57d0e 1608 /*
269a08e4
MD
1609 * Handle the makeit == 0 degenerate case
1610 */
1611 if (makeit == 0) {
287a8577 1612 spin_lock(&dvp->v_spinlock);
269a08e4
MD
1613 nch->ncp = TAILQ_FIRST(&dvp->v_namecache);
1614 if (nch->ncp)
1615 cache_hold(nch);
287a8577 1616 spin_unlock(&dvp->v_spinlock);
269a08e4
MD
1617 }
1618
1619 /*
f63911bf 1620 * Loop until resolution, inside code will break out on error.
fad57d0e 1621 */
f63911bf
MD
1622 while (makeit) {
1623 /*
1624 * Break out if we successfully acquire a working ncp.
1625 */
287a8577 1626 spin_lock(&dvp->v_spinlock);
28623bf9 1627 nch->ncp = TAILQ_FIRST(&dvp->v_namecache);
f63911bf
MD
1628 if (nch->ncp) {
1629 cache_hold(nch);
287a8577 1630 spin_unlock(&dvp->v_spinlock);
f63911bf
MD
1631 break;
1632 }
287a8577 1633 spin_unlock(&dvp->v_spinlock);
fad57d0e 1634
fad57d0e
MD
1635 /*
1636 * If dvp is the root of its filesystem it should already
1637 * have a namecache pointer associated with it as a side
1638 * effect of the mount, but it may have been disassociated.
1639 */
1640 if (dvp->v_flag & VROOT) {
28623bf9
MD
1641 nch->ncp = _cache_get(nch->mount->mnt_ncmountpt.ncp);
1642 error = cache_resolve_mp(nch->mount);
1643 _cache_put(nch->ncp);
fad57d0e 1644 if (ncvp_debug) {
6ea70f76 1645 kprintf("cache_fromdvp: resolve root of mount %p error %d",
fad57d0e
MD
1646 dvp->v_mount, error);
1647 }
1648 if (error) {
1649 if (ncvp_debug)
6ea70f76 1650 kprintf(" failed\n");
28623bf9 1651 nch->ncp = NULL;
fad57d0e
MD
1652 break;
1653 }
1654 if (ncvp_debug)
6ea70f76 1655 kprintf(" succeeded\n");
fad57d0e
MD
1656 continue;
1657 }
1658
1659 /*
a0d57516
MD
1660 * If we are recursed too deeply resort to an O(n^2)
1661 * algorithm to resolve the namecache topology. The
cc4c3b52 1662 * resolved pvp is left referenced in saved_dvp to
a0d57516
MD
1663 * prevent the tree from being destroyed while we loop.
1664 */
1665 if (makeit > 20) {
cc4c3b52 1666 error = cache_fromdvp_try(dvp, cred, &saved_dvp);
a0d57516 1667 if (error) {
6ea70f76 1668 kprintf("lookupdotdot(longpath) failed %d "
a0d57516 1669 "dvp %p\n", error, dvp);
1142bff7 1670 nch->ncp = NULL;
a0d57516
MD
1671 break;
1672 }
1673 continue;
1674 }
1675
1676 /*
fad57d0e
MD
1677 * Get the parent directory and resolve its ncp.
1678 */
33387738
MD
1679 if (fakename) {
1680 kfree(fakename, M_TEMP);
1681 fakename = NULL;
1682 }
1683 error = vop_nlookupdotdot(*dvp->v_ops, dvp, &pvp, cred,
1684 &fakename);
fad57d0e 1685 if (error) {
6ea70f76 1686 kprintf("lookupdotdot failed %d dvp %p\n", error, dvp);
fad57d0e
MD
1687 break;
1688 }
a11aaa81 1689 vn_unlock(pvp);
fad57d0e
MD
1690
1691 /*
1142bff7
MD
1692 * Reuse makeit as a recursion depth counter. On success
1693 * nch will be fully referenced.
fad57d0e 1694 */
28623bf9 1695 cache_fromdvp(pvp, cred, makeit + 1, nch);
fad57d0e 1696 vrele(pvp);
28623bf9 1697 if (nch->ncp == NULL)
fad57d0e
MD
1698 break;
1699
1700 /*
1701 * Do an inefficient scan of pvp (embodied by ncp) to look
1702 * for dvp. This will create a namecache record for dvp on
1703 * success. We loop up to recheck on success.
1704 *
1705 * ncp and dvp are both held but not locked.
1706 */
33387738 1707 error = cache_inefficient_scan(nch, cred, dvp, fakename);
fad57d0e 1708 if (error) {
6ea70f76 1709 kprintf("cache_fromdvp: scan %p (%s) failed on dvp=%p\n",
28623bf9 1710 pvp, nch->ncp->nc_name, dvp);
1142bff7
MD
1711 cache_drop(nch);
1712 /* nch was NULLed out, reload mount */
1713 nch->mount = dvp->v_mount;
fad57d0e
MD
1714 break;
1715 }
1716 if (ncvp_debug) {
6ea70f76 1717 kprintf("cache_fromdvp: scan %p (%s) succeeded\n",
28623bf9 1718 pvp, nch->ncp->nc_name);
fad57d0e 1719 }
1142bff7
MD
1720 cache_drop(nch);
1721 /* nch was NULLed out, reload mount */
1722 nch->mount = dvp->v_mount;
fad57d0e 1723 }
28623bf9
MD
1724
1725 /*
f63911bf 1726 * If nch->ncp is non-NULL it will have been held already.
28623bf9 1727 */
f63911bf
MD
1728 if (fakename)
1729 kfree(fakename, M_TEMP);
cc4c3b52
MD
1730 if (saved_dvp)
1731 vrele(saved_dvp);
28623bf9
MD
1732 if (nch->ncp)
1733 return (0);
1734 return (EINVAL);
fad57d0e
MD
1735}
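/*
 * Example usage (a sketch, not taken from the original source; it
 * assumes the caller already holds a referenced directory vnode and
 * a cred, in the style of an NFS file-handle recovery path):
 *
 *	struct nchandle nch;
 *
 *	if (cache_fromdvp(dvp, cred, 1, &nch) == 0) {
 *		...use the held (but not locked) nch.ncp...
 *		cache_drop(&nch);
 *	}
 */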
1736
1737/*
a0d57516
MD
1738 * Go up the chain of parent directories until we find something
1739 * we can resolve into the namecache. This is very inefficient.
1740 */
1741static
1742int
1743cache_fromdvp_try(struct vnode *dvp, struct ucred *cred,
cc4c3b52 1744 struct vnode **saved_dvp)
a0d57516 1745{
28623bf9 1746 struct nchandle nch;
a0d57516
MD
1747 struct vnode *pvp;
1748 int error;
1749 static time_t last_fromdvp_report;
33387738 1750 char *fakename;
a0d57516
MD
1751
1752 /*
1753 * Loop getting the parent directory vnode until we get something we
1754 * can resolve in the namecache.
1755 */
1756 vref(dvp);
28623bf9 1757 nch.mount = dvp->v_mount;
1142bff7 1758 nch.ncp = NULL;
33387738 1759 fakename = NULL;
28623bf9 1760
a0d57516 1761 for (;;) {
33387738
MD
1762 if (fakename) {
1763 kfree(fakename, M_TEMP);
1764 fakename = NULL;
1765 }
1766 error = vop_nlookupdotdot(*dvp->v_ops, dvp, &pvp, cred,
1767 &fakename);
a0d57516
MD
1768 if (error) {
1769 vrele(dvp);
33387738 1770 break;
a0d57516 1771 }
a11aaa81 1772 vn_unlock(pvp);
287a8577 1773 spin_lock(&pvp->v_spinlock);
28623bf9
MD
1774 if ((nch.ncp = TAILQ_FIRST(&pvp->v_namecache)) != NULL) {
1775 _cache_hold(nch.ncp);
287a8577 1776 spin_unlock(&pvp->v_spinlock);
a0d57516
MD
1777 vrele(pvp);
1778 break;
1779 }
287a8577 1780 spin_unlock(&pvp->v_spinlock);
a0d57516 1781 if (pvp->v_flag & VROOT) {
28623bf9
MD
1782 nch.ncp = _cache_get(pvp->v_mount->mnt_ncmountpt.ncp);
1783 error = cache_resolve_mp(nch.mount);
1784 _cache_unlock(nch.ncp);
a0d57516
MD
1785 vrele(pvp);
1786 if (error) {
28623bf9 1787 _cache_drop(nch.ncp);
1142bff7 1788 nch.ncp = NULL;
a0d57516 1789 vrele(dvp);
a0d57516
MD
1790 }
1791 break;
1792 }
1793 vrele(dvp);
1794 dvp = pvp;
1795 }
33387738
MD
1796 if (error == 0) {
1797 if (last_fromdvp_report != time_second) {
1798 last_fromdvp_report = time_second;
1799 kprintf("Warning: extremely inefficient path "
1800 "resolution on %s\n",
1801 nch.ncp->nc_name);
1802 }
1803 error = cache_inefficient_scan(&nch, cred, dvp, fakename);
cc4c3b52 1804
33387738
MD
1805 /*
1806 * Hopefully dvp now has a namecache record associated with
1807 * it. Leave it referenced to prevent the kernel from
1808 * recycling the vnode. Otherwise extremely long directory
1809 * paths could result in endless recycling.
1810 */
1811 if (*saved_dvp)
1812 vrele(*saved_dvp);
1813 *saved_dvp = dvp;
1142bff7 1814 _cache_drop(nch.ncp);
33387738
MD
1815 }
1816 if (fakename)
1817 kfree(fakename, M_TEMP);
a0d57516
MD
1818 return (error);
1819}
1820
a0d57516 1821/*
fad57d0e
MD
1822 * Do an inefficient scan of the directory represented by ncp looking for
1823 * the directory vnode dvp. ncp must be held but not locked on entry and
1824 * will be held on return. dvp must be refd but not locked on entry and
1825 * will remain refd on return.
1826 *
1827 * Why do this at all? Well, due to its stateless nature the NFS server
1828 * converts file handles directly to vnodes without necessarily going through
1829 * the namecache ops that would otherwise create the namecache topology
1830 * leading to the vnode. We could either (1) Change the namecache algorithms
1831 * to allow disconnected namecache records that are re-merged opportunistically,
1832 * or (2) Make the NFS server backtrack and scan to recover a connected
1833 * namecache topology in order to then be able to issue new API lookups.
1834 *
1835 * It turns out that (1) is a huge mess. It takes a nice clean set of
1836 * namecache algorithms and introduces a lot of complication in every subsystem
1837 * that calls into the namecache to deal with the re-merge case, especially
1838 * since we are using the namecache to placehold negative lookups and the
1839 * vnode might not be immediately assigned. (2) is certainly far less
1840 * efficient than (1), but since we are only talking about directories here
1841 * (which are likely to remain cached), the case does not actually run all
1842 * that often and has the supreme advantage of not polluting the namecache
1843 * algorithms.
33387738
MD
1844 *
1845 * If a fakename is supplied just construct a namecache entry using the
1846 * fake name.
fad57d0e
MD
1847 */
1848static int
28623bf9 1849cache_inefficient_scan(struct nchandle *nch, struct ucred *cred,
33387738 1850 struct vnode *dvp, char *fakename)
fad57d0e
MD
1851{
1852 struct nlcomponent nlc;
28623bf9 1853 struct nchandle rncp;
fad57d0e
MD
1854 struct dirent *den;
1855 struct vnode *pvp;
1856 struct vattr vat;
1857 struct iovec iov;
1858 struct uio uio;
fad57d0e
MD
1859 int blksize;
1860 int eofflag;
4d22f42a 1861 int bytes;
fad57d0e
MD
1862 char *rbuf;
1863 int error;
fad57d0e
MD
1864
1865 vat.va_blocksize = 0;
87de5057 1866 if ((error = VOP_GETATTR(dvp, &vat)) != 0)
fad57d0e 1867 return (error);
2247fe02
MD
1868 cache_lock(nch);
1869 error = cache_vref(nch, cred, &pvp);
1870 cache_unlock(nch);
1871 if (error)
fad57d0e 1872 return (error);
973c11b9
MD
1873 if (ncvp_debug) {
1874 kprintf("inefficient_scan: directory iosize %ld "
1875 "vattr fileid = %lld\n",
1876 vat.va_blocksize,
1877 (long long)vat.va_fileid);
1878 }
33387738
MD
1879
1880 /*
1881 * Use the supplied fakename if not NULL. Fake names are typically
1882 * not in the actual filesystem hierarchy. This is used by HAMMER
1883 * to glue @@timestamp recursions together.
1884 */
1885 if (fakename) {
1886 nlc.nlc_nameptr = fakename;
1887 nlc.nlc_namelen = strlen(fakename);
1888 rncp = cache_nlookup(nch, &nlc);
1889 goto done;
1890 }
1891
fad57d0e
MD
1892 if ((blksize = vat.va_blocksize) == 0)
1893 blksize = DEV_BSIZE;
efda3bd0 1894 rbuf = kmalloc(blksize, M_TEMP, M_WAITOK);
28623bf9 1895 rncp.ncp = NULL;
fad57d0e
MD
1896
1897 eofflag = 0;
1898 uio.uio_offset = 0;
fad57d0e 1899again:
fad57d0e
MD
1900 iov.iov_base = rbuf;
1901 iov.iov_len = blksize;
1902 uio.uio_iov = &iov;
1903 uio.uio_iovcnt = 1;
1904 uio.uio_resid = blksize;
1905 uio.uio_segflg = UIO_SYSSPACE;
1906 uio.uio_rw = UIO_READ;
1907 uio.uio_td = curthread;
1908
fad57d0e 1909 if (ncvp_debug >= 2)
6ea70f76 1910 kprintf("cache_inefficient_scan: readdir @ %08x\n", (int)uio.uio_offset);
4d22f42a 1911 error = VOP_READDIR(pvp, &uio, cred, &eofflag, NULL, NULL);
fad57d0e 1912 if (error == 0) {
4d22f42a
MD
1913 den = (struct dirent *)rbuf;
1914 bytes = blksize - uio.uio_resid;
1915
1916 while (bytes > 0) {
1917 if (ncvp_debug >= 2) {
6ea70f76 1918 kprintf("cache_inefficient_scan: %*.*s\n",
4d22f42a
MD
1919 den->d_namlen, den->d_namlen,
1920 den->d_name);
1921 }
fad57d0e 1922 if (den->d_type != DT_WHT &&
01f31ab3 1923 den->d_ino == vat.va_fileid) {
4d22f42a 1924 if (ncvp_debug) {
6ea70f76 1925 kprintf("cache_inefficient_scan: "
50626622 1926 "MATCHED inode %lld path %s/%*.*s\n",
973c11b9
MD
1927 (long long)vat.va_fileid,
1928 nch->ncp->nc_name,
4d22f42a
MD
1929 den->d_namlen, den->d_namlen,
1930 den->d_name);
1931 }
fad57d0e
MD
1932 nlc.nlc_nameptr = den->d_name;
1933 nlc.nlc_namelen = den->d_namlen;
28623bf9
MD
1934 rncp = cache_nlookup(nch, &nlc);
1935 KKASSERT(rncp.ncp != NULL);
fad57d0e
MD
1936 break;
1937 }
01f31ab3
JS
1938 bytes -= _DIRENT_DIRSIZ(den);
1939 den = _DIRENT_NEXT(den);
fad57d0e 1940 }
28623bf9 1941 if (rncp.ncp == NULL && eofflag == 0 && uio.uio_resid != blksize)
fad57d0e
MD
1942 goto again;
1943 }
33387738
MD
1944 kfree(rbuf, M_TEMP);
1945done:
885ecb13 1946 vrele(pvp);
28623bf9
MD
1947 if (rncp.ncp) {
1948 if (rncp.ncp->nc_flag & NCF_UNRESOLVED) {
4b5bbb78 1949 _cache_setvp(rncp.mount, rncp.ncp, dvp);
fad57d0e 1950 if (ncvp_debug >= 2) {
6ea70f76 1951 kprintf("cache_inefficient_scan: setvp %s/%s = %p\n",
28623bf9 1952 nch->ncp->nc_name, rncp.ncp->nc_name, dvp);
fad57d0e
MD
1953 }
1954 } else {
1955 if (ncvp_debug >= 2) {
6ea70f76 1956 kprintf("cache_inefficient_scan: setvp %s/%s already set %p/%p\n",
28623bf9
MD
1957 nch->ncp->nc_name, rncp.ncp->nc_name, dvp,
1958 rncp.ncp->nc_vp);
fad57d0e
MD
1959 }
1960 }
28623bf9
MD
1961 if (rncp.ncp->nc_vp == NULL)
1962 error = rncp.ncp->nc_error;
1142bff7
MD
1963 /*
1964 * Release rncp after a successful nlookup. rncp was fully
1965 * referenced.
1966 */
1967 cache_put(&rncp);
fad57d0e 1968 } else {
6ea70f76 1969 kprintf("cache_inefficient_scan: dvp %p NOT FOUND in %s\n",
28623bf9 1970 dvp, nch->ncp->nc_name);
fad57d0e
MD
1971 error = ENOENT;
1972 }
fad57d0e
MD
1973 return (error);
1974}
1975
1976/*
67773eb3
MD
1977 * Zap a namecache entry. The ncp is unconditionally set to an unresolved
1978 * state, which disassociates it from its vnode or ncneglist.
7ea21ed1 1979 *
67773eb3 1980 * Then, if there are no additional references to the ncp and no children,
f63911bf 1981 * the ncp is removed from the topology and destroyed.
7ea21ed1 1982 *
67773eb3
MD
1983 * References and/or children may exist if the ncp is in the middle of the
1984 * topology, preventing the ncp from being destroyed.
7ea21ed1 1985 *
67773eb3
MD
1986 * This function must be called with the ncp held and locked and will unlock
1987 * and drop it during zapping.
f63911bf 1988 *
65870584
MD
1989 * If nonblock is non-zero and the parent ncp cannot be locked we give up.
1990 * This case can occur in the cache_drop() path.
1991 *
f63911bf
MD
1992 * This function may return a held (but NOT locked) parent node which the
1993 * caller must drop. We do this so _cache_drop() can loop, to avoid
1994 * blowing out the kernel stack.
1995 *
1996 * WARNING! For MPSAFE operation this routine must acquire up to three
1997 * spin locks to be able to safely test nc_refs. Lock order is
1998 * very important.
1999 *
2000 * hash spinlock if on hash list
2001 * parent spinlock if child of parent
2002 * (the ncp is unresolved so there is no vnode association)
984263bc 2003 */
f63911bf 2004static struct namecache *
65870584 2005cache_zap(struct namecache *ncp, int nonblock)
984263bc 2006{
7ea21ed1 2007 struct namecache *par;
f63911bf 2008 struct vnode *dropvp;
f63911bf 2009 int refs;
7ea21ed1
MD
2010
2011 /*
2012 * Disassociate the vnode or negative cache ref and set NCF_UNRESOLVED.
2013 */
28623bf9 2014 _cache_setunresolved(ncp);
7ea21ed1
MD
2015
2016 /*
2017 * Try to scrap the entry and possibly tail-recurse on its parent.
2018 * We only scrap unref'd (other than our ref) unresolved entries;
2019 * we do not scrap 'live' entries.
f63911bf
MD
2020 *
2021 * Note that once the spinlocks are acquired if nc_refs == 1 no
2022 * other references are possible. If it isn't, however, we have
2023 * to decrement but also be sure to avoid a 1->0 transition.
7ea21ed1 2024 */
f63911bf
MD
2025 KKASSERT(ncp->nc_flag & NCF_UNRESOLVED);
2026 KKASSERT(ncp->nc_refs > 0);
7ea21ed1 2027
f63911bf 2028 /*
65870584
MD
2029 * Acquire locks. Note that the parent can't go away while we hold
2030 * a child locked.
f63911bf 2031 */
2247fe02 2032 if ((par = ncp->nc_parent) != NULL) {
65870584
MD
2033 if (nonblock) {
2034 for (;;) {
2035 if (_cache_lock_nonblock(par) == 0)
2036 break;
65870584
MD
2037 refs = ncp->nc_refs;
2038 ncp->nc_flag |= NCF_DEFEREDZAP;
2039 ++numdefered; /* MP race ok */
2040 if (atomic_cmpset_int(&ncp->nc_refs,
2041 refs, refs - 1)) {
2042 _cache_unlock(ncp);
2043 return(NULL);
2044 }
2045 cpu_pause();
2046 }
2047 _cache_hold(par);
2048 } else {
2049 _cache_hold(par);
2050 _cache_lock(par);
2051 }
287a8577 2052 spin_lock(&ncp->nc_head->spin);
f63911bf 2053 }
7ea21ed1 2054
f63911bf
MD
2055 /*
2056 * If someone other than us has a ref or we have children
2057 * we cannot zap the entry. The 1->0 transition and any
2058 * further list operation is protected by the spinlocks
2059 * we have acquired but other transitions are not.
2060 */
2061 for (;;) {
2062 refs = ncp->nc_refs;
2063 if (refs == 1 && TAILQ_EMPTY(&ncp->nc_list))
2064 break;
2065 if (atomic_cmpset_int(&ncp->nc_refs, refs, refs - 1)) {
2247fe02 2066 if (par) {
287a8577 2067 spin_unlock(&ncp->nc_head->spin);
2247fe02
MD
2068 _cache_put(par);
2069 }
f63911bf
MD
2070 _cache_unlock(ncp);
2071 return(NULL);
7ea21ed1 2072 }
2247fe02 2073 cpu_pause();
f63911bf 2074 }
67773eb3 2075
f63911bf
MD
2076 /*
2077 * We are the only ref and with the spinlocks held no further
2078 * refs can be acquired by others.
2079 *
2080 * Remove us from the hash list and parent list. We have to
2081 * drop a ref on the parent's vp if the parent's list becomes
2082 * empty.
2083 */
f63911bf 2084 dropvp = NULL;
2247fe02
MD
2085 if (par) {
2086 struct nchash_head *nchpp = ncp->nc_head;
67773eb3 2087
2247fe02
MD
2088 KKASSERT(nchpp != NULL);
2089 LIST_REMOVE(ncp, nc_hash);
2090 TAILQ_REMOVE(&par->nc_list, ncp, nc_entry);
f63911bf
MD
2091 if (par->nc_vp && TAILQ_EMPTY(&par->nc_list))
2092 dropvp = par->nc_vp;
2247fe02
MD
2093 ncp->nc_head = NULL;
2094 ncp->nc_parent = NULL;
287a8577 2095 spin_unlock(&nchpp->spin);
2247fe02
MD
2096 _cache_unlock(par);
2097 } else {
2098 KKASSERT(ncp->nc_head == NULL);
7ea21ed1 2099 }
f63911bf
MD
2100
2101 /*
2102 * ncp should not have picked up any refs. Physically
2103 * destroy the ncp.
2104 */
f63911bf 2105 KKASSERT(ncp->nc_refs == 1);
f63911bf
MD
2106 /* _cache_unlock(ncp) not required */
2107 ncp->nc_refs = -1; /* safety */
2108 if (ncp->nc_name)
2109 kfree(ncp->nc_name, M_VFSCACHE);
2110 kfree(ncp, M_VFSCACHE);
2111
2112 /*
2113 * Delayed drop (we had to release our spinlocks)
2114 *
2115 * The refed parent (if not NULL) must be dropped. The
2116 * caller is responsible for looping.
2117 */
2118 if (dropvp)
2119 vdrop(dropvp);
2120 return(par);
984263bc
MD
2121}
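/*
 * Caller-side sketch (illustrative; not the literal _cache_drop()
 * code): because cache_zap() hands back the held parent instead of
 * recursing, a caller can unwind long chains iteratively:
 *
 *	while (ncp) {
 *		...lock ncp on the 1->0 ref transition...
 *		ncp = cache_zap(ncp, 1);	(held parent or NULL)
 *	}
 */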
2122
65870584
MD
2123/*
2124 * Clean up dangling negative cache and deferred-drop entries in the
2125 * namecache.
2126 */
9e10d70b
MD
2127typedef enum { CHI_LOW, CHI_HIGH } cache_hs_t;
2128
2129static cache_hs_t neg_cache_hysteresis_state = CHI_LOW;
2130static cache_hs_t pos_cache_hysteresis_state = CHI_LOW;
62d0f1f0 2131
62d0f1f0 2132void
65870584 2133cache_hysteresis(void)
62d0f1f0 2134{
9e10d70b
MD
2135 int poslimit;
2136
62d0f1f0
MD
2137 /*
2138 * Don't cache too many negative hits. We use hysteresis to reduce
2139 * the impact on the critical path.
2140 */
9e10d70b 2141 switch(neg_cache_hysteresis_state) {
62d0f1f0
MD
2142 case CHI_LOW:
2143 if (numneg > MINNEG && numneg * ncnegfactor > numcache) {
65870584 2144 _cache_cleanneg(10);
9e10d70b 2145 neg_cache_hysteresis_state = CHI_HIGH;
62d0f1f0
MD
2146 }
2147 break;
2148 case CHI_HIGH:
2149 if (numneg > MINNEG * 9 / 10 &&
2150 numneg * ncnegfactor * 9 / 10 > numcache
2151 ) {
65870584 2152 _cache_cleanneg(10);
62d0f1f0 2153 } else {
9e10d70b
MD
2154 neg_cache_hysteresis_state = CHI_LOW;
2155 }
2156 break;
2157 }
2158
2159 /*
2160 * Don't cache too many positive hits. We use hysteresis to reduce
2161 * the impact on the critical path.
2162 *
2163 * Excessive positive hits can accumulate due to large numbers of
2164 * hardlinks (the vnode cache will not prevent hl ncps from growing
2165 * into infinity).
2166 */
2167 if ((poslimit = ncposlimit) == 0)
2168 poslimit = desiredvnodes * 2;
2169
2170 switch(pos_cache_hysteresis_state) {
2171 case CHI_LOW:
2172 if (numcache > poslimit && numcache > MINPOS) {
2173 _cache_cleanpos(10);
2174 pos_cache_hysteresis_state = CHI_HIGH;
2175 }
2176 break;
2177 case CHI_HIGH:
2178 if (numcache > poslimit * 5 / 6 && numcache > MINPOS) {
2179 _cache_cleanpos(10);
2180 } else {
2181 pos_cache_hysteresis_state = CHI_LOW;
62d0f1f0
MD
2182 }
2183 break;
2184 }
65870584
MD
2185
2186 /*
2187 * Clean out dangling deferred-zap ncps which could not
2188 * be cleanly dropped if too many build up. Note
2189 * that numdefered is not an exact number as such ncps
2190 * can be reused and the counter is not handled in a MP
2191 * safe manner by design.
2192 */
2193 if (numdefered * ncnegfactor > numcache) {
2194 _cache_cleandefered();
2195 }
62d0f1f0
MD
2196}
2197
984263bc 2198/*
14c92d03
MD
2199 * NEW NAMECACHE LOOKUP API
2200 *
2247fe02
MD
2201 * Lookup an entry in the namecache. The passed par_nch must be referenced
2202 * and unlocked. A referenced and locked nchandle with a non-NULL nch.ncp
2203 * is ALWAYS returned, even if the supplied component is illegal.
2204 *
fad57d0e 2205 * The resulting namecache entry should be returned to the system with
2247fe02 2206 * cache_put() or cache_unlock() + cache_drop().
14c92d03
MD
2207 *
2208 * namecache locks are recursive but care must be taken to avoid lock order
2247fe02
MD
2209 * reversals (hence why the passed par_nch must be unlocked). Locking
2210 * rules are to order for parent traversals, not for child traversals.
14c92d03
MD
2211 *
2212 * Nobody else will be able to manipulate the associated namespace (e.g.
2213 * create, delete, rename, rename-target) until the caller unlocks the
2214 * entry.
2215 *
2216 * The returned entry will be in one of three states: positive hit (non-null
2217 * vnode), negative hit (null vnode), or unresolved (NCF_UNRESOLVED is set).
2218 * Unresolved entries must be resolved through the filesystem to associate the
2219 * vnode and/or determine whether a positive or negative hit has occurred.
2220 *
2221 * It is not necessary to lock a directory in order to lock namespace under
2222 * that directory. In fact, it is explicitly not allowed to do that. A
2223 * directory is typically only locked when being created, renamed, or
2224 * destroyed.
2225 *
2226 * The directory (par) may be unresolved, in which case any returned child
2227 * will likely also be marked unresolved. Likely but not guaranteed. Since
fad57d0e
MD
2228 * the filesystem lookup requires a resolved directory vnode the caller is
2229 * responsible for resolving the namecache chain top-down. This API
14c92d03
MD
2230 * specifically allows whole chains to be created in an unresolved state.
2231 */
28623bf9
MD
2232struct nchandle
2233cache_nlookup(struct nchandle *par_nch, struct nlcomponent *nlc)
14c92d03 2234{
28623bf9 2235 struct nchandle nch;
690a3127
MD
2236 struct namecache *ncp;
2237 struct namecache *new_ncp;
f63911bf 2238 struct nchash_head *nchpp;
4b5bbb78 2239 struct mount *mp;
690a3127
MD
2240 u_int32_t hash;
2241 globaldata_t gd;
2247fe02 2242 int par_locked;
690a3127
MD
2243
2244 numcalls++;
2245 gd = mycpu;
4b5bbb78 2246 mp = par_nch->mount;
2247fe02
MD
2247 par_locked = 0;
2248
2249 /*
2250 * This is a good time to call cache_hysteresis(); no ncp's are
2251 * locked by the caller or by us.
2252 */
65870584 2253 cache_hysteresis();
690a3127
MD
2254
2255 /*
690a3127
MD
2256 * Try to locate an existing entry
2257 */
2258 hash = fnv_32_buf(nlc->nlc_nameptr, nlc->nlc_namelen, FNV1_32_INIT);
28623bf9 2259 hash = fnv_32_buf(&par_nch->ncp, sizeof(par_nch->ncp), hash);
690a3127 2260 new_ncp = NULL;
f63911bf 2261 nchpp = NCHHASH(hash);
690a3127 2262restart:
287a8577 2263 spin_lock(&nchpp->spin);
f63911bf 2264 LIST_FOREACH(ncp, &nchpp->list, nc_hash) {
690a3127
MD
2265 numchecks++;
2266
2267 /*
690a3127 2268 * Break out if we find a matching entry. Note that
e09206ba
MD
2269 * UNRESOLVED entries may match, but DESTROYED entries
2270 * do not.
690a3127 2271 */
28623bf9 2272 if (ncp->nc_parent == par_nch->ncp &&
690a3127 2273 ncp->nc_nlen == nlc->nlc_namelen &&
e09206ba
MD
2274 bcmp(ncp->nc_name, nlc->nlc_nameptr, ncp->nc_nlen) == 0 &&
2275 (ncp->nc_flag & NCF_DESTROYED) == 0
690a3127 2276 ) {
f63911bf 2277 _cache_hold(ncp);
287a8577 2278 spin_unlock(&nchpp->spin);
2247fe02
MD
2279 if (par_locked) {
2280 _cache_unlock(par_nch->ncp);
2281 par_locked = 0;
2282 }
2283 if (_cache_lock_special(ncp) == 0) {
4b5bbb78 2284 _cache_auto_unresolve(mp, ncp);
67773eb3 2285 if (new_ncp)
28623bf9 2286 _cache_free(new_ncp);
67773eb3
MD
2287 goto found;
2288 }
28623bf9
MD
2289 _cache_get(ncp);
2290 _cache_put(ncp);
f63911bf 2291 _cache_drop(ncp);
67773eb3 2292 goto restart;
690a3127
MD
2293 }
2294 }
2295
2296 /*
2297 * We failed to locate an entry, create a new entry and add it to
2247fe02
MD
2298 * the cache. The parent ncp must also be locked so we
2299 * can link into it.
2300 *
2301 * We have to relookup after possibly blocking in kmalloc or
2302 * when locking par_nch.
2303 *
2304 * NOTE: nlc_namelen can be 0 and nlc_nameptr NULL as a special
2305 * mount case, in which case nc_name will be NULL.
690a3127
MD
2306 */
2307 if (new_ncp == NULL) {
287a8577 2308 spin_unlock(&nchpp->spin);
524c845c 2309 new_ncp = cache_alloc(nlc->nlc_namelen);
2247fe02
MD
2310 if (nlc->nlc_namelen) {
2311 bcopy(nlc->nlc_nameptr, new_ncp->nc_name,
2312 nlc->nlc_namelen);
2313 new_ncp->nc_name[nlc->nlc_namelen] = 0;
2314 }
2315 goto restart;
2316 }
2317 if (par_locked == 0) {
287a8577 2318 spin_unlock(&nchpp->spin);
2247fe02
MD
2319 _cache_lock(par_nch->ncp);
2320 par_locked = 1;
690a3127
MD
2321 goto restart;
2322 }
690a3127
MD
2323
2324 /*
2247fe02 2325 * WARNING! We still hold the spinlock. We have to set the hash
668b43c5 2326 * table entry atomically.
690a3127 2327 */
2247fe02
MD
2328 ncp = new_ncp;
2329 _cache_link_parent(ncp, par_nch->ncp, nchpp);
287a8577 2330 spin_unlock(&nchpp->spin);
2247fe02
MD
2331 _cache_unlock(par_nch->ncp);
2332 /* par_locked = 0 - not used */
690a3127 2333found:
fad57d0e
MD
2334 /*
2335 * stats and namecache size management
2336 */
2337 if (ncp->nc_flag & NCF_UNRESOLVED)
2338 ++gd->gd_nchstats->ncs_miss;
2339 else if (ncp->nc_vp)
2340 ++gd->gd_nchstats->ncs_goodhits;
2341 else
2342 ++gd->gd_nchstats->ncs_neghits;
4b5bbb78 2343 nch.mount = mp;
28623bf9 2344 nch.ncp = ncp;
61f96b6f 2345 atomic_add_int(&nch.mount->mnt_refs, 1);
28623bf9 2346 return(nch);
690a3127
MD
2347}
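/*
 * Typical lookup/resolve pattern (a sketch under assumptions: "name"
 * and the referenced, unlocked par_nch come from the caller; error
 * handling is abbreviated):
 *
 *	struct nlcomponent nlc;
 *	struct nchandle nch;
 *	int error;
 *
 *	nlc.nlc_nameptr = name;
 *	nlc.nlc_namelen = strlen(name);
 *	nch = cache_nlookup(&par_nch, &nlc);	(locked + referenced)
 *	error = cache_resolve(&nch, cred);	(ENOENT on negative hit)
 *	...use nch.ncp and nch.ncp->nc_vp...
 *	cache_put(&nch);			(unlock + drop)
 */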
2348
2349/*
668b43c5
MD
2350 * This is a non-blocking version of cache_nlookup() used by
2351 * nfs_readdirplusrpc_uio(). It can fail for any reason and
2352 * will return nch.ncp == NULL in that case.
2353 */
2354struct nchandle
2355cache_nlookup_nonblock(struct nchandle *par_nch, struct nlcomponent *nlc)
2356{
2357 struct nchandle nch;
2358 struct namecache *ncp;
2359 struct namecache *new_ncp;
2360 struct nchash_head *nchpp;
2361 struct mount *mp;
2362 u_int32_t hash;
2363 globaldata_t gd;
2364 int par_locked;
2365
2366 numcalls++;
2367 gd = mycpu;
2368 mp = par_nch->mount;
2369 par_locked = 0;
2370
2371 /*
2372 * Try to locate an existing entry
2373 */
2374 hash = fnv_32_buf(nlc->nlc_nameptr, nlc->nlc_namelen, FNV1_32_INIT);
2375 hash = fnv_32_buf(&par_nch->ncp, sizeof(par_nch->ncp), hash);
2376 new_ncp = NULL;
2377 nchpp = NCHHASH(hash);
2378restart:
287a8577 2379 spin_lock(&nchpp->spin);
668b43c5
MD
2380 LIST_FOREACH(ncp, &nchpp->list, nc_hash) {
2381 numchecks++;
2382
2383 /*
2384 * Break out if we find a matching entry. Note that
2385 * UNRESOLVED entries may match, but DESTROYED entries
2386 * do not.
2387 */
2388 if (ncp->nc_parent == par_nch->ncp &&
2389 ncp->nc_nlen == nlc->nlc_namelen &&
2390 bcmp(ncp->nc_name, nlc->nlc_nameptr, ncp->nc_nlen) == 0 &&
2391 (ncp->nc_flag & NCF_DESTROYED) == 0
2392 ) {
2393 _cache_hold(ncp);
287a8577 2394 spin_unlock(&nchpp->spin);
668b43c5
MD
2395 if (par_locked) {
2396 _cache_unlock(par_nch->ncp);
2397 par_locked = 0;
2398 }
2399 if (_cache_lock_special(ncp) == 0) {
2400 _cache_auto_unresolve(mp, ncp);
2401 if (new_ncp) {
2402 _cache_free(new_ncp);
2403 new_ncp = NULL;
2404 }
2405 goto found;
2406 }
2407 _cache_drop(ncp);
2408 goto failed;
2409 }
2410 }
2411
2412 /*
2413 * We failed to locate an entry, create a new entry and add it to
2414 * the cache. The parent ncp must also be locked so we
2415 * can link into it.
2416 *
2417 * We have to relookup after possibly blocking in kmalloc or
2418 * when locking par_nch.
2419 *
2420 * NOTE: nlc_namelen can be 0 and nlc_nameptr NULL as a special
2421 * mount case, in which case nc_name will be NULL.
2422 */
2423 if (new_ncp == NULL) {
287a8577 2424 spin_unlock(&nchpp->spin);
668b43c5
MD
2425 new_ncp = cache_alloc(nlc->nlc_namelen);
2426 if (nlc->nlc_namelen) {
2427 bcopy(nlc->nlc_nameptr, new_ncp->nc_name,
2428 nlc->nlc_namelen);
2429 new_ncp->nc_name[nlc->nlc_namelen] = 0;
2430 }
2431 goto restart;
2432 }
2433 if (par_locked == 0) {
287a8577 2434 spin_unlock(&nchpp->spin);
668b43c5
MD
2435 if (_cache_lock_nonblock(par_nch->ncp) == 0) {
2436 par_locked = 1;
2437 goto restart;
2438 }
2439 goto failed;
2440 }
2441
2442 /*
2443 * WARNING! We still hold the spinlock. We have to set the hash
2444 * table entry atomically.
2445 */
2446 ncp = new_ncp;
2447 _cache_link_parent(ncp, par_nch->ncp, nchpp);
287a8577 2448 spin_unlock(&nchpp->spin);
668b43c5
MD
2449 _cache_unlock(par_nch->ncp);
2450 /* par_locked = 0 - not used */
2451found:
2452 /*
2453 * stats and namecache size management
2454 */
2455 if (ncp->nc_flag & NCF_UNRESOLVED)
2456 ++gd->gd_nchstats->ncs_miss;
2457 else if (ncp->nc_vp)
2458 ++gd->gd_nchstats->ncs_goodhits;
2459 else
2460 ++gd->gd_nchstats->ncs_neghits;
2461 nch.mount = mp;
2462 nch.ncp = ncp;
2463 atomic_add_int(&nch.mount->mnt_refs, 1);
2464 return(nch);
2465failed:
2466 if (new_ncp) {
2467 _cache_free(new_ncp);
2468 new_ncp = NULL;
2469 }
2470 nch.mount = NULL;
2471 nch.ncp = NULL;
2472 return(nch);
2473}
2474
2475/*
28623bf9
MD
2476 * The namecache entry is marked as being used as a mount point.
2477 * Locate the mount if it is visible to the caller.
1d505369 2478 */
28623bf9
MD
2479struct findmount_info {
2480 struct mount *result;
2481 struct mount *nch_mount;
2482 struct namecache *nch_ncp;
2483};
2484
2485static
2486int
2487cache_findmount_callback(struct mount *mp, void *data)
1d505369 2488{
28623bf9 2489 struct findmount_info *info = data;
1d505369 2490
28623bf9
MD
2491 /*
2492 * Check the mount's mounted-on point against the passed nch.
2493 */
2494 if (mp->mnt_ncmounton.mount == info->nch_mount &&
2495 mp->mnt_ncmounton.ncp == info->nch_ncp
2496 ) {
2497 info->result = mp;
2498 return(-1);
1d505369 2499 }
28623bf9 2500 return(0);
1d505369
MD
2501}
2502
28623bf9
MD
2503struct mount *
2504cache_findmount(struct nchandle *nch)
9b1b3591 2505{
28623bf9
MD
2506 struct findmount_info info;
2507
2508 info.result = NULL;
2509 info.nch_mount = nch->mount;
2510 info.nch_ncp = nch->ncp;
2511 mountlist_scan(cache_findmount_callback, &info,
2512 MNTSCAN_FORWARD|MNTSCAN_NOBUSY);
2513 return(info.result);
9b1b3591
MD
2514}
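/*
 * Sketch of intended use (illustrative only): when a namespace walk
 * lands on an ncp that has something mounted on it, the visible mount
 * can be located and the walk continued at that mount's own root:
 *
 *	struct mount *mp;
 *
 *	if ((mp = cache_findmount(&nch)) != NULL)
 *		...continue at mp->mnt_ncmountpt...
 */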
2515
2516/*
21739618 2517 * Resolve an unresolved namecache entry, generally by looking it up.
67773eb3 2518 * The passed ncp must be locked and refd.
21739618
MD
2519 *
2520 * Theoretically since a vnode cannot be recycled while held, and since
2521 * the nc_parent chain holds its vnode as long as children exist, the
2522 * direct parent of the cache entry we are trying to resolve should
2523 * have a valid vnode. If not then generate an error that we can
2524 * determine is related to a resolver bug.
fad57d0e 2525 *
9b1b3591
MD
2526 * However, if a vnode was in the middle of a recyclement when the NCP
2527 * got locked, ncp->nc_vp might point to a vnode that is about to become
2528 * invalid. cache_resolve() handles this case by unresolving the entry
2529 * and then re-resolving it.
2530 *
fad57d0e
MD
2531 * Note that successful resolution does not necessarily return an error
2532 * code of 0. If the ncp resolves to a negative cache hit then ENOENT
2533 * will be returned.
2247fe02
MD
2534 *
2535 * MPSAFE
690a3127
MD
2536 */
2537int
28623bf9 2538cache_resolve(struct nchandle *nch, struct ucred *cred)
690a3127 2539{
2247fe02 2540 struct namecache *par_tmp;
21739618 2541 struct namecache *par;
28623bf9
MD
2542 struct namecache *ncp;
2543 struct nchandle nctmp;
2544 struct mount *mp;
dff430ab 2545 struct vnode *dvp;
67773eb3 2546 int error;
8e005a45 2547
28623bf9
MD
2548 ncp = nch->ncp;
2549 mp = nch->mount;
67773eb3 2550restart:
8e005a45 2551 /*
9b1b3591
MD
2552 * If the ncp is already resolved we have nothing to do. However,
2553 * we do want to guarantee that a usable vnode is returned when
2554 * a vnode is present, so make sure it hasn't been reclaimed.
8e005a45 2555 */
9b1b3591
MD
2556 if ((ncp->nc_flag & NCF_UNRESOLVED) == 0) {
2557 if (ncp->nc_vp && (ncp->nc_vp->v_flag & VRECLAIMED))
28623bf9 2558 _cache_setunresolved(ncp);
9b1b3591
MD
2559 if ((ncp->nc_flag & NCF_UNRESOLVED) == 0)
2560 return (ncp->nc_error);
2561 }
21739618 2562
646a1cda
MD
2563 /*
2564 * Mount points need special handling because the parent does not
2565 * belong to the same filesystem as the ncp.
2566 */
28623bf9
MD
2567 if (ncp == mp->mnt_ncmountpt.ncp)
2568 return (cache_resolve_mp(mp));
646a1cda
MD
2569
2570 /*
2571 * We expect an unbroken chain of ncps to at least the mount point,
2572 * and even all the way to root (but this code doesn't have to go
2573 * past the mount point).
2574 */
2575 if (ncp->nc_parent == NULL) {
6ea70f76 2576 kprintf("EXDEV case 1 %p %*.*s\n", ncp,
646a1cda 2577 ncp->nc_nlen, ncp->nc_nlen, ncp->nc_name);
21739618 2578 ncp->nc_error = EXDEV;
646a1cda
MD
2579 return(ncp->nc_error);
2580 }
2581
2582 /*
2583 * The vp's of the parent directories in the chain are held via vhold()
2584 * due to the existence of the child, and should not disappear.
2585 * However, there are cases where they can disappear:
2586 *
2587 * - due to filesystem I/O errors.
2588 * - due to NFS being stupid about tracking the namespace and
2589 * destroys the namespace for entire directories quite often.
2590 * - due to forced unmounts.
e09206ba 2591 * - due to an rmdir (parent will be marked DESTROYED)
646a1cda
MD
2592 *
2593 * When this occurs we have to track the chain backwards and resolve
2594 * it, looping until the resolver catches up to the current node. We
2595 * could recurse here but we might run ourselves out of kernel stack
2596 * so we do it in a more painful manner. This situation really should
2597 * not occur all that often, or if it does not have to go back too
2598 * many nodes to resolve the ncp.
2599 */
5312fa43 2600 while ((dvp = cache_dvpref(ncp)) == NULL) {
e09206ba
MD
2601 /*
2602 * This case can occur if a process is CD'd into a
2603 * directory which is then rmdir'd. If the parent is marked
2604 * destroyed there is no point trying to resolve it.
2605 */
2606 if (ncp->nc_parent->nc_flag & NCF_DESTROYED)
2607 return(ENOENT);
646a1cda 2608 par = ncp->nc_parent;
2247fe02
MD
2609 _cache_hold(par);
2610 _cache_lock(par);
2611 while ((par_tmp = par->nc_parent) != NULL &&
2612 par_tmp->nc_vp == NULL) {
2613 _cache_hold(par_tmp);
2614 _cache_lock(par_tmp);
2615 _cache_put(par);
2616 par = par_tmp;
2617 }
646a1cda 2618 if (par->nc_parent == NULL) {
6ea70f76 2619 kprintf("EXDEV case 2 %*.*s\n",
646a1cda 2620 par->nc_nlen, par->nc_nlen, par->nc_name);
2247fe02 2621 _cache_put(par);
646a1cda
MD
2622 return (EXDEV);
2623 }
6ea70f76 2624 kprintf("[diagnostic] cache_resolve: had to recurse on %*.*s\n",
646a1cda
MD
2625 par->nc_nlen, par->nc_nlen, par->nc_name);
2626 /*
67773eb3
MD
2627 * The parent is not set in stone, ref and lock it to prevent
2628 * it from disappearing. Also note that due to renames it
2629 * is possible for our ncp to move and for par to no longer
2630 * be one of its parents. We resolve it anyway, the loop
2631 * will handle any moves.
646a1cda 2632 */
2247fe02
MD
2633 _cache_get(par); /* additional hold/lock */
2634 _cache_put(par); /* from earlier hold/lock */
28623bf9
MD
2635 if (par == nch->mount->mnt_ncmountpt.ncp) {
2636 cache_resolve_mp(nch->mount);
c0c70b27 2637 } else if ((dvp = cache_dvpref(par)) == NULL) {
6ea70f76 2638 kprintf("[diagnostic] cache_resolve: raced on %*.*s\n", par->nc_nlen, par->nc_nlen, par->nc_name);
28623bf9 2639 _cache_put(par);
8e005a45 2640 continue;
c0c70b27
MD
2641 } else {
2642 if (par->nc_flag & NCF_UNRESOLVED) {
2643 nctmp.mount = mp;
2644 nctmp.ncp = par;
2645 par->nc_error = VOP_NRESOLVE(&nctmp, dvp, cred);
2646 }
5312fa43 2647 vrele(dvp);
646a1cda 2648 }
67773eb3
MD
2649 if ((error = par->nc_error) != 0) {
2650 if (par->nc_error != EAGAIN) {
6ea70f76 2651 kprintf("EXDEV case 3 %*.*s error %d\n",
67773eb3
MD
2652 par->nc_nlen, par->nc_nlen, par->nc_name,
2653 par->nc_error);
28623bf9 2654 _cache_put(par);
67773eb3
MD
2655 return(error);
2656 }
6ea70f76 2657 kprintf("[diagnostic] cache_resolve: EAGAIN par %p %*.*s\n",
67773eb3 2658 par, par->nc_nlen, par->nc_nlen, par->nc_name);
646a1cda 2659 }
28623bf9 2660 _cache_put(par);
67773eb3 2661 /* loop */
646a1cda 2662 }
8e005a45
MD
2663
2664 /*
fad57d0e 2665 * Call VOP_NRESOLVE() to get the vp, then scan for any disconnected
8e005a45
MD
2666 * ncp's and reattach them. If this occurs the original ncp is marked
2667 * EAGAIN to force a relookup.
fad57d0e
MD
2668 *
2669 * NOTE: in order to call VOP_NRESOLVE(), the parent of the passed
2670 * ncp must already be resolved.
8e005a45 2671 */
5312fa43 2672 if (dvp) {
c0c70b27
MD
2673 nctmp.mount = mp;
2674 nctmp.ncp = ncp;
2675 ncp->nc_error = VOP_NRESOLVE(&nctmp, dvp, cred);
5312fa43 2676 vrele(dvp);
c0c70b27
MD
2677 } else {
2678 ncp->nc_error = EPERM;
2679 }
67773eb3 2680 if (ncp->nc_error == EAGAIN) {
6ea70f76 2681 kprintf("[diagnostic] cache_resolve: EAGAIN ncp %p %*.*s\n",
67773eb3
MD
2682 ncp, ncp->nc_nlen, ncp->nc_nlen, ncp->nc_name);
2683 goto restart;
2684 }
646a1cda
MD
2685 return(ncp->nc_error);
2686}
2687
2688/*
2689 * Resolve the ncp associated with a mount point. Such ncp's almost always
2690 * remain resolved and this routine is rarely called. NFS MPs tend to force
2691 * re-resolution more often due to their mack-truck-smash-the-namecache
2692 * method of tracking namespace changes.
2693 *
6215aa92
MD
2694 * The semantics of this call are that the passed ncp must be locked on
2695 * entry and will be locked on return. However, if we actually have to
2696 * resolve the mount point we temporarily unlock the entry in order to
2697 * avoid race-to-root deadlocks due to e.g. dead NFS mounts. Because of
2698 * the unlock we have to recheck the flags after we relock.
646a1cda
MD
2699 */
2700static int
28623bf9 2701cache_resolve_mp(struct mount *mp)
646a1cda 2702{
28623bf9 2703 struct namecache *ncp = mp->mnt_ncmountpt.ncp;
646a1cda 2704 struct vnode *vp;
6215aa92 2705 int error;
646a1cda
MD
2706
2707 KKASSERT(mp != NULL);
9b1b3591
MD
2708
2709 /*
2710 * If the ncp is already resolved we have nothing to do. However,
2711 * we do want to guarantee that a usable vnode is returned when
2712 * a vnode is present, so make sure it hasn't been reclaimed.
2713 */
2714 if ((ncp->nc_flag & NCF_UNRESOLVED) == 0) {
2715 if (ncp->nc_vp && (ncp->nc_vp->v_flag & VRECLAIMED))
28623bf9 2716 _cache_setunresolved(ncp);
9b1b3591
MD
2717 }
2718
646a1cda 2719 if (ncp->nc_flag & NCF_UNRESOLVED) {
28623bf9 2720 _cache_unlock(ncp);
f9642f56 2721 while (vfs_busy(mp, 0))
646a1cda 2722 ;
6215aa92 2723 error = VFS_ROOT(mp, &vp);
28623bf9 2724 _cache_lock(ncp);
6215aa92
MD
2725
2726 /*
2727 * recheck the ncp state after relocking.
2728 */
2729 if (ncp->nc_flag & NCF_UNRESOLVED) {
2730 ncp->nc_error = error;
2731 if (error == 0) {
4b5bbb78 2732 _cache_setvp(mp, ncp, vp);
6215aa92
MD
2733 vput(vp);
2734 } else {
341a6e45
MD
2735 kprintf("[diagnostic] cache_resolve_mp: failed"
2736 " to resolve mount %p err=%d ncp=%p\n",
2737 mp, error, ncp);
4b5bbb78 2738 _cache_setvp(mp, ncp, NULL);
6215aa92
MD
2739 }
2740 } else if (error == 0) {
646a1cda 2741 vput(vp);
646a1cda 2742 }
f9642f56 2743 vfs_unbusy(mp);
21739618
MD
2744 }
2745 return(ncp->nc_error);
14c92d03
MD
2746}
2747
f63911bf 2748/*
65870584
MD
2749 * Clean out negative cache entries when too many have accumulated.
2750 *
f63911bf
MD
2751 * MPSAFE
2752 */
65870584
MD
2753static void
2754_cache_cleanneg(int count)
62d0f1f0
MD
2755{
2756 struct namecache *ncp;
7ea21ed1
MD
2757
2758 /*
62d0f1f0
MD
2759 * Attempt to clean out the specified number of negative cache
2760 * entries.
2761 */
2762 while (count) {
287a8577 2763 spin_lock(&ncspin);
7ea21ed1 2764 ncp = TAILQ_FIRST(&ncneglist);
eb82ae62 2765 if (ncp == NULL) {
287a8577 2766 spin_unlock(&ncspin);
eb82ae62
MD
2767 break;
2768 }
62d0f1f0
MD
2769 TAILQ_REMOVE(&ncneglist, ncp, nc_vnode);
2770 TAILQ_INSERT_TAIL(&ncneglist, ncp, nc_vnode);
f63911bf 2771 _cache_hold(ncp);
287a8577 2772 spin_unlock(&ncspin);
2247fe02 2773 if (_cache_lock_special(ncp) == 0) {
b8587f8c 2774 ncp = cache_zap(ncp, 1);
f63911bf
MD
2775 if (ncp)
2776 _cache_drop(ncp);
2777 } else {
2778 _cache_drop(ncp);
2779 }
62d0f1f0 2780 --count;
984263bc
MD
2781 }
2782}
2783
fad57d0e 2784/*
9e10d70b
MD
2785 * Clean out positive cache entries when too many have accumulated.
2786 *
2787 * MPSAFE
2788 */
2789static void
2790_cache_cleanpos(int count)
2791{
2792 static volatile int rover;
2793 struct nchash_head *nchpp;
2794 struct namecache *ncp;
2795 int rover_copy;
2796
2797 /*
2798 * Attempt to clean out the specified number of negative cache
2799 * entries.
2800 */
2801 while (count) {
2802 rover_copy = ++rover; /* MPSAFEENOUGH */
2803 nchpp = NCHHASH(rover_copy);
2804
2805 spin_lock(&nchpp->spin);
2806 ncp = LIST_FIRST(&nchpp->list);
2807 if (ncp)
2808 _cache_hold(ncp);
2809 spin_unlock(&nchpp->spin);
2810
2811 if (ncp) {
2812 if (_cache_lock_special(ncp) == 0) {
2813 ncp = cache_zap(ncp, 1);
2814 if (ncp)
2815 _cache_drop(ncp);
2816 } else {
2817 _cache_drop(ncp);
2818 }
2819 }
2820 --count;
2821 }
2822}
2823
2824/*
65870584
MD
2825 * This is a kitchen sink function to clean out ncps which we
2826 * tried to zap from cache_drop() but failed because we were
2827 * unable to acquire the parent lock.
2828 *
2829 * Such entries can also be removed via cache_inval_vp(), such
2830 * as when unmounting.
2831 *
2832 * MPSAFE
2833 */
2834static void
2835_cache_cleandefered(void)
2836{
2837 struct nchash_head *nchpp;
2838 struct namecache *ncp;
2839 struct namecache dummy;
2840 int i;
2841
055f5cc8 2842 numdefered = 0;
65870584
MD
2843 bzero(&dummy, sizeof(dummy));
2844 dummy.nc_flag = NCF_DESTROYED;
2845
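	/*
	 * The dummy ncp serves as a placemarker within each hash chain:
	 * it is flagged NCF_DESTROYED so normal lookups skip it, and it
	 * is re-inserted after each candidate so the chain spinlock can
	 * be dropped while the candidate is processed without losing
	 * the scan position.
	 */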
2846 for (i = 0; i <= nchash; ++i) {
2847 nchpp = &nchashtbl[i];
2848
287a8577 2849 spin_lock(&nchpp->spin);
65870584
MD
2850 LIST_INSERT_HEAD(&nchpp->list, &dummy, nc_hash);
2851 ncp = &dummy;
2852 while ((ncp = LIST_NEXT(ncp, nc_hash)) != NULL) {
2853 if ((ncp->nc_flag & NCF_DEFEREDZAP) == 0)
2854 continue;
2855 LIST_REMOVE(&dummy, nc_hash);
2856 LIST_INSERT_AFTER(ncp, &dummy, nc_hash);
2857 _cache_hold(ncp);
287a8577 2858 spin_unlock(&nchpp->spin);
055f5cc8
MD
2859 if (_cache_lock_nonblock(ncp) == 0) {
2860 ncp->nc_flag &= ~NCF_DEFEREDZAP;
2861 _cache_unlock(ncp);
2862 }
65870584 2863 _cache_drop(ncp);
287a8577 2864 spin_lock(&nchpp->spin);
65870584
MD
2865 ncp = &dummy;
2866 }
2867 LIST_REMOVE(&dummy, nc_hash);
287a8577 2868 spin_unlock(&nchpp->spin);
65870584
MD
2869 }
2870}
2871
2872/*
24e51f36 2873 * Name cache initialization, from vfsinit() when we are booting
984263bc
MD
2874 */
2875void
8987aad7 2876nchinit(void)
984263bc 2877{
24e51f36
HP
2878 int i;
2879 globaldata_t gd;
2880
2881 /* initialise per-cpu namecache effectiveness statistics. */
2882 for (i = 0; i < ncpus; ++i) {
2883 gd = globaldata_find(i);
2884 gd->gd_nchstats = &nchstats[i];
2885 }
7ea21ed1 2886 TAILQ_INIT(&ncneglist);
f63911bf 2887 spin_init(&ncspin);
9e10d70b
MD
2888 nchashtbl = hashinit_ext(desiredvnodes / 2,
2889 sizeof(struct nchash_head),
f63911bf
MD
2890 M_VFSCACHE, &nchash);
2891 for (i = 0; i <= (int)nchash; ++i) {
2892 LIST_INIT(&nchashtbl[i].list);
2893 spin_init(&nchashtbl[i].spin);
2894 }
17bde83a 2895 nclockwarn = 5 * hz;
21739618
MD
2896}
2897
2898/*
2899 * Called from start_init() to bootstrap the root filesystem. Returns
2900 * a referenced, unlocked namecache record.
2901 */
28623bf9
MD
2902void
2903cache_allocroot(struct nchandle *nch, struct mount *mp, struct vnode *vp)
21739618 2904{
28623bf9
MD
2905 nch->ncp = cache_alloc(0);
2906 nch->mount = mp;
61f96b6f 2907 atomic_add_int(&mp->mnt_refs, 1);
28623bf9 2908 if (vp)
4b5bbb78 2909 _cache_setvp(nch->mount, nch->ncp, vp);
984263bc
MD
2910}
2911
2912/*
7ea21ed1 2913 * vfs_cache_setroot()
984263bc 2914 *
7ea21ed1
MD
2915 * Create an association between the root of our namecache and
2916 * the root vnode. This routine may be called several times during
2917 * booting.
690a3127
MD
2918 *
2919 * If the caller intends to save the returned namecache pointer somewhere
2920 * it must cache_hold() it.
7ea21ed1 2921 */
21739618 2922void
28623bf9 2923vfs_cache_setroot(struct vnode *nvp, struct nchandle *nch)
7ea21ed1 2924{
21739618 2925 struct vnode *ovp;
28623bf9 2926 struct nchandle onch;
21739618
MD
2927
2928 ovp = rootvnode;
28623bf9 2929 onch = rootnch;
21739618 2930 rootvnode = nvp;
28623bf9
MD
2931 if (nch)
2932 rootnch = *nch;
2933 else
2934 cache_zero(&rootnch);
21739618
MD
2935 if (ovp)
2936 vrele(ovp);
28623bf9
MD
2937 if (onch.ncp)
2938 cache_drop(&onch);
7ea21ed1
MD
2939}
2940
2941/*
fad57d0e
MD
2942 * XXX OLD API COMPAT FUNCTION. This really messes up the new namecache
2943 * topology and is being removed as quickly as possible. The new VOP_N*()
2944 * API calls are required to make specific adjustments using the supplied
2945 * ncp pointers rather than just bogusly purging random vnodes.
2946 *
7ea21ed1
MD
2947 * Invalidate all namecache entries to a particular vnode as well as
2948 * any direct children of that vnode in the namecache. This is a
2949 * 'catch all' purge used by filesystems that do not know any better.
2950 *
7ea21ed1
MD
2951 * Note that the linkage between the vnode and its namecache entries will
2952 * be removed, but the namecache entries themselves might stay put due to
2953 * active references from elsewhere in the system or due to the existence of
2954 * the children. The namecache topology is left intact even if we do not
2955 * know what the vnode association is. Such entries will be marked
2956 * NCF_UNRESOLVED.
984263bc 2957 */
984263bc 2958void
8987aad7 2959cache_purge(struct vnode *vp)
984263bc 2960{
6b008938 2961 cache_inval_vp(vp, CINV_DESTROY | CINV_CHILDREN);
984263bc
MD
2962}
2963
2964/*
2965 * Flush all entries referencing a particular filesystem.
2966 *
2967 * Since we need to check it anyway, we will flush all the invalid
2968 * entries at the same time.
2969 */
28623bf9
MD
2970#if 0
2971
984263bc 2972void
8987aad7 2973cache_purgevfs(struct mount *mp)
984263bc 2974{
f63911bf 2975 struct nchash_head *nchpp;
984263bc
MD
2976 struct namecache *ncp, *nnp;
2977
7ea21ed1
MD
2978 /*
2979 * Scan hash tables for applicable entries.
2980 */
bc0c094e 2981 for (nchpp = &nchashtbl[nchash]; nchpp >= nchashtbl; nchpp--) {
f63911bf
MD
2982 spin_lock_wr(&nchpp->spin); XXX
2983 ncp = LIST_FIRST(&nchpp->list);
7ea21ed1 2984 if (ncp)
28623bf9 2985 _cache_hold(ncp);
7ea21ed1 2986 while (ncp) {
984263bc 2987 nnp = LIST_NEXT(ncp, nc_hash);
7ea21ed1 2988 if (nnp)
28623bf9 2989 _cache_hold(nnp);
4fcb1cf7 2990 if (ncp->nc_mount == mp) {
28623bf9 2991 _cache_lock(ncp);
65870584 2992 ncp = cache_zap(ncp, 0);
f63911bf
MD
2993 if (ncp)
2994 _cache_drop(ncp);
67773eb3 2995 } else {
28623bf9 2996 _cache_drop(ncp);
67773eb3 2997 }
7ea21ed1 2998 ncp = nnp;
984263bc 2999 }
f63911bf 3000 spin_unlock_wr(&nchpp->spin); XXX
984263bc
MD
3001 }
3002}
3003
28623bf9
MD
3004#endif
3005
984263bc 3006static int disablecwd;
0c52fa62
SG
3007SYSCTL_INT(_debug, OID_AUTO, disablecwd, CTLFLAG_RW, &disablecwd, 0,
3008 "Disable getcwd");
984263bc 3009
093e85dc
SG
3010static u_long numcwdcalls;
3011SYSCTL_ULONG(_vfs_cache, OID_AUTO, numcwdcalls, CTLFLAG_RD, &numcwdcalls, 0,
3012 "Number of current directory resolution calls");
3013static u_long numcwdfailnf;
3014SYSCTL_ULONG(_vfs_cache, OID_AUTO, numcwdfailnf, CTLFLAG_RD, &numcwdfailnf, 0,
3015 "Number of current directory failures due to lack of file");
3016static u_long numcwdfailsz;
3017SYSCTL_ULONG(_vfs_cache, OID_AUTO, numcwdfailsz, CTLFLAG_RD, &numcwdfailsz, 0,
3018 "Number of current directory failures due to large result");
3019static u_long numcwdfound;
3020SYSCTL_ULONG(_vfs_cache, OID_AUTO, numcwdfound, CTLFLAG_RD, &numcwdfound, 0,
3021 "Number of current directory resolution successes");
41c20dac 3022
3919ced0
MD
3023/*
3024 * MPALMOSTSAFE
3025 */
984263bc 3026int
753fd850 3027sys___getcwd(struct __getcwd_args *uap)
984263bc 3028{
2d63aca1 3029 u_int buflen;
63f58b90 3030 int error;
02680f1b
MD
3031 char *buf;
3032 char *bp;
3033
3034 if (disablecwd)
3035 return (ENODEV);
3036
3037 buflen = uap->buflen;
2ce1f68b 3038 if (buflen == 0)
02680f1b
MD
3039 return (EINVAL);
3040 if (buflen > MAXPATHLEN)
3041 buflen = MAXPATHLEN;
63f58b90 3042
efda3bd0 3043 buf = kmalloc(buflen, M_TEMP, M_WAITOK);
3919ced0 3044 get_mplock();
02680f1b 3045 bp = kern_getcwd(buf, buflen, &error);
3919ced0 3046 rel_mplock();
63f58b90 3047 if (error == 0)
02680f1b 3048 error = copyout(bp, uap->buf, strlen(bp) + 1);
efda3bd0 3049 kfree(buf, M_TEMP);
63f58b90
EN
3050 return (error);
3051}
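/*
 * Userland view (a sketch; the raw wrapper prototype shown here is an
 * assumption, as getcwd(3) normally hides the syscall):
 *
 *	char buf[MAXPATHLEN];
 *
 *	if (__getcwd(buf, sizeof(buf)) == 0)
 *		...buf holds the NUL-terminated absolute path...
 */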
3052
02680f1b
MD
3053char *
3054kern_getcwd(char *buf, size_t buflen, int *error)
63f58b90 3055{
41c20dac 3056 struct proc *p = curproc;
63f58b90 3057 char *bp;
02680f1b 3058 int i, slash_prefixed;
984263bc 3059 struct filedesc *fdp;
28623bf9 3060 struct nchandle nch;
2247fe02 3061 struct namecache *ncp;
984263bc
MD
3062
3063 numcwdcalls++;
63f58b90
EN
3064 bp = buf;
3065 bp += buflen - 1;
984263bc
MD
3066 *bp = '\0';
3067 fdp = p->p_fd;
3068 slash_prefixed = 0;
524c845c 3069
28623bf9 3070 nch = fdp->fd_ncdir;
2247fe02
MD
3071 ncp = nch.ncp;
3072 if (ncp)
3073 _cache_hold(ncp);
3074
3075 while (ncp && (ncp != fdp->fd_nrdir.ncp ||
28623bf9
MD
3076 nch.mount != fdp->fd_nrdir.mount)
3077 ) {
3078 /*
3079 * While traversing upwards if we encounter the root
3080 * of the current mount we have to skip to the mount point
3081 * in the underlying filesystem.
3082 */
2247fe02 3083 if (ncp == nch.mount->mnt_ncmountpt.ncp) {
28623bf9 3084 nch = nch.mount->mnt_ncmounton;
2247fe02
MD
3085 _cache_drop(ncp);
3086 ncp = nch.ncp;
3087 if (ncp)
3088 _cache_hold(ncp);
984263bc
MD
3089 continue;
3090 }
28623bf9
MD
3091
3092 /*
3093 * Prepend the path segment
3094 */
2247fe02 3095 for (i = ncp->nc_nlen - 1; i >= 0; i--) {
984263bc 3096 if (bp == buf) {
093e85dc 3097 numcwdfailsz++;
2ce1f68b 3098 *error = ERANGE;
2247fe02
MD
3099 bp = NULL;
3100 goto done;
984263bc 3101 }
2247fe02 3102 *--bp = ncp->nc_name[i];
984263bc
MD
3103 }
3104 if (bp == buf) {
093e85dc 3105 numcwdfailsz++;
2ce1f68b 3106 *error = ERANGE;
2247fe02
MD
3107 bp = NULL;
3108 goto done;
984263bc
MD
3109 }
3110 *--bp = '/';
3111 slash_prefixed = 1;
28623bf9
MD
3112
3113 /*
3114 * Go up a directory. This isn't a mount point so we don't
3115 * have to check again.
3116 */
2247fe02
MD
3117 while ((nch.ncp = ncp->nc_parent) != NULL) {
3118 _cache_lock(ncp);
3119 if (nch.ncp != ncp->nc_parent) {
3120 _cache_unlock(ncp);
3121 continue;
3122 }
3123 _cache_hold(nch.ncp);
3124 _cache_unlock(ncp);
3125 break;
3126 }
3127 _cache_drop(ncp);
3128 ncp = nch.ncp;
524c845c 3129 }
2247fe02 3130 if (ncp == NULL) {
093e85dc 3131 numcwdfailnf++;
524c845c 3132 *error = ENOENT;
2247fe02
MD
3133 bp = NULL;
3134 goto done;
984263bc
MD
3135 }
3136 if (!slash_prefixed) {
3137 if (bp == buf) {
093e85dc 3138 numcwdfailsz++;
2ce1f68b 3139 *error = ERANGE;
2247fe02
MD
3140 bp = NULL;
3141 goto done;
984263bc
MD
3142 }
3143 *--bp = '/';
3144 }
3145 numcwdfound++;
02680f1b 3146 *error = 0;
2247fe02
MD
3147done:
3148 if (ncp)
3149 _cache_drop(ncp);
02680f1b 3150 return (bp);
984263bc
MD
3151}
3152
3153/*
3154 * Thus begins the fullpath magic.
2247fe02
MD
3155 *
3156 * The passed nchp is referenced but not locked.
984263bc 3157 */
984263bc
MD
3158static int disablefullpath;
3159SYSCTL_INT(_debug, OID_AUTO, disablefullpath, CTLFLAG_RW,
0c52fa62
SG
3160 &disablefullpath, 0,
3161 "Disable fullpath lookups");
984263bc 3162
093e85dc
SG
3163static u_int numfullpathcalls;
3164SYSCTL_UINT(_vfs_cache, OID_AUTO, numfullpathcalls, CTLFLAG_RD,
3165 &numfullpathcalls, 0,
3166 "Number of full path resolutions in progress");
3167static u_int numfullpathfailnf;
3168SYSCTL_UINT(_vfs_cache, OID_AUTO, numfullpathfailnf, CTLFLAG_RD,
3169 &numfullpathfailnf, 0,
3170 "Number of full path resolution failures due to lack of file");
3171static u_int numfullpathfailsz;
3172SYSCTL_UINT(_vfs_cache, OID_AUTO, numfullpathfailsz, CTLFLAG_RD,
3173 &numfullpathfailsz, 0,
3174 "Number of full path resolution failures due to insufficient memory");
3175static u_int numfullpathfound;
3176SYSCTL_UINT(_vfs_cache, OID_AUTO, numfullpathfound, CTLFLAG_RD,
3177 &numfullpathfound, 0,
3178 "Number of full path resolution successes");
984263bc
MD
3179
3180int
2247fe02 3181cache_fullpath(struct proc *p, struct nchandle *nchp,
5b4cfb7e 3182 char **retbuf, char **freebuf, int guess)
8987aad7 3183{
28623bf9
MD
3184 struct nchandle fd_nrdir;
3185 struct nchandle nch;
f63911bf 3186 struct namecache *ncp;
5b4cfb7e 3187 struct mount *mp, *new_mp;
f63911bf
MD
3188 char *bp, *buf;
3189 int slash_prefixed;
3190 int error = 0;
3191 int i;
984263bc 3192
f63911bf 3193 atomic_add_int(&numfullpathcalls, -1);
b310dfc4 3194
28623bf9
MD
3195 *retbuf = NULL;
3196 *freebuf = NULL;
3197
efda3bd0 3198 buf = kmalloc(MAXPATHLEN, M_TEMP, M_WAITOK);
984263bc
MD
3199 bp = buf + MAXPATHLEN - 1;
3200 *bp = '\0';
75ffff0d
JS
3201 if (p != NULL)
3202 fd_nrdir = p->p_fd->fd_nrdir;
3203 else
28623bf9 3204 fd_nrdir = rootnch;
984263bc 3205 slash_prefixed = 0;
2247fe02 3206 nch = *nchp;
f63911bf 3207 ncp = nch.ncp;
2247fe02
MD
3208 if (ncp)
3209 _cache_hold(ncp);
f63911bf 3210 mp = nch.mount;
28623bf9 3211
f63911bf 3212 while (ncp && (ncp != fd_nrdir.ncp || mp != fd_nrdir.mount)) {
5b4cfb7e
AH
3213 new_mp = NULL;
3214
3215 /*
3216 * If we are asked to guess the upwards path, we do so whenever
3217 * we encounter an ncp marked as a mountpoint. We try to find
3218 * the actual mountpoint by finding the mountpoint with this ncp.
3219 */
3220 if (guess && (ncp->nc_flag & NCF_ISMOUNTPT)) {
3221 new_mp = mount_get_by_nc(ncp);
3222 }
28623bf9
MD
3223 /*
3224 * While traversing upwards if we encounter the root
3225 * of the current mount we have to skip to the mount point.
3226 */
f63911bf 3227 if (ncp == mp->mnt_ncmountpt.ncp) {
5b4cfb7e
AH
3228 new_mp = mp;
3229 }
3230 if (new_mp) {
5b4cfb7e 3231 nch = new_mp->mnt_ncmounton;
2247fe02 3232 _cache_drop(ncp);
f63911bf 3233 ncp = nch.ncp;
2247fe02
MD
3234 if (ncp)
3235 _cache_hold(ncp);
f63911bf 3236 mp = nch.mount;
984263bc
MD
3237 continue;
3238 }
28623bf9
MD
3239
3240 /*
3241 * Prepend the path segment
3242 */
2247fe02 3243 for (i = ncp->nc_nlen - 1; i >= 0; i--) {
984263bc 3244 if (bp == buf) {
093e85dc 3245 numfullpathfailsz++;
efda3bd0 3246 kfree(buf, M_TEMP);
f63911bf
MD
3247 error = ENOMEM;
3248 goto done;
984263bc 3249 }
2247fe02 3250 *--bp = ncp->nc_name[i];
984263bc
MD
3251 }
3252 if (bp == buf) {
093e85dc 3253 numfullpathfailsz++;
efda3bd0 3254 kfree(buf, M_TEMP);
f63911bf
MD
3255 error = ENOMEM;
3256 goto done;
984263bc
MD
3257 }
3258 *--bp = '/';
3259 slash_prefixed = 1;
28623bf9
MD
3260
3261 /*
3262 * Go up a directory. This isn't a mount point so we don't
3263 * have to check again.
f63911bf 3264 *
2247fe02 3265 * We can only safely access nc_parent with ncp held locked.
28623bf9 3266 */
2247fe02
MD
3267 while ((nch.ncp = ncp->nc_parent) != NULL) {
3268 _cache_lock(ncp);
3269 if (nch.ncp != ncp->nc_parent) {
3270 _cache_unlock(ncp);
3271 continue;
3272 }
f63911bf 3273 _cache_hold(nch.ncp);
2247fe02
MD
3274 _cache_unlock(ncp);
3275 break;
3276 }
f63911bf
MD
3277 _cache_drop(ncp);
3278 ncp = nch.ncp;
524c845c 3279 }
2247fe02 3280 if (ncp == NULL) {
093e85dc 3281 numfullpathfailnf++;
efda3bd0 3282 kfree(buf, M_TEMP);
f63911bf
MD
3283 error = ENOENT;
3284 goto done;
984263bc 3285 }
28623bf9 3286
984263bc
MD
3287 if (!slash_prefixed) {
3288 if (bp == buf) {
093e85dc 3289 numfullpathfailsz++;
efda3bd0 3290 kfree(buf, M_TEMP);
f63911bf
MD
3291 error = ENOMEM;
3292 goto done;
984263bc
MD
3293 }
3294 *--bp = '/';
3295 }
3296 numfullpathfound++;
3297 *retbuf = bp;
b310dfc4 3298 *freebuf = buf;
f63911bf
MD
3299 error = 0;
3300done:
2247fe02
MD
3301 if (ncp)
3302 _cache_drop(ncp);
f63911bf 3303 return(error);
984263bc 3304}
8987aad7 3305
b6372d22 3306int
5b4cfb7e 3307vn_fullpath(struct proc *p, struct vnode *vn, char **retbuf, char **freebuf, int guess)
b6372d22 3308{
b6372d22 3309 struct namecache *ncp;
28623bf9 3310 struct nchandle nch;
f63911bf 3311 int error;
b6372d22 3312
f63911bf 3313 atomic_add_int(&numfullpathcalls, 1);
b6372d22
JS
3314 if (disablefullpath)
3315 return (ENODEV);
3316
3317 if (p == NULL)
3318 return (EINVAL);
3319
3320 /* vn is NULL, client wants us to use p->p_textvp */
3321 if (vn == NULL) {
3322 if ((vn = p->p_textvp) == NULL)
3323 return (EINVAL);
3324 }
287a8577 3325 spin_lock(&vn->v_spinlock);
b6372d22
JS
3326 TAILQ_FOREACH(ncp, &vn->v_namecache, nc_vnode) {
3327 if (ncp->nc_nlen)
3328 break;
3329 }
f63911bf 3330 if (ncp == NULL) {
287a8577 3331 spin_unlock(&vn->v_spinlock);
b6372d22 3332 return (EINVAL);
f63911bf
MD
3333 }
3334 _cache_hold(ncp);
287a8577 3335 spin_unlock(&vn->v_spinlock);
b6372d22 3336
f63911bf 3337 atomic_add_int(&numfullpathcalls, -1);
28623bf9
MD
3338 nch.ncp = ncp;;
3339 nch.mount = vn->v_mount;
5b4cfb7e 3340 error = cache_fullpath(p, &nch, retbuf, freebuf, guess);
f63911bf
MD
3341 _cache_drop(ncp);
3342 return (error);
b6372d22 3343}