kernel - Remove vfs.cache.numfullpath* sysctl statistics
[dragonfly.git] / sys / kern / vfs_cache.c
/*
 * Copyright (c) 2003,2004,2009 The DragonFly Project.  All rights reserved.
 *
 * This code is derived from software contributed to The DragonFly Project
 * by Matthew Dillon <dillon@backplane.com>
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 * 3. Neither the name of The DragonFly Project nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific, prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * Copyright (c) 1989, 1993, 1995
 *	The Regents of the University of California.  All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * Poul-Henning Kamp of the FreeBSD Project.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/sysctl.h>
#include <sys/mount.h>
#include <sys/vnode.h>
#include <sys/malloc.h>
#include <sys/sysproto.h>
#include <sys/spinlock.h>
#include <sys/proc.h>
#include <sys/namei.h>
#include <sys/nlookup.h>
#include <sys/filedesc.h>
#include <sys/fnv_hash.h>
#include <sys/globaldata.h>
#include <sys/kern_syscall.h>
#include <sys/dirent.h>
#include <ddb/ddb.h>

#include <sys/spinlock2.h>

#define MAX_RECURSION_DEPTH	64

/*
 * Random lookups in the cache are accomplished with a hash table using
 * a hash key of (nc_src_vp, name).  Each hash chain has its own spin lock.
 *
 * Negative entries may exist and correspond to resolved namecache
 * structures where nc_vp is NULL.  In a negative entry, NCF_WHITEOUT
 * will be set if the entry corresponds to a whited-out directory entry
 * (versus simply not finding the entry at all).  pcpu_ncache[n].neg_list
 * is locked via pcpu_ncache[n].neg_spin;
 *
 * MPSAFE RULES:
 *
 * (1) A ncp must be referenced before it can be locked.
 *
 * (2) A ncp must be locked in order to modify it.
 *
 * (3) ncp locks are always ordered child -> parent.  That may seem
 *     backwards but forward scans use the hash table and thus can hold
 *     the parent unlocked when traversing downward.  (See the sketch
 *     after this comment.)
 *
 *     This allows insert/rename/delete/dot-dot and other operations
 *     to use ncp->nc_parent links.
 *
 *     This also prevents a locked up e.g. NFS node from creating a
 *     chain reaction all the way back to the root vnode / namecache.
 *
 * (4) parent linkages require both the parent and child to be locked.
 */
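
#if 0
/*
 * Illustrative sketch only (not compiled): per MPSAFE rule (3) above,
 * when both a child and its parent must be held, the child is locked
 * first and the parent second, mirroring what _cache_unlink_parent()
 * does further below.  The example_* function name is hypothetical and
 * not part of the real API.
 */
static void
example_lock_child_then_parent(struct namecache *child, struct namecache *par)
{
	_cache_hold(child);
	_cache_lock(child);		/* child first ... */
	_cache_hold(par);
	_cache_lock(par);		/* ... then the parent */
	/* modify the parent<->child linkage here */
	_cache_put(par);		/* unlock + drop parent */
	_cache_put(child);		/* unlock + drop child */
}
#endif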

/*
 * Structures associated with name caching.
 */
#define NCHHASH(hash)		(&nchashtbl[(hash) & nchash])
#define MINNEG			1024
#define MINPOS			1024
#define NCMOUNT_NUMCACHE	16301	/* prime number */

MALLOC_DEFINE(M_VFSCACHE, "vfscache", "VFS name cache entries");

LIST_HEAD(nchash_list, namecache);

/*
 * Don't cachealign, but at least pad to 32 bytes so entries
 * don't cross a cache line.
 */
struct nchash_head {
	struct nchash_list list;	/* 16 bytes */
	struct spinlock	spin;		/* 8 bytes */
	long	pad01;			/* 8 bytes */
};

struct ncmount_cache {
	struct spinlock	spin;
	struct namecache *ncp;
	struct mount *mp;
	int	isneg;		/* if != 0 mp is originator and not target */
} __cachealign;

struct pcpu_ncache {
	struct spinlock		neg_spin;	/* for neg_list and neg_count */
	struct namecache_list	neg_list;
	long			neg_count;
	long			vfscache_negs;
	long			vfscache_count;
	long			vfscache_leafs;
} __cachealign;

static struct nchash_head	*nchashtbl;
static struct pcpu_ncache	*pcpu_ncache;
static struct ncmount_cache	ncmount_cache[NCMOUNT_NUMCACHE];

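#if 0
/*
 * Illustrative sketch only (not compiled): how the hash table declared
 * above is typically consulted.  The chain returned by NCHHASH() is
 * protected by its own embedded spinlock and entries are linked through
 * nc_hash.  A real lookup also compares the name and takes a reference
 * on the ncp before releasing the chain lock.  The example_* name is
 * hypothetical.
 */
static struct namecache *
example_hash_scan(u_int32_t hash, struct namecache *par)
{
	struct nchash_head *nchpp = NCHHASH(hash);
	struct namecache *ncp;

	spin_lock(&nchpp->spin);
	LIST_FOREACH(ncp, &nchpp->list, nc_hash) {
		if (ncp->nc_parent == par)
			break;
	}
	spin_unlock(&nchpp->spin);
	return (ncp);
}
#endif
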
/*
 * ncvp_debug - debug cache_fromvp().  This is used by the NFS server
 * to create the namecache infrastructure leading to a dangling vnode.
 *
 *	0	Only errors are reported
 *	1	Successes are reported
 *	2	Successes + the whole directory scan is reported
 *	3	Force the directory scan code to run as if the parent vnode
 *		did not have a namecache record, even if it does have one.
 */
static int	ncvp_debug;
SYSCTL_INT(_debug, OID_AUTO, ncvp_debug, CTLFLAG_RW, &ncvp_debug, 0,
    "Namecache debug level (0-3)");

static u_long	nchash;			/* size of hash table */
SYSCTL_ULONG(_debug, OID_AUTO, nchash, CTLFLAG_RD, &nchash, 0,
    "Size of namecache hash table");

static int	ncnegflush = 10;	/* burst for negative flush */
SYSCTL_INT(_debug, OID_AUTO, ncnegflush, CTLFLAG_RW, &ncnegflush, 0,
    "Batch flush negative entries");

static int	ncposflush = 10;	/* burst for positive flush */
SYSCTL_INT(_debug, OID_AUTO, ncposflush, CTLFLAG_RW, &ncposflush, 0,
    "Batch flush positive entries");

static int	ncnegfactor = 16;	/* ratio of negative entries */
SYSCTL_INT(_debug, OID_AUTO, ncnegfactor, CTLFLAG_RW, &ncnegfactor, 0,
    "Ratio of namecache negative entries");

static int	nclockwarn;		/* warn on locked entries in ticks */
SYSCTL_INT(_debug, OID_AUTO, nclockwarn, CTLFLAG_RW, &nclockwarn, 0,
    "Warn on locked namecache entries in ticks");

static int	numdefered;		/* number of deferred zaps */
SYSCTL_INT(_debug, OID_AUTO, numdefered, CTLFLAG_RD, &numdefered, 0,
    "Number of deferred zaps");

static int	ncposlimit;		/* limit on positive entries */
SYSCTL_INT(_debug, OID_AUTO, ncposlimit, CTLFLAG_RW, &ncposlimit, 0,
    "Limit on the number of positive namecache entries");

static int	ncp_shared_lock_disable = 0;
SYSCTL_INT(_debug, OID_AUTO, ncp_shared_lock_disable, CTLFLAG_RW,
    &ncp_shared_lock_disable, 0, "Disable shared namecache locks");

SYSCTL_INT(_debug, OID_AUTO, vnsize, CTLFLAG_RD, 0, sizeof(struct vnode),
    "sizeof(struct vnode)");
SYSCTL_INT(_debug, OID_AUTO, ncsize, CTLFLAG_RD, 0, sizeof(struct namecache),
    "sizeof(struct namecache)");

static int ncmount_cache_enable = 1;
SYSCTL_INT(_debug, OID_AUTO, ncmount_cache_enable, CTLFLAG_RW,
    &ncmount_cache_enable, 0, "mount point cache");

static __inline void _cache_drop(struct namecache *ncp);
static int cache_resolve_mp(struct mount *mp);
static struct vnode *cache_dvpref(struct namecache *ncp);
static void _cache_lock(struct namecache *ncp);
static void _cache_setunresolved(struct namecache *ncp);
static void _cache_cleanneg(long count);
static void _cache_cleanpos(long count);
static void _cache_cleandefered(void);
static void _cache_unlink(struct namecache *ncp);
#if 0
static void vfscache_rollup_all(void);
#endif

/*
 * The new name cache statistics
 */
SYSCTL_NODE(_vfs, OID_AUTO, cache, CTLFLAG_RW, 0, "Name cache statistics");
static long vfscache_negs;
SYSCTL_LONG(_vfs_cache, OID_AUTO, numneg, CTLFLAG_RD, &vfscache_negs, 0,
    "Number of negative namecache entries");
static long vfscache_count;
SYSCTL_LONG(_vfs_cache, OID_AUTO, numcache, CTLFLAG_RD, &vfscache_count, 0,
    "Number of namecache entries");
static long vfscache_leafs;
SYSCTL_LONG(_vfs_cache, OID_AUTO, numleafs, CTLFLAG_RD, &vfscache_leafs, 0,
    "Number of namecache leaf entries");

struct nchstats nchstats[SMP_MAXCPU];
/*
 * Export VFS cache effectiveness statistics to user-land.
 *
 * The statistics are left for aggregation to user-land so
 * neat things can be achieved, like observing per-CPU cache
 * distribution.
 */
static int
sysctl_nchstats(SYSCTL_HANDLER_ARGS)
{
	struct globaldata *gd;
	int i, error;

	error = 0;
	for (i = 0; i < ncpus; ++i) {
		gd = globaldata_find(i);
		if ((error = SYSCTL_OUT(req, (void *)&(*gd->gd_nchstats),
			sizeof(struct nchstats))))
			break;
	}

	return (error);
}
SYSCTL_PROC(_vfs_cache, OID_AUTO, nchstats, CTLTYPE_OPAQUE|CTLFLAG_RD,
  0, 0, sysctl_nchstats, "S,nchstats", "VFS cache effectiveness statistics");
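
/*
 * Note: a userland read of vfs.cache.nchstats receives ncpus copies of
 * struct nchstats back to back (one per cpu, in cpu index order) and is
 * expected to sum them itself.
 */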

static struct namecache *cache_zap(struct namecache *ncp, int nonblock);

/*
 * Cache mount points and namecache records in order to avoid unnecessary
 * atomic ops on mnt_refs and ncp->refs.  This improves concurrent SMP
 * performance and is particularly important on multi-socket systems to
 * reduce cache-line ping-ponging.
 *
 * Try to keep the pcpu structure within one cache line (~64 bytes).
 */
#define MNTCACHE_COUNT 5

struct mntcache {
	struct mount	*mntary[MNTCACHE_COUNT];
	struct namecache *ncp1;
	struct namecache *ncp2;
	struct nchandle  ncdir;
	int		iter;
	int		unused01;
} __cachealign;

static struct mntcache	pcpu_mntcache[MAXCPU];

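/*
 * Per-cpu mount reference cache helpers.  _cache_mntrel() tries to park
 * the reference being released in an empty per-cpu slot instead of
 * atomically decrementing mp->mnt_refs, and _cache_mntref() tries to
 * steal such a parked reference back before falling through to the
 * atomic increment, so paired operations on the same cpu avoid touching
 * the shared mnt_refs field.
 */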
static
void
_cache_mntref(struct mount *mp)
{
	struct mntcache *cache = &pcpu_mntcache[mycpu->gd_cpuid];
	int i;

	for (i = 0; i < MNTCACHE_COUNT; ++i) {
		if (cache->mntary[i] != mp)
			continue;
		if (atomic_cmpset_ptr((void *)&cache->mntary[i], mp, NULL))
			return;
	}
	atomic_add_int(&mp->mnt_refs, 1);
}

static
void
_cache_mntrel(struct mount *mp)
{
	struct mntcache *cache = &pcpu_mntcache[mycpu->gd_cpuid];
	int i;

	for (i = 0; i < MNTCACHE_COUNT; ++i) {
		if (cache->mntary[i] == NULL) {
			mp = atomic_swap_ptr((void *)&cache->mntary[i], mp);
			if (mp == NULL)
				return;
		}
	}
	i = (int)((uint32_t)++cache->iter % (uint32_t)MNTCACHE_COUNT);
	mp = atomic_swap_ptr((void *)&cache->mntary[i], mp);
	if (mp)
		atomic_add_int(&mp->mnt_refs, -1);
}

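#if 0
/*
 * Illustrative sketch only (not compiled): a ref/rel pair against the
 * same mount is cheap because the release parks the reference in the
 * per-cpu cache and a later _cache_mntref() on the same cpu can steal
 * it back without touching mp->mnt_refs.  The example_* name is
 * hypothetical.
 */
static void
example_mntcache_pairing(struct mount *mp)
{
	_cache_mntref(mp);	/* may consume a parked per-cpu ref */
	/* ... use mp ... */
	_cache_mntrel(mp);	/* parks the ref for the next user */
}
#endif
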
/*
 * Clears all cached mount points on all cpus.  This routine should only
 * be called when we are waiting for a mount to clear, e.g. so we can
 * unmount.
 */
void
cache_clearmntcache(void)
{
	int n;

	for (n = 0; n < ncpus; ++n) {
		struct mntcache *cache = &pcpu_mntcache[n];
		struct namecache *ncp;
		struct mount *mp;
		int i;

		for (i = 0; i < MNTCACHE_COUNT; ++i) {
			if (cache->mntary[i]) {
				mp = atomic_swap_ptr(
					(void *)&cache->mntary[i], NULL);
				if (mp)
					atomic_add_int(&mp->mnt_refs, -1);
			}
		}
		if (cache->ncp1) {
			ncp = atomic_swap_ptr((void *)&cache->ncp1, NULL);
			if (ncp)
				_cache_drop(ncp);
		}
		if (cache->ncp2) {
			ncp = atomic_swap_ptr((void *)&cache->ncp2, NULL);
			if (ncp)
				_cache_drop(ncp);
		}
		if (cache->ncdir.ncp) {
			ncp = atomic_swap_ptr((void *)&cache->ncdir.ncp, NULL);
			if (ncp)
				_cache_drop(ncp);
		}
		if (cache->ncdir.mount) {
			mp = atomic_swap_ptr((void *)&cache->ncdir.mount, NULL);
			if (mp)
				atomic_add_int(&mp->mnt_refs, -1);
		}
	}
}


/*
 * Namespace locking.  The caller must already hold a reference to the
 * namecache structure in order to lock/unlock it.  This function prevents
 * the namespace from being created or destroyed by accessors other than
 * the lock holder.
 *
 * Note that holding a locked namecache structure prevents other threads
 * from making namespace changes (e.g. deleting or creating), prevents
 * vnode association state changes by other threads, and prevents the
 * namecache entry from being resolved or unresolved by other threads.
 *
 * An exclusive lock owner has full authority to associate/disassociate
 * vnodes and resolve/unresolve the locked ncp.
 *
 * A shared lock owner only has authority to acquire the underlying vnode,
 * if any.
 *
 * The primary lock field is nc_lockstatus.  nc_locktd is set after the
 * fact (when locking) or cleared prior to unlocking.
 *
 * WARNING!  Holding a locked ncp will prevent a vnode from being destroyed
 *	     or recycled, but it does NOT help you if the vnode had already
 *	     initiated a recyclement.  If this is important, use cache_get()
 *	     rather than cache_lock() (and deal with the differences in the
 *	     way the refs counter is handled).  Or, alternatively, make an
 *	     unconditional call to cache_validate() or cache_resolve()
 *	     after cache_lock() returns.
 */
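
#if 0
/*
 * Illustrative sketch only (not compiled): per the WARNING above, a
 * caller that needs a usable vnode should prefer the ref+lock helpers
 * over a bare cache_lock().  cache_get() returns either a definitively
 * usable vnode or a definitively unresolved ncp, and cache_put() undoes
 * it.  The example_* name is hypothetical.
 */
static void
example_get_put(struct nchandle *nch)
{
	struct nchandle tmp;

	cache_get(nch, &tmp);
	/* ... tmp.ncp is referenced and locked; operate on it here ... */
	cache_put(&tmp);
}
#endif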
static
void
_cache_lock(struct namecache *ncp)
{
	thread_t td;
	int didwarn;
	int begticks;
	int error;
	u_int count;

	KKASSERT(ncp->nc_refs != 0);
	didwarn = 0;
	begticks = 0;
	td = curthread;

	for (;;) {
		count = ncp->nc_lockstatus;
		cpu_ccfence();

		if ((count & ~(NC_EXLOCK_REQ|NC_SHLOCK_REQ)) == 0) {
			if (atomic_cmpset_int(&ncp->nc_lockstatus,
					      count, count + 1)) {
				/*
				 * The vp associated with a locked ncp must
				 * be held to prevent it from being recycled.
				 *
				 * WARNING!  If VRECLAIMED is set the vnode
				 * could already be in the middle of a recycle.
				 * Callers must use cache_vref() or
				 * cache_vget() on the locked ncp to
				 * validate the vp or set the cache entry
				 * to unresolved.
				 *
				 * NOTE! vhold() is allowed if we hold a
				 *	 lock on the ncp (which we do).
				 */
				ncp->nc_locktd = td;
				if (ncp->nc_vp)
					vhold(ncp->nc_vp);
				break;
			}
			/* cmpset failed */
			continue;
		}
		if (ncp->nc_locktd == td) {
			KKASSERT((count & NC_SHLOCK_FLAG) == 0);
			if (atomic_cmpset_int(&ncp->nc_lockstatus,
					      count, count + 1)) {
				break;
			}
			/* cmpset failed */
			continue;
		}
		tsleep_interlock(&ncp->nc_locktd, 0);
		if (atomic_cmpset_int(&ncp->nc_lockstatus, count,
				      count | NC_EXLOCK_REQ) == 0) {
			/* cmpset failed */
			continue;
		}
		if (begticks == 0)
			begticks = ticks;
		error = tsleep(&ncp->nc_locktd, PINTERLOCKED,
			       "clock", nclockwarn);
		if (error == EWOULDBLOCK) {
			if (didwarn == 0) {
				didwarn = ticks;
				kprintf("[diagnostic] cache_lock: "
					"%s blocked on %p %08x",
					td->td_comm, ncp, count);
				kprintf(" \"%*.*s\"\n",
					ncp->nc_nlen, ncp->nc_nlen,
					ncp->nc_name);
			}
		}
		/* loop */
	}
	if (didwarn) {
		kprintf("[diagnostic] cache_lock: %s unblocked %*.*s after "
			"%d secs\n",
			td->td_comm,
			ncp->nc_nlen, ncp->nc_nlen, ncp->nc_name,
			(int)(ticks + (hz / 2) - begticks) / hz);
	}
}
487
79fd1696
MD
488/*
489 * The shared lock works similarly to the exclusive lock except
490 * nc_locktd is left NULL and we need an interlock (VHOLD) to
491 * prevent vhold() races, since the moment our cmpset_int succeeds
492 * another cpu can come in and get its own shared lock.
493 *
494 * A critical section is needed to prevent interruption during the
495 * VHOLD interlock.
496 */
497static
498void
499_cache_lock_shared(struct namecache *ncp)
500{
501 int didwarn;
502 int error;
503 u_int count;
e8bc8439 504 u_int optreq = NC_EXLOCK_REQ;
79fd1696
MD
505
506 KKASSERT(ncp->nc_refs != 0);
507 didwarn = 0;
508
509 for (;;) {
510 count = ncp->nc_lockstatus;
511 cpu_ccfence();
512
513 if ((count & ~NC_SHLOCK_REQ) == 0) {
514 crit_enter();
515 if (atomic_cmpset_int(&ncp->nc_lockstatus,
516 count,
517 (count + 1) | NC_SHLOCK_FLAG |
518 NC_SHLOCK_VHOLD)) {
519 /*
520 * The vp associated with a locked ncp must
521 * be held to prevent it from being recycled.
522 *
523 * WARNING! If VRECLAIMED is set the vnode
524 * could already be in the middle of a recycle.
525 * Callers must use cache_vref() or
526 * cache_vget() on the locked ncp to
527 * validate the vp or set the cache entry
528 * to unresolved.
529 *
530 * NOTE! vhold() is allowed if we hold a
531 * lock on the ncp (which we do).
532 */
533 if (ncp->nc_vp)
534 vhold(ncp->nc_vp);
535 atomic_clear_int(&ncp->nc_lockstatus,
536 NC_SHLOCK_VHOLD);
537 crit_exit();
538 break;
539 }
540 /* cmpset failed */
541 crit_exit();
542 continue;
543 }
544
545 /*
546 * If already held shared we can just bump the count, but
547 * only allow this if nobody is trying to get the lock
e8bc8439
MD
548 * exclusively. If we are blocking too long ignore excl
549 * requests (which can race/deadlock us).
79fd1696
MD
550 *
551 * VHOLD is a bit of a hack. Even though we successfully
552 * added another shared ref, the cpu that got the first
553 * shared ref might not yet have held the vnode.
554 */
e8bc8439 555 if ((count & (optreq|NC_SHLOCK_FLAG)) == NC_SHLOCK_FLAG) {
79fd1696
MD
556 KKASSERT((count & ~(NC_EXLOCK_REQ |
557 NC_SHLOCK_REQ |
558 NC_SHLOCK_FLAG)) > 0);
559 if (atomic_cmpset_int(&ncp->nc_lockstatus,
560 count, count + 1)) {
561 while (ncp->nc_lockstatus & NC_SHLOCK_VHOLD)
562 cpu_pause();
563 break;
564 }
565 continue;
566 }
567 tsleep_interlock(ncp, 0);
568 if (atomic_cmpset_int(&ncp->nc_lockstatus, count,
569 count | NC_SHLOCK_REQ) == 0) {
570 /* cmpset failed */
571 continue;
572 }
573 error = tsleep(ncp, PINTERLOCKED, "clocksh", nclockwarn);
574 if (error == EWOULDBLOCK) {
e8bc8439 575 optreq = 0;
79fd1696 576 if (didwarn == 0) {
d1083e0d 577 didwarn = ticks - nclockwarn;
79fd1696 578 kprintf("[diagnostic] cache_lock_shared: "
d1083e0d
MD
579 "%s blocked on %p %08x",
580 curthread->td_comm, ncp, count);
79fd1696
MD
581 kprintf(" \"%*.*s\"\n",
582 ncp->nc_nlen, ncp->nc_nlen,
583 ncp->nc_name);
584 }
585 }
586 /* loop */
587 }
588 if (didwarn) {
589 kprintf("[diagnostic] cache_lock_shared: "
d1083e0d
MD
590 "%s unblocked %*.*s after %d secs\n",
591 curthread->td_comm,
79fd1696
MD
592 ncp->nc_nlen, ncp->nc_nlen, ncp->nc_name,
593 (int)(ticks - didwarn) / hz);
594 }
595}
596
2247fe02 597/*
f0181d63
MD
598 * Lock ncp exclusively, return 0 on success.
599 *
65870584
MD
600 * NOTE: nc_refs may be zero if the ncp is interlocked by circumstance,
601 * such as the case where one of its children is locked.
2247fe02 602 */
f63911bf
MD
603static
604int
605_cache_lock_nonblock(struct namecache *ncp)
606{
607 thread_t td;
2247fe02 608 u_int count;
f63911bf 609
f63911bf
MD
610 td = curthread;
611
612 for (;;) {
79fd1696 613 count = ncp->nc_lockstatus;
f63911bf 614
79fd1696
MD
615 if ((count & ~(NC_EXLOCK_REQ|NC_SHLOCK_REQ)) == 0) {
616 if (atomic_cmpset_int(&ncp->nc_lockstatus,
617 count, count + 1)) {
f63911bf
MD
618 /*
619 * The vp associated with a locked ncp must
620 * be held to prevent it from being recycled.
621 *
622 * WARNING! If VRECLAIMED is set the vnode
623 * could already be in the middle of a recycle.
624 * Callers must use cache_vref() or
625 * cache_vget() on the locked ncp to
626 * validate the vp or set the cache entry
627 * to unresolved.
2247fe02
MD
628 *
629 * NOTE! vhold() is allowed if we hold a
630 * lock on the ncp (which we do).
f63911bf 631 */
2247fe02 632 ncp->nc_locktd = td;
f63911bf 633 if (ncp->nc_vp)
79fd1696 634 vhold(ncp->nc_vp);
f63911bf
MD
635 break;
636 }
2247fe02
MD
637 /* cmpset failed */
638 continue;
639 }
640 if (ncp->nc_locktd == td) {
79fd1696
MD
641 if (atomic_cmpset_int(&ncp->nc_lockstatus,
642 count, count + 1)) {
2247fe02
MD
643 break;
644 }
645 /* cmpset failed */
f63911bf
MD
646 continue;
647 }
648 return(EWOULDBLOCK);
649 }
650 return(0);
651}
652
79fd1696
MD
653/*
654 * The shared lock works similarly to the exclusive lock except
655 * nc_locktd is left NULL and we need an interlock (VHOLD) to
656 * prevent vhold() races, since the moment our cmpset_int succeeds
657 * another cpu can come in and get its own shared lock.
658 *
659 * A critical section is needed to prevent interruption during the
660 * VHOLD interlock.
661 */
662static
663int
664_cache_lock_shared_nonblock(struct namecache *ncp)
665{
666 u_int count;
667
668 for (;;) {
669 count = ncp->nc_lockstatus;
670
671 if ((count & ~NC_SHLOCK_REQ) == 0) {
672 crit_enter();
673 if (atomic_cmpset_int(&ncp->nc_lockstatus,
674 count,
675 (count + 1) | NC_SHLOCK_FLAG |
676 NC_SHLOCK_VHOLD)) {
677 /*
678 * The vp associated with a locked ncp must
679 * be held to prevent it from being recycled.
680 *
681 * WARNING! If VRECLAIMED is set the vnode
682 * could already be in the middle of a recycle.
683 * Callers must use cache_vref() or
684 * cache_vget() on the locked ncp to
685 * validate the vp or set the cache entry
686 * to unresolved.
687 *
688 * NOTE! vhold() is allowed if we hold a
689 * lock on the ncp (which we do).
690 */
691 if (ncp->nc_vp)
692 vhold(ncp->nc_vp);
693 atomic_clear_int(&ncp->nc_lockstatus,
694 NC_SHLOCK_VHOLD);
695 crit_exit();
696 break;
697 }
698 /* cmpset failed */
699 crit_exit();
700 continue;
701 }
702
703 /*
704 * If already held shared we can just bump the count, but
705 * only allow this if nobody is trying to get the lock
706 * exclusively.
707 *
708 * VHOLD is a bit of a hack. Even though we successfully
709 * added another shared ref, the cpu that got the first
710 * shared ref might not yet have held the vnode.
711 */
712 if ((count & (NC_EXLOCK_REQ|NC_SHLOCK_FLAG)) ==
713 NC_SHLOCK_FLAG) {
714 KKASSERT((count & ~(NC_EXLOCK_REQ |
715 NC_SHLOCK_REQ |
716 NC_SHLOCK_FLAG)) > 0);
717 if (atomic_cmpset_int(&ncp->nc_lockstatus,
718 count, count + 1)) {
719 while (ncp->nc_lockstatus & NC_SHLOCK_VHOLD)
720 cpu_pause();
721 break;
722 }
723 continue;
724 }
725 return(EWOULDBLOCK);
726 }
727 return(0);
728}
729
f63911bf
MD
730/*
731 * Helper function
732 *
733 * NOTE: nc_refs can be 0 (degenerate case during _cache_drop).
2247fe02 734 *
79fd1696 735 * nc_locktd must be NULLed out prior to nc_lockstatus getting cleared.
f63911bf
MD
736 */
737static
738void
739_cache_unlock(struct namecache *ncp)
740{
741 thread_t td __debugvar = curthread;
2247fe02 742 u_int count;
79fd1696
MD
743 u_int ncount;
744 struct vnode *dropvp;
f63911bf
MD
745
746 KKASSERT(ncp->nc_refs >= 0);
79fd1696
MD
747 KKASSERT((ncp->nc_lockstatus & ~(NC_EXLOCK_REQ|NC_SHLOCK_REQ)) > 0);
748 KKASSERT((ncp->nc_lockstatus & NC_SHLOCK_FLAG) || ncp->nc_locktd == td);
749
750 count = ncp->nc_lockstatus;
751 cpu_ccfence();
f63911bf 752
79fd1696
MD
753 /*
754 * Clear nc_locktd prior to the atomic op (excl lock only)
755 */
756 if ((count & ~(NC_EXLOCK_REQ|NC_SHLOCK_REQ)) == 1)
2247fe02 757 ncp->nc_locktd = NULL;
79fd1696
MD
758 dropvp = NULL;
759
2247fe02 760 for (;;) {
79fd1696
MD
761 if ((count &
762 ~(NC_EXLOCK_REQ|NC_SHLOCK_REQ|NC_SHLOCK_FLAG)) == 1) {
763 dropvp = ncp->nc_vp;
764 if (count & NC_EXLOCK_REQ)
765 ncount = count & NC_SHLOCK_REQ; /* cnt->0 */
766 else
767 ncount = 0;
768
769 if (atomic_cmpset_int(&ncp->nc_lockstatus,
770 count, ncount)) {
2247fe02 771 if (count & NC_EXLOCK_REQ)
79fd1696
MD
772 wakeup(&ncp->nc_locktd);
773 else if (count & NC_SHLOCK_REQ)
2247fe02
MD
774 wakeup(ncp);
775 break;
776 }
79fd1696 777 dropvp = NULL;
2247fe02 778 } else {
79fd1696
MD
779 KKASSERT((count & NC_SHLOCK_VHOLD) == 0);
780 KKASSERT((count & ~(NC_EXLOCK_REQ |
781 NC_SHLOCK_REQ |
782 NC_SHLOCK_FLAG)) > 1);
783 if (atomic_cmpset_int(&ncp->nc_lockstatus,
784 count, count - 1)) {
2247fe02
MD
785 break;
786 }
f63911bf 787 }
79fd1696
MD
788 count = ncp->nc_lockstatus;
789 cpu_ccfence();
f63911bf 790 }
79fd1696
MD
791
792 /*
793 * Don't actually drop the vp until we successfully clean out
794 * the lock, otherwise we may race another shared lock.
795 */
796 if (dropvp)
797 vdrop(dropvp);
f63911bf
MD
798}
799
79fd1696
MD
800static
801int
802_cache_lockstatus(struct namecache *ncp)
803{
804 if (ncp->nc_locktd == curthread)
805 return(LK_EXCLUSIVE);
806 if (ncp->nc_lockstatus & NC_SHLOCK_FLAG)
807 return(LK_SHARED);
808 return(-1);
809}
984263bc
MD
810
811/*
7ea21ed1
MD
812 * cache_hold() and cache_drop() prevent the premature deletion of a
813 * namecache entry but do not prevent operations (such as zapping) on
814 * that namecache entry.
5b287bba 815 *
36e90efd
MD
816 * This routine may only be called from outside this source module if
817 * nc_refs is already at least 1.
5b287bba 818 *
36e90efd
MD
819 * This is a rare case where callers are allowed to hold a spinlock,
820 * so we can't ourselves.
984263bc 821 */
7ea21ed1
MD
822static __inline
823struct namecache *
bc0c094e 824_cache_hold(struct namecache *ncp)
7ea21ed1 825{
5b287bba 826 atomic_add_int(&ncp->nc_refs, 1);
7ea21ed1
MD
827 return(ncp);
828}
829
8c361dda 830/*
f63911bf
MD
831 * Drop a cache entry, taking care to deal with races.
832 *
833 * For potential 1->0 transitions we must hold the ncp lock to safely
834 * test its flags. An unresolved entry with no children must be zapped
835 * to avoid leaks.
836 *
837 * The call to cache_zap() itself will handle all remaining races and
838 * will decrement the ncp's refs regardless. If we are resolved or
839 * have children nc_refs can safely be dropped to 0 without having to
840 * zap the entry.
841 *
842 * NOTE: cache_zap() will re-check nc_refs and nc_list in a MPSAFE fashion.
843 *
844 * NOTE: cache_zap() may return a non-NULL referenced parent which must
845 * be dropped in a loop.
8c361dda 846 */
7ea21ed1
MD
847static __inline
848void
bc0c094e 849_cache_drop(struct namecache *ncp)
7ea21ed1 850{
f63911bf
MD
851 int refs;
852
853 while (ncp) {
854 KKASSERT(ncp->nc_refs > 0);
855 refs = ncp->nc_refs;
856
857 if (refs == 1) {
858 if (_cache_lock_nonblock(ncp) == 0) {
055f5cc8 859 ncp->nc_flag &= ~NCF_DEFEREDZAP;
f63911bf
MD
860 if ((ncp->nc_flag & NCF_UNRESOLVED) &&
861 TAILQ_EMPTY(&ncp->nc_list)) {
65870584 862 ncp = cache_zap(ncp, 1);
f63911bf
MD
863 continue;
864 }
865 if (atomic_cmpset_int(&ncp->nc_refs, 1, 0)) {
866 _cache_unlock(ncp);
867 break;
868 }
869 _cache_unlock(ncp);
870 }
871 } else {
872 if (atomic_cmpset_int(&ncp->nc_refs, refs, refs - 1))
873 break;
874 }
2247fe02 875 cpu_pause();
f517a1bb 876 }
7ea21ed1 877}
8987aad7 878
690a3127 879/*
2247fe02
MD
880 * Link a new namecache entry to its parent and to the hash table. Be
881 * careful to avoid races if vhold() blocks in the future.
882 *
883 * Both ncp and par must be referenced and locked.
884 *
53b4cfe1
MD
885 * NOTE: The hash table spinlock is held during this call, we can't do
886 * anything fancy.
690a3127
MD
887 */
888static void
2247fe02
MD
889_cache_link_parent(struct namecache *ncp, struct namecache *par,
890 struct nchash_head *nchpp)
690a3127 891{
bf3f67a7
MD
892 struct pcpu_ncache *pn = &pcpu_ncache[mycpu->gd_cpuid];
893
690a3127
MD
894 KKASSERT(ncp->nc_parent == NULL);
895 ncp->nc_parent = par;
2247fe02 896 ncp->nc_head = nchpp;
aabd5ce8
MD
897
898 /*
899 * Set inheritance flags. Note that the parent flags may be
900 * stale due to getattr potentially not having been run yet
901 * (it gets run during nlookup()'s).
902 */
903 ncp->nc_flag &= ~(NCF_SF_PNOCACHE | NCF_UF_PCACHE);
904 if (par->nc_flag & (NCF_SF_NOCACHE | NCF_SF_PNOCACHE))
905 ncp->nc_flag |= NCF_SF_PNOCACHE;
906 if (par->nc_flag & (NCF_UF_CACHE | NCF_UF_PCACHE))
907 ncp->nc_flag |= NCF_UF_PCACHE;
908
bf3f67a7
MD
909 /*
910 * Add to hash table and parent, adjust accounting
911 */
2247fe02 912 LIST_INSERT_HEAD(&nchpp->list, ncp, nc_hash);
bf3f67a7
MD
913 atomic_add_long(&pn->vfscache_count, 1);
914 if (TAILQ_EMPTY(&ncp->nc_list))
915 atomic_add_long(&pn->vfscache_leafs, 1);
2247fe02 916
690a3127
MD
917 if (TAILQ_EMPTY(&par->nc_list)) {
918 TAILQ_INSERT_HEAD(&par->nc_list, ncp, nc_entry);
bf3f67a7 919 atomic_add_long(&pn->vfscache_leafs, -1);
21739618
MD
920 /*
921 * Any vp associated with an ncp which has children must
55361147 922 * be held to prevent it from being recycled.
21739618 923 */
690a3127 924 if (par->nc_vp)
2247fe02 925 vhold(par->nc_vp);
690a3127
MD
926 } else {
927 TAILQ_INSERT_HEAD(&par->nc_list, ncp, nc_entry);
928 }
929}
930
931/*
2247fe02
MD
932 * Remove the parent and hash associations from a namecache structure.
933 * If this is the last child of the parent the cache_drop(par) will
934 * attempt to recursively zap the parent.
935 *
936 * ncp must be locked. This routine will acquire a temporary lock on
937 * the parent as wlel as the appropriate hash chain.
690a3127
MD
938 */
939static void
f63911bf 940_cache_unlink_parent(struct namecache *ncp)
690a3127 941{
bf3f67a7 942 struct pcpu_ncache *pn = &pcpu_ncache[mycpu->gd_cpuid];
690a3127 943 struct namecache *par;
f63911bf 944 struct vnode *dropvp;
690a3127
MD
945
946 if ((par = ncp->nc_parent) != NULL) {
2247fe02 947 KKASSERT(ncp->nc_parent == par);
f63911bf 948 _cache_hold(par);
2247fe02 949 _cache_lock(par);
287a8577 950 spin_lock(&ncp->nc_head->spin);
bf3f67a7
MD
951
952 /*
953 * Remove from hash table and parent, adjust accounting
954 */
2247fe02 955 LIST_REMOVE(ncp, nc_hash);
690a3127 956 TAILQ_REMOVE(&par->nc_list, ncp, nc_entry);
bf3f67a7
MD
957 atomic_add_long(&pn->vfscache_count, -1);
958 if (TAILQ_EMPTY(&ncp->nc_list))
959 atomic_add_long(&pn->vfscache_leafs, -1);
960
f63911bf 961 dropvp = NULL;
bf3f67a7
MD
962 if (TAILQ_EMPTY(&par->nc_list)) {
963 atomic_add_long(&pn->vfscache_leafs, 1);
964 if (par->nc_vp)
965 dropvp = par->nc_vp;
966 }
287a8577 967 spin_unlock(&ncp->nc_head->spin);
2247fe02
MD
968 ncp->nc_parent = NULL;
969 ncp->nc_head = NULL;
970 _cache_unlock(par);
28623bf9 971 _cache_drop(par);
f63911bf
MD
972
973 /*
974 * We can only safely vdrop with no spinlocks held.
975 */
976 if (dropvp)
977 vdrop(dropvp);
690a3127
MD
978 }
979}
980
981/*
fad57d0e
MD
982 * Allocate a new namecache structure. Most of the code does not require
983 * zero-termination of the string but it makes vop_compat_ncreate() easier.
690a3127
MD
984 */
985static struct namecache *
524c845c 986cache_alloc(int nlen)
690a3127
MD
987{
988 struct namecache *ncp;
989
efda3bd0 990 ncp = kmalloc(sizeof(*ncp), M_VFSCACHE, M_WAITOK|M_ZERO);
524c845c 991 if (nlen)
efda3bd0 992 ncp->nc_name = kmalloc(nlen + 1, M_VFSCACHE, M_WAITOK);
524c845c 993 ncp->nc_nlen = nlen;
690a3127
MD
994 ncp->nc_flag = NCF_UNRESOLVED;
995 ncp->nc_error = ENOTCONN; /* needs to be resolved */
8c361dda 996 ncp->nc_refs = 1;
e4bff3c8 997
690a3127 998 TAILQ_INIT(&ncp->nc_list);
28623bf9 999 _cache_lock(ncp);
690a3127
MD
1000 return(ncp);
1001}
1002
f63911bf
MD
1003/*
1004 * Can only be called for the case where the ncp has never been
1005 * associated with anything (so no spinlocks are needed).
1006 */
8c361dda 1007static void
28623bf9 1008_cache_free(struct namecache *ncp)
8c361dda 1009{
79fd1696 1010 KKASSERT(ncp->nc_refs == 1 && ncp->nc_lockstatus == 1);
8c361dda 1011 if (ncp->nc_name)
efda3bd0
MD
1012 kfree(ncp->nc_name, M_VFSCACHE);
1013 kfree(ncp, M_VFSCACHE);
8c361dda 1014}
690a3127 1015
2247fe02 1016/*
79fd1696 1017 * [re]initialize a nchandle.
2247fe02 1018 */
28623bf9
MD
1019void
1020cache_zero(struct nchandle *nch)
1021{
1022 nch->ncp = NULL;
1023 nch->mount = NULL;
1024}
1025
690a3127
MD
1026/*
1027 * Ref and deref a namecache structure.
5b287bba 1028 *
2247fe02
MD
1029 * The caller must specify a stable ncp pointer, typically meaning the
1030 * ncp is already referenced but this can also occur indirectly through
1031 * e.g. holding a lock on a direct child.
1032 *
1033 * WARNING: Caller may hold an unrelated read spinlock, which means we can't
1034 * use read spinlocks here.
690a3127 1035 */
28623bf9
MD
1036struct nchandle *
1037cache_hold(struct nchandle *nch)
bc0c094e 1038{
28623bf9 1039 _cache_hold(nch->ncp);
3536c341 1040 _cache_mntref(nch->mount);
28623bf9 1041 return(nch);
bc0c094e
MD
1042}
1043
61f96b6f
MD
1044/*
1045 * Create a copy of a namecache handle for an already-referenced
1046 * entry.
61f96b6f 1047 */
bc0c094e 1048void
28623bf9 1049cache_copy(struct nchandle *nch, struct nchandle *target)
bc0c094e 1050{
3536c341
MD
1051 struct mntcache *cache = &pcpu_mntcache[mycpu->gd_cpuid];
1052 struct namecache *ncp;
1053
28623bf9 1054 *target = *nch;
3536c341
MD
1055 _cache_mntref(target->mount);
1056 ncp = target->ncp;
1057 if (ncp) {
1058 if (ncp == cache->ncp1) {
1059 if (atomic_cmpset_ptr((void *)&cache->ncp1, ncp, NULL))
1060 return;
1061 }
1062 if (ncp == cache->ncp2) {
1063 if (atomic_cmpset_ptr((void *)&cache->ncp2, ncp, NULL))
1064 return;
1065 }
1066 _cache_hold(ncp);
1067 }
28623bf9
MD
1068}
1069
8edfbc5e
MD
1070/*
1071 * Caller wants to copy the current directory, copy it out from our
1072 * pcpu cache if possible (the entire critical path is just two localized
1073 * cmpset ops). If the pcpu cache has a snapshot at all it will be a
1074 * valid one, so we don't have to lock p->p_fd even though we are loading
1075 * two fields.
1076 *
1077 * This has a limited effect since nlookup must still ref and shlock the
1078 * vnode to check perms. We do avoid the per-proc spin-lock though, which
1079 * can aid threaded programs.
1080 */
1081void
1082cache_copy_ncdir(struct proc *p, struct nchandle *target)
1083{
1084 struct mntcache *cache = &pcpu_mntcache[mycpu->gd_cpuid];
1085
1086 *target = p->p_fd->fd_ncdir;
1087 if (target->ncp == cache->ncdir.ncp &&
1088 target->mount == cache->ncdir.mount) {
1089 if (atomic_cmpset_ptr((void *)&cache->ncdir.ncp,
1090 target->ncp, NULL)) {
1091 if (atomic_cmpset_ptr((void *)&cache->ncdir.mount,
1092 target->mount, NULL)) {
1093 /* CRITICAL PATH */
1094 return;
1095 }
1096 _cache_drop(target->ncp);
1097 }
1098 }
1099 spin_lock_shared(&p->p_fd->fd_spin);
1100 cache_copy(&p->p_fd->fd_ncdir, target);
1101 spin_unlock_shared(&p->p_fd->fd_spin);
1102}
1103
28623bf9
MD
1104void
1105cache_changemount(struct nchandle *nch, struct mount *mp)
1106{
3536c341
MD
1107 _cache_mntref(mp);
1108 _cache_mntrel(nch->mount);
28623bf9 1109 nch->mount = mp;
28623bf9
MD
1110}
1111
1112void
1113cache_drop(struct nchandle *nch)
1114{
3536c341 1115 _cache_mntrel(nch->mount);
28623bf9
MD
1116 _cache_drop(nch->ncp);
1117 nch->ncp = NULL;
1118 nch->mount = NULL;
bc0c094e
MD
1119}
1120
3536c341
MD
1121/*
1122 * Drop the nchandle, but try to cache the ref to avoid global atomic
1123 * ops. This is typically done on the system root and jail root nchandles.
1124 */
1125void
1126cache_drop_and_cache(struct nchandle *nch)
1127{
1128 struct mntcache *cache = &pcpu_mntcache[mycpu->gd_cpuid];
1129 struct namecache *ncp;
1130
1131 _cache_mntrel(nch->mount);
1132 ncp = nch->ncp;
1133 if (cache->ncp1 == NULL) {
1134 ncp = atomic_swap_ptr((void *)&cache->ncp1, ncp);
1135 if (ncp == NULL)
1136 goto done;
1137 }
1138 if (cache->ncp2 == NULL) {
1139 ncp = atomic_swap_ptr((void *)&cache->ncp2, ncp);
1140 if (ncp == NULL)
1141 goto done;
1142 }
1143 if (++cache->iter & 1)
1144 ncp = atomic_swap_ptr((void *)&cache->ncp2, ncp);
1145 else
1146 ncp = atomic_swap_ptr((void *)&cache->ncp1, ncp);
1147 if (ncp)
1148 _cache_drop(ncp);
1149done:
1150 nch->ncp = NULL;
1151 nch->mount = NULL;
1152}
1153
8edfbc5e
MD
1154/*
1155 * We are dropping what the caller believes is the current directory,
1156 * unconditionally store it in our pcpu cache. Anything already in
1157 * the cache will be discarded.
1158 */
1159void
1160cache_drop_ncdir(struct nchandle *nch)
1161{
1162 struct mntcache *cache = &pcpu_mntcache[mycpu->gd_cpuid];
1163
1164 nch->ncp = atomic_swap_ptr((void *)&cache->ncdir.ncp, nch->ncp);
1165 nch->mount = atomic_swap_ptr((void *)&cache->ncdir.mount, nch->mount);
1166 if (nch->ncp)
1167 _cache_drop(nch->ncp);
1168 if (nch->mount)
1169 _cache_mntrel(nch->mount);
1170 nch->ncp = NULL;
1171 nch->mount = NULL;
1172}
1173
79fd1696
MD
1174int
1175cache_lockstatus(struct nchandle *nch)
1176{
1177 return(_cache_lockstatus(nch->ncp));
1178}
1179
28623bf9
MD
1180void
1181cache_lock(struct nchandle *nch)
1182{
1183 _cache_lock(nch->ncp);
1184}
1185
79fd1696
MD
1186void
1187cache_lock_maybe_shared(struct nchandle *nch, int excl)
1188{
1189 struct namecache *ncp = nch->ncp;
1190
1191 if (ncp_shared_lock_disable || excl ||
1192 (ncp->nc_flag & NCF_UNRESOLVED)) {
1193 _cache_lock(ncp);
1194 } else {
1195 _cache_lock_shared(ncp);
1196 if ((ncp->nc_flag & NCF_UNRESOLVED) == 0) {
1197 if (ncp->nc_vp && (ncp->nc_vp->v_flag & VRECLAIMED)) {
1198 _cache_unlock(ncp);
1199 _cache_lock(ncp);
1200 }
1201 } else {
1202 _cache_unlock(ncp);
1203 _cache_lock(ncp);
1204 }
1205 }
1206}
1207
2247fe02
MD
1208/*
1209 * Relock nch1 given an unlocked nch1 and a locked nch2. The caller
1210 * is responsible for checking both for validity on return as they
1211 * may have become invalid.
1212 *
1213 * We have to deal with potential deadlocks here, just ping pong
1214 * the lock until we get it (we will always block somewhere when
1215 * looping so this is not cpu-intensive).
1216 *
1217 * which = 0 nch1 not locked, nch2 is locked
1218 * which = 1 nch1 is locked, nch2 is not locked
1219 */
1220void
1221cache_relock(struct nchandle *nch1, struct ucred *cred1,
1222 struct nchandle *nch2, struct ucred *cred2)
1223{
1224 int which;
1225
1226 which = 0;
1227
1228 for (;;) {
1229 if (which == 0) {
1230 if (cache_lock_nonblock(nch1) == 0) {
1231 cache_resolve(nch1, cred1);
1232 break;
1233 }
1234 cache_unlock(nch2);
1235 cache_lock(nch1);
1236 cache_resolve(nch1, cred1);
1237 which = 1;
1238 } else {
1239 if (cache_lock_nonblock(nch2) == 0) {
1240 cache_resolve(nch2, cred2);
1241 break;
1242 }
1243 cache_unlock(nch1);
1244 cache_lock(nch2);
1245 cache_resolve(nch2, cred2);
1246 which = 0;
1247 }
1248 }
1249}
1250
28623bf9
MD
1251int
1252cache_lock_nonblock(struct nchandle *nch)
1253{
1254 return(_cache_lock_nonblock(nch->ncp));
1255}
1256
28623bf9
MD
1257void
1258cache_unlock(struct nchandle *nch)
1259{
1260 _cache_unlock(nch->ncp);
1261}
1262
14c92d03 1263/*
690a3127 1264 * ref-and-lock, unlock-and-deref functions.
9b1b3591
MD
1265 *
1266 * This function is primarily used by nlookup. Even though cache_lock
1267 * holds the vnode, it is possible that the vnode may have already
f63911bf
MD
1268 * initiated a recyclement.
1269 *
1270 * We want cache_get() to return a definitively usable vnode or a
1271 * definitively unresolved ncp.
14c92d03 1272 */
28623bf9 1273static
21739618 1274struct namecache *
28623bf9 1275_cache_get(struct namecache *ncp)
690a3127
MD
1276{
1277 _cache_hold(ncp);
28623bf9 1278 _cache_lock(ncp);
9b1b3591 1279 if (ncp->nc_vp && (ncp->nc_vp->v_flag & VRECLAIMED))
28623bf9 1280 _cache_setunresolved(ncp);
21739618 1281 return(ncp);
690a3127
MD
1282}
1283
79fd1696
MD
1284/*
1285 * Attempt to obtain a shared lock on the ncp. A shared lock will only
1286 * be obtained if the ncp is resolved and the vnode (if not ENOENT) is
1287 * valid. Otherwise an exclusive lock will be acquired instead.
1288 */
1289static
1290struct namecache *
1291_cache_get_maybe_shared(struct namecache *ncp, int excl)
1292{
1293 if (ncp_shared_lock_disable || excl ||
1294 (ncp->nc_flag & NCF_UNRESOLVED)) {
1295 return(_cache_get(ncp));
1296 }
1297 _cache_hold(ncp);
1298 _cache_lock_shared(ncp);
1299 if ((ncp->nc_flag & NCF_UNRESOLVED) == 0) {
1300 if (ncp->nc_vp && (ncp->nc_vp->v_flag & VRECLAIMED)) {
1301 _cache_unlock(ncp);
1302 ncp = _cache_get(ncp);
1303 _cache_drop(ncp);
1304 }
1305 } else {
1306 _cache_unlock(ncp);
1307 ncp = _cache_get(ncp);
1308 _cache_drop(ncp);
1309 }
1310 return(ncp);
1311}
1312
28623bf9 1313/*
2247fe02 1314 * This is a special form of _cache_lock() which only succeeds if
f63911bf
MD
1315 * it can get a pristine, non-recursive lock. The caller must have
1316 * already ref'd the ncp.
1317 *
1318 * On success the ncp will be locked, on failure it will not. The
1319 * ref count does not change either way.
1320 *
2247fe02 1321 * We want _cache_lock_special() (on success) to return a definitively
f63911bf
MD
1322 * usable vnode or a definitively unresolved ncp.
1323 */
1324static int
2247fe02 1325_cache_lock_special(struct namecache *ncp)
f63911bf
MD
1326{
1327 if (_cache_lock_nonblock(ncp) == 0) {
79fd1696
MD
1328 if ((ncp->nc_lockstatus &
1329 ~(NC_EXLOCK_REQ|NC_SHLOCK_REQ)) == 1) {
f63911bf
MD
1330 if (ncp->nc_vp && (ncp->nc_vp->v_flag & VRECLAIMED))
1331 _cache_setunresolved(ncp);
1332 return(0);
1333 }
1334 _cache_unlock(ncp);
1335 }
1336 return(EWOULDBLOCK);
1337}
1338
737097c0
MD
1339/*
1340 * This function tries to get a shared lock but will back-off to an exclusive
1341 * lock if:
1342 *
1343 * (1) Some other thread is trying to obtain an exclusive lock
1344 * (to prevent the exclusive requester from getting livelocked out
1345 * by many shared locks).
1346 *
1347 * (2) The current thread already owns an exclusive lock (to avoid
1348 * deadlocking).
1349 *
1350 * WARNING! On machines with lots of cores we really want to try hard to
1351 * get a shared lock or concurrent path lookups can chain-react
1352 * into a very high-latency exclusive lock.
1353 */
79fd1696
MD
1354static int
1355_cache_lock_shared_special(struct namecache *ncp)
1356{
f4781244
MD
1357 /*
1358 * Only honor a successful shared lock (returning 0) if there is
1359 * no exclusive request pending and the vnode, if present, is not
1360 * in a reclaimed state.
1361 */
79fd1696 1362 if (_cache_lock_shared_nonblock(ncp) == 0) {
f4781244 1363 if ((ncp->nc_lockstatus & NC_EXLOCK_REQ) == 0) {
79fd1696
MD
1364 if (ncp->nc_vp == NULL ||
1365 (ncp->nc_vp->v_flag & VRECLAIMED) == 0) {
1366 return(0);
1367 }
1368 }
1369 _cache_unlock(ncp);
737097c0 1370 return(EWOULDBLOCK);
79fd1696 1371 }
f4781244
MD
1372
1373 /*
1374 * Non-blocking shared lock failed. If we already own the exclusive
1375 * lock just acquire another exclusive lock (instead of deadlocking).
1376 * Otherwise acquire a shared lock.
1377 */
737097c0
MD
1378 if (ncp->nc_locktd == curthread) {
1379 _cache_lock(ncp);
1380 return(0);
1381 }
1382 _cache_lock_shared(ncp);
1383 return(0);
79fd1696
MD
1384}
1385
f63911bf
MD
1386
1387/*
1388 * NOTE: The same nchandle can be passed for both arguments.
28623bf9
MD
1389 */
1390void
1391cache_get(struct nchandle *nch, struct nchandle *target)
1392{
f63911bf 1393 KKASSERT(nch->ncp->nc_refs > 0);
28623bf9
MD
1394 target->mount = nch->mount;
1395 target->ncp = _cache_get(nch->ncp);
3536c341 1396 _cache_mntref(target->mount);
28623bf9
MD
1397}
1398
79fd1696
MD
1399void
1400cache_get_maybe_shared(struct nchandle *nch, struct nchandle *target, int excl)
1401{
1402 KKASSERT(nch->ncp->nc_refs > 0);
1403 target->mount = nch->mount;
1404 target->ncp = _cache_get_maybe_shared(nch->ncp, excl);
3536c341 1405 _cache_mntref(target->mount);
79fd1696
MD
1406}
1407
2247fe02 1408/*
79fd1696 1409 *
2247fe02 1410 */
28623bf9 1411static __inline
14c92d03 1412void
28623bf9 1413_cache_put(struct namecache *ncp)
14c92d03 1414{
28623bf9 1415 _cache_unlock(ncp);
14c92d03
MD
1416 _cache_drop(ncp);
1417}
1418
2247fe02 1419/*
79fd1696 1420 *
2247fe02 1421 */
28623bf9
MD
1422void
1423cache_put(struct nchandle *nch)
1424{
3536c341 1425 _cache_mntrel(nch->mount);
28623bf9
MD
1426 _cache_put(nch->ncp);
1427 nch->ncp = NULL;
1428 nch->mount = NULL;
1429}
1430
690a3127
MD
1431/*
1432 * Resolve an unresolved ncp by associating a vnode with it. If the
1433 * vnode is NULL, a negative cache entry is created.
1434 *
1435 * The ncp should be locked on entry and will remain locked on return.
1436 */
28623bf9 1437static
690a3127 1438void
4b5bbb78 1439_cache_setvp(struct mount *mp, struct namecache *ncp, struct vnode *vp)
ce6da7e4 1440{
690a3127 1441 KKASSERT(ncp->nc_flag & NCF_UNRESOLVED);
79fd1696 1442 KKASSERT(_cache_lockstatus(ncp) == LK_EXCLUSIVE);
2247fe02 1443
ce6da7e4 1444 if (vp != NULL) {
21739618
MD
1445 /*
1446 * Any vp associated with an ncp which has children must
55361147 1447 * be held. Any vp associated with a locked ncp must be held.
21739618
MD
1448 */
1449 if (!TAILQ_EMPTY(&ncp->nc_list))
1450 vhold(vp);
b12defdc 1451 spin_lock(&vp->v_spin);
f63911bf 1452 ncp->nc_vp = vp;
ce6da7e4 1453 TAILQ_INSERT_HEAD(&vp->v_namecache, ncp, nc_vnode);
b12defdc 1454 spin_unlock(&vp->v_spin);
79fd1696 1455 if (ncp->nc_lockstatus & ~(NC_EXLOCK_REQ|NC_SHLOCK_REQ))
55361147 1456 vhold(vp);
21739618
MD
1457
1458 /*
3c37c940 1459 * Set auxiliary flags
21739618 1460 */
690a3127
MD
1461 switch(vp->v_type) {
1462 case VDIR:
21739618
MD
1463 ncp->nc_flag |= NCF_ISDIR;
1464 break;
690a3127 1465 case VLNK:
21739618
MD
1466 ncp->nc_flag |= NCF_ISSYMLINK;
1467 /* XXX cache the contents of the symlink */
1468 break;
690a3127 1469 default:
21739618 1470 break;
690a3127 1471 }
21739618 1472 ncp->nc_error = 0;
f423d507
FT
1473 /* XXX: this is a hack to work-around the lack of a real pfs vfs
1474 * implementation*/
1475 if (mp != NULL)
35d98733
FT
1476 if (strncmp(mp->mnt_stat.f_fstypename, "null", 5) == 0)
1477 vp->v_pfsmp = mp;
ce6da7e4 1478 } else {
4b5bbb78
MD
1479 /*
1480 * When creating a negative cache hit we set the
1481 * namecache_gen. A later resolve will clean out the
1482 * negative cache hit if the mount point's namecache_gen
1483 * has changed. Used by devfs, could also be used by
1484 * other remote FSs.
1485 */
bf3f67a7
MD
1486 struct pcpu_ncache *pn = &pcpu_ncache[mycpu->gd_cpuid];
1487
f63911bf 1488 ncp->nc_vp = NULL;
bf3f67a7
MD
1489 ncp->nc_negcpu = mycpu->gd_cpuid;
1490 spin_lock(&pn->neg_spin);
1491 TAILQ_INSERT_TAIL(&pn->neg_list, ncp, nc_vnode);
1492 ++pn->neg_count;
1493 spin_unlock(&pn->neg_spin);
1494 atomic_add_long(&pn->vfscache_negs, 1);
1495
21739618 1496 ncp->nc_error = ENOENT;
4b5bbb78 1497 if (mp)
75779c3c 1498 VFS_NCPGEN_SET(mp, ncp);
ce6da7e4 1499 }
65870584 1500 ncp->nc_flag &= ~(NCF_UNRESOLVED | NCF_DEFEREDZAP);
ce6da7e4
MD
1501}
1502
2247fe02 1503/*
79fd1696 1504 *
2247fe02 1505 */
fad57d0e 1506void
28623bf9 1507cache_setvp(struct nchandle *nch, struct vnode *vp)
fad57d0e 1508{
4b5bbb78 1509 _cache_setvp(nch->mount, nch->ncp, vp);
28623bf9
MD
1510}
1511
2247fe02 1512/*
79fd1696 1513 *
2247fe02 1514 */
28623bf9
MD
1515void
1516cache_settimeout(struct nchandle *nch, int nticks)
1517{
1518 struct namecache *ncp = nch->ncp;
1519
fad57d0e
MD
1520 if ((ncp->nc_timeout = ticks + nticks) == 0)
1521 ncp->nc_timeout = 1;
1522}
1523
690a3127
MD
1524/*
1525 * Disassociate the vnode or negative-cache association and mark a
1526 * namecache entry as unresolved again. Note that the ncp is still
1527 * left in the hash table and still linked to its parent.
1528 *
67773eb3
MD
1529 * The ncp should be locked and refd on entry and will remain locked and refd
1530 * on return.
8c361dda
MD
1531 *
1532 * This routine is normally never called on a directory containing children.
1533 * However, NFS often does just that in its rename() code as a cop-out to
1534 * avoid complex namespace operations. This disconnects a directory vnode
1535 * from its namecache and can cause the OLDAPI and NEWAPI to get out of
1536 * sync.
2247fe02 1537 *
690a3127 1538 */
28623bf9 1539static
690a3127 1540void
28623bf9 1541_cache_setunresolved(struct namecache *ncp)
14c92d03 1542{
690a3127 1543 struct vnode *vp;
14c92d03 1544
690a3127
MD
1545 if ((ncp->nc_flag & NCF_UNRESOLVED) == 0) {
1546 ncp->nc_flag |= NCF_UNRESOLVED;
fad57d0e 1547 ncp->nc_timeout = 0;
690a3127 1548 ncp->nc_error = ENOTCONN;
690a3127 1549 if ((vp = ncp->nc_vp) != NULL) {
b12defdc 1550 spin_lock(&vp->v_spin);
fad57d0e 1551 ncp->nc_vp = NULL;
690a3127 1552 TAILQ_REMOVE(&vp->v_namecache, ncp, nc_vnode);
b12defdc 1553 spin_unlock(&vp->v_spin);
55361147
MD
1554
1555 /*
1556 * Any vp associated with an ncp with children is
1557 * held by that ncp. Any vp associated with a locked
1558 * ncp is held by that ncp. These conditions must be
1559 * undone when the vp is cleared out from the ncp.
1560 */
690a3127
MD
1561 if (!TAILQ_EMPTY(&ncp->nc_list))
1562 vdrop(vp);
79fd1696 1563 if (ncp->nc_lockstatus & ~(NC_EXLOCK_REQ|NC_SHLOCK_REQ))
55361147 1564 vdrop(vp);
690a3127 1565 } else {
bf3f67a7
MD
1566 struct pcpu_ncache *pn;
1567
1568 pn = &pcpu_ncache[ncp->nc_negcpu];
1569
1570 atomic_add_long(&pn->vfscache_negs, -1);
1571 spin_lock(&pn->neg_spin);
1572 TAILQ_REMOVE(&pn->neg_list, ncp, nc_vnode);
1573 --pn->neg_count;
1574 spin_unlock(&pn->neg_spin);
690a3127 1575 }
d98152a8 1576 ncp->nc_flag &= ~(NCF_WHITEOUT|NCF_ISDIR|NCF_ISSYMLINK);
8e005a45
MD
1577 }
1578}
8c361dda 1579
4b5bbb78
MD
1580/*
1581 * The cache_nresolve() code calls this function to automatically
1582 * set a resolved cache element to unresolved if it has timed out
1583 * or if it is a negative cache hit and the mount point namecache_gen
1584 * has changed.
1585 */
79fd1696
MD
1586static __inline int
1587_cache_auto_unresolve_test(struct mount *mp, struct namecache *ncp)
4b5bbb78 1588{
4b5bbb78
MD
1589 /*
1590 * Try to zap entries that have timed out. We have
1591 * to be careful here because locked leafs may depend
1592 * on the vnode remaining intact in a parent, so only
1593 * do this under very specific conditions.
1594 */
1595 if (ncp->nc_timeout && (int)(ncp->nc_timeout - ticks) < 0 &&
1596 TAILQ_EMPTY(&ncp->nc_list)) {
79fd1696 1597 return 1;
4b5bbb78
MD
1598 }
1599
1600 /*
1601 * If a resolved negative cache hit is invalid due to
1602 * the mount's namecache generation being bumped, zap it.
1603 */
75779c3c 1604 if (ncp->nc_vp == NULL && VFS_NCPGEN_TEST(mp, ncp)) {
79fd1696
MD
1605 return 1;
1606 }
1607
1608 /*
1609 * Otherwise we are good
1610 */
1611 return 0;
1612}
1613
1614static __inline void
1615_cache_auto_unresolve(struct mount *mp, struct namecache *ncp)
1616{
1617 /*
1618 * Already in an unresolved state, nothing to do.
1619 */
1620 if ((ncp->nc_flag & NCF_UNRESOLVED) == 0) {
1621 if (_cache_auto_unresolve_test(mp, ncp))
1622 _cache_setunresolved(ncp);
4b5bbb78
MD
1623 }
1624}
1625
2247fe02 1626/*
79fd1696 1627 *
2247fe02 1628 */
1d505369 1629void
28623bf9 1630cache_setunresolved(struct nchandle *nch)
1d505369 1631{
28623bf9 1632 _cache_setunresolved(nch->ncp);
1d505369
MD
1633}
1634
1635/*
28623bf9
MD
1636 * Determine if we can clear NCF_ISMOUNTPT by scanning the mountlist
1637 * looking for matches. This flag tells the lookup code when it must
1638 * check for a mount linkage and also prevents the directories in question
1639 * from being deleted or renamed.
1d505369 1640 */
28623bf9
MD
1641static
1642int
1643cache_clrmountpt_callback(struct mount *mp, void *data)
1644{
1645 struct nchandle *nch = data;
1646
1647 if (mp->mnt_ncmounton.ncp == nch->ncp)
1648 return(1);
1649 if (mp->mnt_ncmountpt.ncp == nch->ncp)
1650 return(1);
1651 return(0);
1652}
1653
2247fe02 1654/*
fb578eac
MD
1655 * Clear NCF_ISMOUNTPT on nch->ncp if it is no longer associated
1656 * with a mount point.
2247fe02 1657 */
1d505369 1658void
28623bf9 1659cache_clrmountpt(struct nchandle *nch)
1d505369 1660{
28623bf9
MD
1661 int count;
1662
1663 count = mountlist_scan(cache_clrmountpt_callback, nch,
1664 MNTSCAN_FORWARD|MNTSCAN_NOBUSY);
1665 if (count == 0)
1666 nch->ncp->nc_flag &= ~NCF_ISMOUNTPT;
1d505369
MD
1667}
1668
8e005a45 1669/*
e09206ba
MD
1670 * Invalidate portions of the namecache topology given a starting entry.
1671 * The passed ncp is set to an unresolved state and:
8e005a45 1672 *
2247fe02
MD
1673 * The passed ncp must be referencxed and locked. The routine may unlock
1674 * and relock ncp several times, and will recheck the children and loop
1675 * to catch races. When done the passed ncp will be returned with the
1676 * reference and lock intact.
e09206ba
MD
1677 *
1678 * CINV_DESTROY - Set a flag in the passed ncp entry indicating
1679 * that the physical underlying nodes have been
1680 * destroyed... as in deleted. For example, when
1681 * a directory is removed. This will cause record
1682 * lookups on the name to no longer be able to find
1683 * the record and tells the resolver to return failure
1684 * rather then trying to resolve through the parent.
1685 *
1686 * The topology itself, including ncp->nc_name,
1687 * remains intact.
1688 *
1689 * This only applies to the passed ncp, if CINV_CHILDREN
1690 * is specified the children are not flagged.
1691 *
1692 * CINV_CHILDREN - Set all children (recursively) to an unresolved
1693 * state as well.
1694 *
1695 * Note that this will also have the side effect of
1696 * cleaning out any unreferenced nodes in the topology
1697 * from the leaves up as the recursion backs out.
1698 *
2247fe02
MD
1699 * Note that the topology for any referenced nodes remains intact, but
1700 * the nodes will be marked as having been destroyed and will be set
1701 * to an unresolved state.
25cb3304
MD
1702 *
1703 * It is possible for cache_inval() to race a cache_resolve(), meaning that
1704 * the namecache entry may not actually be invalidated on return if it was
1705 * revalidated while recursing down into its children. This code guarentees
1706 * that the node(s) will go through an invalidation cycle, but does not
1707 * guarentee that they will remain in an invalidated state.
1708 *
1709 * Returns non-zero if a revalidation was detected during the invalidation
1710 * recursion, zero otherwise. Note that since only the original ncp is
1711 * locked the revalidation ultimately can only indicate that the original ncp
1712 * *MIGHT* no have been reresolved.
bf40a153
MD
1713 *
1714 * DEEP RECURSION HANDLING - If a recursive invalidation recurses deeply we
1715 * have to avoid blowing out the kernel stack. We do this by saving the
1716 * deep namecache node and aborting the recursion, then re-recursing at that
1717 * node using a depth-first algorithm in order to allow multiple deep
1718 * recursions to chain through each other, then we restart the invalidation
1719 * from scratch.
8e005a45 1720 */
bf40a153
MD
1721
1722struct cinvtrack {
1723 struct namecache *resume_ncp;
1724 int depth;
1725};
1726
28623bf9 1727static int _cache_inval_internal(struct namecache *, int, struct cinvtrack *);
bf40a153 1728
28623bf9 1729static
25cb3304 1730int
28623bf9 1731_cache_inval(struct namecache *ncp, int flags)
bf40a153
MD
1732{
1733 struct cinvtrack track;
1734 struct namecache *ncp2;
1735 int r;
1736
1737 track.depth = 0;
1738 track.resume_ncp = NULL;
1739
1740 for (;;) {
28623bf9 1741 r = _cache_inval_internal(ncp, flags, &track);
bf40a153
MD
1742 if (track.resume_ncp == NULL)
1743 break;
28623bf9 1744 _cache_unlock(ncp);
bf40a153
MD
1745 while ((ncp2 = track.resume_ncp) != NULL) {
1746 track.resume_ncp = NULL;
28623bf9
MD
1747 _cache_lock(ncp2);
1748 _cache_inval_internal(ncp2, flags & ~CINV_DESTROY,
bf40a153 1749 &track);
28623bf9 1750 _cache_put(ncp2);
bf40a153 1751 }
28623bf9 1752 _cache_lock(ncp);
bf40a153
MD
1753 }
1754 return(r);
1755}
1756
28623bf9
MD
1757int
1758cache_inval(struct nchandle *nch, int flags)
1759{
1760 return(_cache_inval(nch->ncp, flags));
1761}
1762
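/*
 * Example usage sketch (hypothetical helper, not part of this file):
 * invalidating the namecache subtree under a directory that has just
 * been removed, using the flags documented above.  nch is assumed to
 * be referenced and locked by the caller.
 */
static void
example_inval_removed_dir(struct nchandle *nch)
{
	/*
	 * CINV_DESTROY marks the passed ncp as destroyed so lookups on
	 * the name fail; CINV_CHILDREN recursively unresolves the
	 * entries underneath it.
	 */
	cache_inval(nch, CINV_DESTROY | CINV_CHILDREN);
}
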
2247fe02
MD
1763/*
1764 * Helper for _cache_inval(). The passed ncp is refd and locked and
1765 * remains that way on return, but may be unlocked/relocked multiple
1766 * times by the routine.
1767 */
bf40a153 1768static int
28623bf9 1769_cache_inval_internal(struct namecache *ncp, int flags, struct cinvtrack *track)
8e005a45 1770{
b8997912 1771 struct namecache *nextkid;
25cb3304 1772 int rcnt = 0;
8e005a45 1773
79fd1696 1774 KKASSERT(_cache_lockstatus(ncp) == LK_EXCLUSIVE);
25cb3304 1775
28623bf9 1776 _cache_setunresolved(ncp);
75e479cf 1777 if (flags & CINV_DESTROY) {
e09206ba 1778 ncp->nc_flag |= NCF_DESTROYED;
75e479cf
MD
1779 ++ncp->nc_generation;
1780 }
f0181d63
MD
1781 while ((flags & CINV_CHILDREN) &&
1782 (nextkid = TAILQ_FIRST(&ncp->nc_list)) != NULL
e09206ba 1783 ) {
f0181d63
MD
1784 struct namecache *kid;
1785 int restart;
1786
1787 restart = 0;
1788 _cache_hold(nextkid);
bf40a153
MD
1789 if (++track->depth > MAX_RECURSION_DEPTH) {
1790 track->resume_ncp = ncp;
28623bf9 1791 _cache_hold(ncp);
bf40a153
MD
1792 ++rcnt;
1793 }
f0181d63
MD
1794 while ((kid = nextkid) != NULL) {
1795 /*
1796 * Parent (ncp) must be locked for the iteration.
1797 */
1798 nextkid = NULL;
1799 if (kid->nc_parent != ncp) {
28623bf9 1800 _cache_drop(kid);
8f73b9ee
MD
1801 kprintf("cache_inval_internal restartA %s\n",
1802 ncp->nc_name);
f0181d63 1803 restart = 1;
bf40a153
MD
1804 break;
1805 }
b8997912 1806 if ((nextkid = TAILQ_NEXT(kid, nc_entry)) != NULL)
28623bf9 1807 _cache_hold(nextkid);
f0181d63
MD
1808
1809 /*
1810 * Parent unlocked for this section to avoid
1811 * deadlocks.
1812 */
1813 _cache_unlock(ncp);
1814 if (track->resume_ncp) {
1815 _cache_drop(kid);
1816 _cache_lock(ncp);
1817 break;
1818 }
e09206ba
MD
1819 if ((kid->nc_flag & NCF_UNRESOLVED) == 0 ||
1820 TAILQ_FIRST(&kid->nc_list)
b8997912 1821 ) {
28623bf9 1822 _cache_lock(kid);
f0181d63 1823 if (kid->nc_parent != ncp) {
8f73b9ee
MD
1824 kprintf("cache_inval_internal "
1825 "restartB %s\n",
1826 ncp->nc_name);
f0181d63
MD
1827 restart = 1;
1828 _cache_unlock(kid);
1829 _cache_drop(kid);
1830 _cache_lock(ncp);
1831 break;
1832 }
1833
28623bf9
MD
1834 rcnt += _cache_inval_internal(kid, flags & ~CINV_DESTROY, track);
1835 _cache_unlock(kid);
b8997912 1836 }
28623bf9 1837 _cache_drop(kid);
f0181d63 1838 _cache_lock(ncp);
8e005a45 1839 }
f0181d63
MD
1840 if (nextkid)
1841 _cache_drop(nextkid);
bf40a153 1842 --track->depth;
f0181d63
MD
1843 if (restart == 0)
1844 break;
8e005a45 1845 }
25cb3304
MD
1846
1847 /*
1848 * Someone could have gotten in there while ncp was unlocked,
1849 * retry if so.
1850 */
1851 if ((ncp->nc_flag & NCF_UNRESOLVED) == 0)
1852 ++rcnt;
1853 return (rcnt);
8e005a45
MD
1854}
1855
e09206ba 1856/*
25cb3304
MD
1857 * Invalidate a vnode's namecache associations. To avoid races against
1858 * the resolver we do not invalidate a node which we previously invalidated
1859 * but which was then re-resolved while we were in the invalidation loop.
1860 *
1861 * Returns non-zero if any namecache entries remain after the invalidation
1862 * loop completed.
2aefb2c5 1863 *
f63911bf
MD
1864 * NOTE: Unlike the namecache topology which guarantees that ncp's will not
1865 * be ripped out of the topology while held, the vnode's v_namecache
1866 * list has no such restriction. NCP's can be ripped out of the list
1867 * at virtually any time if not locked, even if held.
1868 *
1869 * In addition, the v_namecache list itself must be locked via
1870 * the vnode's spinlock.
e09206ba 1871 */
25cb3304 1872int
6b008938 1873cache_inval_vp(struct vnode *vp, int flags)
8e005a45
MD
1874{
1875 struct namecache *ncp;
25cb3304
MD
1876 struct namecache *next;
1877
2aefb2c5 1878restart:
b12defdc 1879 spin_lock(&vp->v_spin);
25cb3304
MD
1880 ncp = TAILQ_FIRST(&vp->v_namecache);
1881 if (ncp)
28623bf9 1882 _cache_hold(ncp);
25cb3304 1883 while (ncp) {
f63911bf 1884 /* loop entered with ncp held and vp spin-locked */
2aefb2c5 1885 if ((next = TAILQ_NEXT(ncp, nc_vnode)) != NULL)
28623bf9 1886 _cache_hold(next);
b12defdc 1887 spin_unlock(&vp->v_spin);
28623bf9 1888 _cache_lock(ncp);
2aefb2c5 1889 if (ncp->nc_vp != vp) {
6ea70f76 1890 kprintf("Warning: cache_inval_vp: race-A detected on "
2aefb2c5 1891 "%s\n", ncp->nc_name);
28623bf9 1892 _cache_put(ncp);
69313361 1893 if (next)
28623bf9 1894 _cache_drop(next);
2aefb2c5
MD
1895 goto restart;
1896 }
28623bf9
MD
1897 _cache_inval(ncp, flags);
1898 _cache_put(ncp); /* also releases reference */
25cb3304 1899 ncp = next;
b12defdc 1900 spin_lock(&vp->v_spin);
2aefb2c5 1901 if (ncp && ncp->nc_vp != vp) {
b12defdc 1902 spin_unlock(&vp->v_spin);
6ea70f76 1903 kprintf("Warning: cache_inval_vp: race-B detected on "
2aefb2c5 1904 "%s\n", ncp->nc_name);
28623bf9 1905 _cache_drop(ncp);
2aefb2c5
MD
1906 goto restart;
1907 }
690a3127 1908 }
b12defdc 1909 spin_unlock(&vp->v_spin);
25cb3304 1910 return(TAILQ_FIRST(&vp->v_namecache) != NULL);
5c6c3cac
MD
1911}
1912
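/*
 * Example usage sketch (hypothetical helper, not part of this file):
 * detaching all namecache entries from a vnode, e.g. while tearing it
 * down.  The use of CINV_DESTROY here is an assumption; a forgiving
 * caller could also pass 0.
 */
static int
example_detach_names(struct vnode *vp)
{
	/*
	 * Returns non-zero if entries remain (for instance because they
	 * were re-resolved while the loop ran); callers may retry or
	 * tolerate that.
	 */
	return (cache_inval_vp(vp, CINV_DESTROY));
}
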
1913/*
1914 * This routine is used instead of the normal cache_inval_vp() when we
1915 * are trying to recycle otherwise good vnodes.
1916 *
1917 * Return 0 on success, non-zero if not all namecache records could be
1918 * disassociated from the vnode (for various reasons).
1919 */
1920int
1921cache_inval_vp_nonblock(struct vnode *vp)
1922{
1923 struct namecache *ncp;
1924 struct namecache *next;
1925
b12defdc 1926 spin_lock(&vp->v_spin);
5c6c3cac
MD
1927 ncp = TAILQ_FIRST(&vp->v_namecache);
1928 if (ncp)
1929 _cache_hold(ncp);
1930 while (ncp) {
1931 /* loop entered with ncp held */
1932 if ((next = TAILQ_NEXT(ncp, nc_vnode)) != NULL)
1933 _cache_hold(next);
b12defdc 1934 spin_unlock(&vp->v_spin);
5c6c3cac
MD
1935 if (_cache_lock_nonblock(ncp)) {
1936 _cache_drop(ncp);
1937 if (next)
1938 _cache_drop(next);
2247fe02 1939 goto done;
5c6c3cac
MD
1940 }
1941 if (ncp->nc_vp != vp) {
1942 kprintf("Warning: cache_inval_vp: race-A detected on "
1943 "%s\n", ncp->nc_name);
1944 _cache_put(ncp);
1945 if (next)
1946 _cache_drop(next);
2247fe02 1947 goto done;
5c6c3cac
MD
1948 }
1949 _cache_inval(ncp, 0);
1950 _cache_put(ncp); /* also releases reference */
1951 ncp = next;
b12defdc 1952 spin_lock(&vp->v_spin);
5c6c3cac 1953 if (ncp && ncp->nc_vp != vp) {
b12defdc 1954 spin_unlock(&vp->v_spin);
5c6c3cac
MD
1955 kprintf("Warning: cache_inval_vp: race-B detected on "
1956 "%s\n", ncp->nc_name);
1957 _cache_drop(ncp);
2247fe02 1958 goto done;
5c6c3cac
MD
1959 }
1960 }
b12defdc 1961 spin_unlock(&vp->v_spin);
2247fe02 1962done:
5c6c3cac 1963 return(TAILQ_FIRST(&vp->v_namecache) != NULL);
14c92d03 1964}
14c92d03 1965
8edfbc5e
MD
1966/*
1967 * Clears the universal directory search 'ok' flag. This flag allows
1968 * nlookup() to bypass normal vnode checks. This flag is a cached flag
1969 * so clearing it simply forces revalidation.
1970 */
1971void
1972cache_inval_wxok(struct vnode *vp)
1973{
1974 struct namecache *ncp;
1975
1976 spin_lock(&vp->v_spin);
1977 TAILQ_FOREACH(ncp, &vp->v_namecache, nc_vnode) {
1978 if (ncp->nc_flag & NCF_WXOK)
1979 atomic_clear_short(&ncp->nc_flag, NCF_WXOK);
1980 }
1981 spin_unlock(&vp->v_spin);
1982}
1983
fad57d0e
MD
1984/*
1985 * The source ncp has been renamed to the target ncp. Both fncp and tncp
227cf16d
MD
1986 * must be locked. The target ncp is destroyed (as a normal rename-over
1987 * would destroy the target file or directory).
fad57d0e 1988 *
227cf16d
MD
1989 * Because there may be references to the source ncp we cannot copy its
1990 * contents to the target. Instead the source ncp is relinked as the target
1991 * and the target ncp is removed from the namecache topology.
fad57d0e
MD
1992 */
1993void
28623bf9 1994cache_rename(struct nchandle *fnch, struct nchandle *tnch)
fad57d0e 1995{
28623bf9
MD
1996 struct namecache *fncp = fnch->ncp;
1997 struct namecache *tncp = tnch->ncp;
2247fe02
MD
1998 struct namecache *tncp_par;
1999 struct nchash_head *nchpp;
2000 u_int32_t hash;
227cf16d 2001 char *oname;
8d09ad3d
MD
2002 char *nname;
2003
75e479cf
MD
2004 ++fncp->nc_generation;
2005 ++tncp->nc_generation;
8d09ad3d
MD
2006 if (tncp->nc_nlen) {
2007 nname = kmalloc(tncp->nc_nlen + 1, M_VFSCACHE, M_WAITOK);
2008 bcopy(tncp->nc_name, nname, tncp->nc_nlen);
2009 nname[tncp->nc_nlen] = 0;
2010 } else {
2011 nname = NULL;
2012 }
fad57d0e 2013
2247fe02
MD
2014 /*
2015 * Rename fncp (unlink)
2016 */
f63911bf 2017 _cache_unlink_parent(fncp);
227cf16d 2018 oname = fncp->nc_name;
8d09ad3d 2019 fncp->nc_name = nname;
227cf16d 2020 fncp->nc_nlen = tncp->nc_nlen;
8d09ad3d
MD
2021 if (oname)
2022 kfree(oname, M_VFSCACHE);
2023
2247fe02
MD
2024 tncp_par = tncp->nc_parent;
2025 _cache_hold(tncp_par);
2026 _cache_lock(tncp_par);
2027
2028 /*
2029 * Rename fncp (relink)
2030 */
2031 hash = fnv_32_buf(fncp->nc_name, fncp->nc_nlen, FNV1_32_INIT);
2032 hash = fnv_32_buf(&tncp_par, sizeof(tncp_par), hash);
2033 nchpp = NCHHASH(hash);
2034
287a8577 2035 spin_lock(&nchpp->spin);
2247fe02 2036 _cache_link_parent(fncp, tncp_par, nchpp);
287a8577 2037 spin_unlock(&nchpp->spin);
2247fe02
MD
2038
2039 _cache_put(tncp_par);
2040
2041 /*
2042 * Get rid of the overwritten tncp (unlink)
2043 */
8d09ad3d
MD
2044 _cache_unlink(tncp);
2045}
f63911bf 2046
8d09ad3d 2047/*
245d8a0d
MD
2048 * Perform actions consistent with unlinking a file. The passed-in ncp
2049 * must be locked.
2050 *
2051 * The ncp is marked DESTROYED so it no longer shows up in searches,
8d09ad3d 2052 * and will be physically deleted when the vnode goes away.
245d8a0d
MD
2053 *
2054 * If the related vnode has no refs then we cycle it through vget()/vput()
2055 * to (possibly if we don't have a ref race) trigger a deactivation,
2056 * allowing the VFS to trivially detect and recycle the deleted vnode
2057 * via VOP_INACTIVE().
2058 *
2059 * NOTE: _cache_rename() will automatically call _cache_unlink() on the
2060 * target ncp.
8d09ad3d
MD
2061 */
2062void
2063cache_unlink(struct nchandle *nch)
2064{
2065 _cache_unlink(nch->ncp);
2066}
2067
2068static void
2069_cache_unlink(struct namecache *ncp)
2070{
245d8a0d
MD
2071 struct vnode *vp;
2072
2073 /*
2074 * Causes lookups to fail and allows another ncp with the same
2075 * name to be created under ncp->nc_parent.
2076 */
8d09ad3d 2077 ncp->nc_flag |= NCF_DESTROYED;
75e479cf 2078 ++ncp->nc_generation;
245d8a0d
MD
2079
2080 /*
1ddc3e8f 2081 * Attempt to trigger a deactivation. Set VREF_FINALIZE to
ee173d09 2082 * force action on the 1->0 transition.
245d8a0d
MD
2083 */
2084 if ((ncp->nc_flag & NCF_UNRESOLVED) == 0 &&
ee173d09
SW
2085 (vp = ncp->nc_vp) != NULL) {
2086 atomic_set_int(&vp->v_refcnt, VREF_FINALIZE);
2087 if (VREFCNT(vp) <= 0) {
2088 if (vget(vp, LK_SHARED) == 0)
2089 vput(vp);
2090 }
245d8a0d 2091 }
fad57d0e
MD
2092}
2093
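/*
 * Example usage sketch (hypothetical helpers, not part of this file):
 * how a filesystem's remove and rename paths might drive the two
 * entry points above.  All handles are assumed to be referenced and
 * locked by the caller.
 */
static void
example_remove_name(struct nchandle *nch)
{
	/* mark the name destroyed and nudge the vnode toward recycling */
	cache_unlink(nch);
}

static void
example_rename_over(struct nchandle *fnch, struct nchandle *tnch)
{
	/*
	 * fnch is relinked under the target name; the overwritten
	 * target is unlinked automatically, no separate cache_unlink()
	 * call is needed.
	 */
	cache_rename(fnch, tnch);
}
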
1ddc3e8f
MD
2094/*
2095 * Return non-zero if the nch might be associated with an open and/or mmap()'d
2096 * file. The easy solution is to just return non-zero if the vnode has refs.
2097 * Used to interlock hammer2 reclaims (VREF_FINALIZE should already be set to
2098 * force the reclaim).
2099 */
2100int
2101cache_isopen(struct nchandle *nch)
2102{
2103 struct vnode *vp;
2104 struct namecache *ncp = nch->ncp;
2105
2106 if ((ncp->nc_flag & NCF_UNRESOLVED) == 0 &&
2107 (vp = ncp->nc_vp) != NULL &&
2108 VREFCNT(vp)) {
2109 return 1;
2110 }
2111 return 0;
2112}
2113
2114
21739618
MD
2115/*
2116 * vget the vnode associated with the namecache entry. Resolve the namecache
12cdc371
MD
2117 * entry if necessary. The passed ncp must be referenced and locked. If
2118 * the ncp is resolved it might be locked shared.
21739618
MD
2119 *
2120 * lk_type may be LK_SHARED, LK_EXCLUSIVE. A ref'd, possibly locked
2121 * vnode (depending on the passed lk_type) will be returned in *vpp with an error
2122 * of 0, or NULL will be returned in *vpp with a non-0 error code. The
2123 * most typical error is ENOENT, meaning that the ncp represents a negative
2124 * cache hit and there is no vnode to retrieve, but other errors can occur
2125 * too.
2126 *
2247fe02
MD
2127 * The vget() can race a reclaim. If this occurs we re-resolve the
2128 * namecache entry.
2129 *
2130 * There are numerous places in the kernel where vget() is called on a
2131 * vnode while one or more of its namecache entries is locked. Releasing
2132 * a vnode never deadlocks against locked namecache entries (the vnode
2133 * will not get recycled while referenced ncp's exist). This means we
2134 * can safely acquire the vnode. In fact, we MUST NOT release the ncp
2135 * lock when acquiring the vp lock or we might cause a deadlock.
2136 *
79fd1696
MD
2137 * NOTE: The passed-in ncp must be locked exclusively if it is initially
2138 * unresolved. If a reclaim race occurs the passed-in ncp will be
2139 * relocked exclusively before being re-resolved.
21739618
MD
2140 */
2141int
28623bf9 2142cache_vget(struct nchandle *nch, struct ucred *cred,
21739618
MD
2143 int lk_type, struct vnode **vpp)
2144{
28623bf9 2145 struct namecache *ncp;
21739618
MD
2146 struct vnode *vp;
2147 int error;
2148
28623bf9 2149 ncp = nch->ncp;
21739618
MD
2150again:
2151 vp = NULL;
2247fe02 2152 if (ncp->nc_flag & NCF_UNRESOLVED)
28623bf9 2153 error = cache_resolve(nch, cred);
2247fe02 2154 else
21739618 2155 error = 0;
2247fe02 2156
21739618 2157 if (error == 0 && (vp = ncp->nc_vp) != NULL) {
87de5057 2158 error = vget(vp, lk_type);
21739618 2159 if (error) {
2247fe02
MD
2160 /*
2161 * VRECLAIM race
12cdc371
MD
2162 *
2163 * The ncp may have been locked shared, we must relock
2164 * it exclusively before we can set it to unresolved.
2247fe02
MD
2165 */
2166 if (error == ENOENT) {
2167 kprintf("Warning: vnode reclaim race detected "
2168 "in cache_vget on %p (%s)\n",
2169 vp, ncp->nc_name);
79fd1696
MD
2170 _cache_unlock(ncp);
2171 _cache_lock(ncp);
2247fe02 2172 _cache_setunresolved(ncp);
21739618 2173 goto again;
2247fe02
MD
2174 }
2175
2176 /*
2177 * Not a reclaim race, some other error.
2178 */
2179 KKASSERT(ncp->nc_vp == vp);
21739618 2180 vp = NULL;
2247fe02
MD
2181 } else {
2182 KKASSERT(ncp->nc_vp == vp);
2183 KKASSERT((vp->v_flag & VRECLAIMED) == 0);
21739618
MD
2184 }
2185 }
2186 if (error == 0 && vp == NULL)
2187 error = ENOENT;
2188 *vpp = vp;
2189 return(error);
2190}
2191
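/*
 * Example usage sketch (hypothetical helper, not part of this file):
 * resolving a referenced, locked nchandle to a vnode.  ENOENT simply
 * means the entry resolved to a negative cache hit.
 */
static int
example_nch_to_vnode(struct nchandle *nch, struct ucred *cred,
		     struct vnode **vpp)
{
	/*
	 * nch must be locked exclusively if it is still unresolved.
	 * On success *vpp is returned referenced and locked shared;
	 * release it later with vput().
	 */
	return (cache_vget(nch, cred, LK_SHARED, vpp));
}
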
79fd1696
MD
2192/*
2193 * Similar to cache_vget() but only acquires a ref on the vnode.
2194 *
2195 * NOTE: The passed-in ncp must be locked exclusively if it is initially
2196 * unresolved. If a reclaim race occurs the passed-in ncp will be
2197 * relocked exclusively before being re-resolved.
2198 */
21739618 2199int
28623bf9 2200cache_vref(struct nchandle *nch, struct ucred *cred, struct vnode **vpp)
21739618 2201{
28623bf9 2202 struct namecache *ncp;
21739618
MD
2203 struct vnode *vp;
2204 int error;
2205
28623bf9 2206 ncp = nch->ncp;
21739618
MD
2207again:
2208 vp = NULL;
2247fe02 2209 if (ncp->nc_flag & NCF_UNRESOLVED)
28623bf9 2210 error = cache_resolve(nch, cred);
2247fe02 2211 else
21739618 2212 error = 0;
2247fe02 2213
21739618 2214 if (error == 0 && (vp = ncp->nc_vp) != NULL) {
2247fe02
MD
2215 error = vget(vp, LK_SHARED);
2216 if (error) {
2217 /*
2218 * VRECLAIM race
2219 */
3c37c940 2220 if (error == ENOENT) {
2247fe02
MD
2221 kprintf("Warning: vnode reclaim race detected "
2222 "in cache_vget on %p (%s)\n",
2223 vp, ncp->nc_name);
79fd1696
MD
2224 _cache_unlock(ncp);
2225 _cache_lock(ncp);
3c37c940 2226 _cache_setunresolved(ncp);
3c37c940
MD
2227 goto again;
2228 }
2247fe02
MD
2229
2230 /*
2231 * Not a reclaim race, some other error.
2232 */
2233 KKASSERT(ncp->nc_vp == vp);
2234 vp = NULL;
3c37c940 2235 } else {
2247fe02
MD
2236 KKASSERT(ncp->nc_vp == vp);
2237 KKASSERT((vp->v_flag & VRECLAIMED) == 0);
3c37c940
MD
2238 /* caller does not want a lock */
2239 vn_unlock(vp);
21739618
MD
2240 }
2241 }
2242 if (error == 0 && vp == NULL)
2243 error = ENOENT;
2244 *vpp = vp;
2245 return(error);
2246}
2247
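/*
 * Example usage sketch (hypothetical helper, not part of this file):
 * acquiring only a reference on the vnode behind a locked nchandle.
 */
static int
example_nch_to_vref(struct nchandle *nch, struct ucred *cred,
		    struct vnode **vpp)
{
	/* *vpp comes back referenced but unlocked; vrele() when done */
	return (cache_vref(nch, cred, vpp));
}
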
c0c70b27
MD
2248/*
2249 * Return a referenced vnode representing the parent directory of
f63911bf
MD
2250 * ncp.
2251 *
2252 * Because the caller has locked the ncp it should not be possible for
2253 * the parent ncp to go away. However, the parent can unresolve its
2254 * dvp at any time so we must be able to acquire a lock on the parent
2255 * to safely access nc_vp.
5312fa43 2256 *
f63911bf
MD
2257 * We have to leave par unlocked when vget()ing dvp to avoid a deadlock,
2258 * so use vhold()/vdrop() while holding the lock to prevent dvp from
2259 * getting destroyed.
2247fe02 2260 *
79fd1696
MD
2261 * NOTE: vhold() is allowed when dvp has 0 refs if we hold a
2262 * lock on the ncp in question.
c0c70b27 2263 */
5312fa43 2264static struct vnode *
c0c70b27
MD
2265cache_dvpref(struct namecache *ncp)
2266{
5312fa43 2267 struct namecache *par;
c0c70b27 2268 struct vnode *dvp;
c0c70b27 2269
5312fa43
MD
2270 dvp = NULL;
2271 if ((par = ncp->nc_parent) != NULL) {
f63911bf 2272 _cache_hold(par);
2247fe02
MD
2273 _cache_lock(par);
2274 if ((par->nc_flag & NCF_UNRESOLVED) == 0) {
2275 if ((dvp = par->nc_vp) != NULL)
2276 vhold(dvp);
2277 }
2278 _cache_unlock(par);
2279 if (dvp) {
2280 if (vget(dvp, LK_SHARED) == 0) {
2281 vn_unlock(dvp);
2282 vdrop(dvp);
2283 /* return refd, unlocked dvp */
2284 } else {
2285 vdrop(dvp);
2286 dvp = NULL;
5312fa43
MD
2287 }
2288 }
f63911bf 2289 _cache_drop(par);
5312fa43
MD
2290 }
2291 return(dvp);
c0c70b27
MD
2292}
2293
fad57d0e
MD
2294/*
2295 * Convert a directory vnode to a namecache record without any other
2296 * knowledge of the topology. This ONLY works with directory vnodes and
2297 * is ONLY used by the NFS server. dvp must be refd but unlocked, and the
2298 * returned ncp (if not NULL) will be held and unlocked.
2299 *
2300 * If 'makeit' is 0 and dvp has no existing namecache record, NULL is returned.
2301 * If 'makeit' is 1 we attempt to track-down and create the namecache topology
2302 * for dvp. This will fail only if the directory has been deleted out from
2303 * under the caller.
2304 *
2305 * Callers must always check for a NULL return no matter the value of 'makeit'.
a0d57516
MD
2306 *
2307 * To avoid underflowing the kernel stack each recursive call increments
2308 * the makeit variable.
fad57d0e
MD
2309 */
2310
28623bf9 2311static int cache_inefficient_scan(struct nchandle *nch, struct ucred *cred,
33387738 2312 struct vnode *dvp, char *fakename);
a0d57516 2313static int cache_fromdvp_try(struct vnode *dvp, struct ucred *cred,
cc4c3b52 2314 struct vnode **saved_dvp);
fad57d0e 2315
28623bf9
MD
2316int
2317cache_fromdvp(struct vnode *dvp, struct ucred *cred, int makeit,
2318 struct nchandle *nch)
fad57d0e 2319{
cc4c3b52 2320 struct vnode *saved_dvp;
fad57d0e 2321 struct vnode *pvp;
33387738 2322 char *fakename;
fad57d0e
MD
2323 int error;
2324
28623bf9
MD
2325 nch->ncp = NULL;
2326 nch->mount = dvp->v_mount;
cc4c3b52 2327 saved_dvp = NULL;
33387738 2328 fakename = NULL;
a0d57516 2329
269a08e4
MD
2330 /*
2331 * Handle the makeit == 0 degenerate case
2332 */
2333 if (makeit == 0) {
b8cd5817 2334 spin_lock_shared(&dvp->v_spin);
269a08e4
MD
2335 nch->ncp = TAILQ_FIRST(&dvp->v_namecache);
2336 if (nch->ncp)
2337 cache_hold(nch);
b8cd5817 2338 spin_unlock_shared(&dvp->v_spin);
269a08e4
MD
2339 }
2340
fad57d0e 2341 /*
f63911bf 2342 * Loop until resolution, inside code will break out on error.
fad57d0e 2343 */
f63911bf
MD
2344 while (makeit) {
2345 /*
2346 * Break out if we successfully acquire a working ncp.
2347 */
b8cd5817 2348 spin_lock_shared(&dvp->v_spin);
28623bf9 2349 nch->ncp = TAILQ_FIRST(&dvp->v_namecache);
f63911bf
MD
2350 if (nch->ncp) {
2351 cache_hold(nch);
b8cd5817 2352 spin_unlock_shared(&dvp->v_spin);
f63911bf
MD
2353 break;
2354 }
b8cd5817 2355 spin_unlock_shared(&dvp->v_spin);
fad57d0e 2356
fad57d0e
MD
2357 /*
2358 * If dvp is the root of its filesystem it should already
2359 * have a namecache pointer associated with it as a side
2360 * effect of the mount, but it may have been disassociated.
2361 */
2362 if (dvp->v_flag & VROOT) {
28623bf9
MD
2363 nch->ncp = _cache_get(nch->mount->mnt_ncmountpt.ncp);
2364 error = cache_resolve_mp(nch->mount);
2365 _cache_put(nch->ncp);
fad57d0e 2366 if (ncvp_debug) {
6ea70f76 2367 kprintf("cache_fromdvp: resolve root of mount %p error %d",
fad57d0e
MD
2368 dvp->v_mount, error);
2369 }
2370 if (error) {
2371 if (ncvp_debug)
6ea70f76 2372 kprintf(" failed\n");
28623bf9 2373 nch->ncp = NULL;
fad57d0e
MD
2374 break;
2375 }
2376 if (ncvp_debug)
6ea70f76 2377 kprintf(" succeeded\n");
fad57d0e
MD
2378 continue;
2379 }
2380
a0d57516
MD
2381 /*
2382 * If we are recursed too deeply resort to an O(n^2)
2383 * algorithm to resolve the namecache topology. The
cc4c3b52 2384 * resolved pvp is left referenced in saved_dvp to
a0d57516
MD
2385 * prevent the tree from being destroyed while we loop.
2386 */
2387 if (makeit > 20) {
cc4c3b52 2388 error = cache_fromdvp_try(dvp, cred, &saved_dvp);
a0d57516 2389 if (error) {
6ea70f76 2390 kprintf("lookupdotdot(longpath) failed %d "
a0d57516 2391 "dvp %p\n", error, dvp);
1142bff7 2392 nch->ncp = NULL;
a0d57516
MD
2393 break;
2394 }
2395 continue;
2396 }
2397
fad57d0e
MD
2398 /*
2399 * Get the parent directory and resolve its ncp.
2400 */
33387738
MD
2401 if (fakename) {
2402 kfree(fakename, M_TEMP);
2403 fakename = NULL;
2404 }
2405 error = vop_nlookupdotdot(*dvp->v_ops, dvp, &pvp, cred,
2406 &fakename);
fad57d0e 2407 if (error) {
6ea70f76 2408 kprintf("lookupdotdot failed %d dvp %p\n", error, dvp);
fad57d0e
MD
2409 break;
2410 }
a11aaa81 2411 vn_unlock(pvp);
fad57d0e
MD
2412
2413 /*
1142bff7
MD
2414 * Reuse makeit as a recursion depth counter. On success
2415 * nch will be fully referenced.
fad57d0e 2416 */
28623bf9 2417 cache_fromdvp(pvp, cred, makeit + 1, nch);
fad57d0e 2418 vrele(pvp);
28623bf9 2419 if (nch->ncp == NULL)
fad57d0e
MD
2420 break;
2421
2422 /*
2423 * Do an inefficient scan of pvp (embodied by ncp) to look
2424 * for dvp. This will create a namecache record for dvp on
2425 * success. We loop up to recheck on success.
2426 *
2427 * ncp and dvp are both held but not locked.
2428 */
33387738 2429 error = cache_inefficient_scan(nch, cred, dvp, fakename);
fad57d0e 2430 if (error) {
6ea70f76 2431 kprintf("cache_fromdvp: scan %p (%s) failed on dvp=%p\n",
28623bf9 2432 pvp, nch->ncp->nc_name, dvp);
1142bff7
MD
2433 cache_drop(nch);
2434 /* nch was NULLed out, reload mount */
2435 nch->mount = dvp->v_mount;
fad57d0e
MD
2436 break;
2437 }
2438 if (ncvp_debug) {
6ea70f76 2439 kprintf("cache_fromdvp: scan %p (%s) succeeded\n",
28623bf9 2440 pvp, nch->ncp->nc_name);
fad57d0e 2441 }
1142bff7
MD
2442 cache_drop(nch);
2443 /* nch was NULLed out, reload mount */
2444 nch->mount = dvp->v_mount;
fad57d0e 2445 }
28623bf9
MD
2446
2447 /*
f63911bf 2448 * If nch->ncp is non-NULL it will have been held already.
28623bf9 2449 */
f63911bf
MD
2450 if (fakename)
2451 kfree(fakename, M_TEMP);
cc4c3b52
MD
2452 if (saved_dvp)
2453 vrele(saved_dvp);
28623bf9
MD
2454 if (nch->ncp)
2455 return (0);
2456 return (EINVAL);
fad57d0e
MD
2457}
2458
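/*
 * Example usage sketch (hypothetical helper, not part of this file):
 * how an NFS-server style caller might reconstruct a namecache entry
 * for a directory vnode obtained from a file handle.  dvp must be
 * referenced but unlocked.
 */
static int
example_nch_from_dirvnode(struct vnode *dvp, struct ucred *cred,
			  struct nchandle *nch)
{
	/*
	 * makeit != 0 asks cache_fromdvp() to build any missing
	 * topology.  On success nch->ncp is returned held (not locked)
	 * and the caller must eventually cache_drop(nch).
	 */
	return (cache_fromdvp(dvp, cred, 1, nch));
}
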
a0d57516
MD
2459/*
2460 * Go up the chain of parent directories until we find something
2461 * we can resolve into the namecache. This is very inefficient.
2462 */
2463static
2464int
2465cache_fromdvp_try(struct vnode *dvp, struct ucred *cred,
cc4c3b52 2466 struct vnode **saved_dvp)
a0d57516 2467{
28623bf9 2468 struct nchandle nch;
a0d57516
MD
2469 struct vnode *pvp;
2470 int error;
2471 static time_t last_fromdvp_report;
33387738 2472 char *fakename;
a0d57516
MD
2473
2474 /*
2475 * Loop getting the parent directory vnode until we get something we
2476 * can resolve in the namecache.
2477 */
2478 vref(dvp);
28623bf9 2479 nch.mount = dvp->v_mount;
1142bff7 2480 nch.ncp = NULL;
33387738 2481 fakename = NULL;
28623bf9 2482
a0d57516 2483 for (;;) {
33387738
MD
2484 if (fakename) {
2485 kfree(fakename, M_TEMP);
2486 fakename = NULL;
2487 }
2488 error = vop_nlookupdotdot(*dvp->v_ops, dvp, &pvp, cred,
2489 &fakename);
a0d57516
MD
2490 if (error) {
2491 vrele(dvp);
33387738 2492 break;
a0d57516 2493 }
a11aaa81 2494 vn_unlock(pvp);
b8cd5817 2495 spin_lock_shared(&pvp->v_spin);
28623bf9
MD
2496 if ((nch.ncp = TAILQ_FIRST(&pvp->v_namecache)) != NULL) {
2497 _cache_hold(nch.ncp);
b8cd5817 2498 spin_unlock_shared(&pvp->v_spin);
a0d57516
MD
2499 vrele(pvp);
2500 break;
2501 }
b8cd5817 2502 spin_unlock_shared(&pvp->v_spin);
a0d57516 2503 if (pvp->v_flag & VROOT) {
28623bf9
MD
2504 nch.ncp = _cache_get(pvp->v_mount->mnt_ncmountpt.ncp);
2505 error = cache_resolve_mp(nch.mount);
2506 _cache_unlock(nch.ncp);
a0d57516
MD
2507 vrele(pvp);
2508 if (error) {
28623bf9 2509 _cache_drop(nch.ncp);
1142bff7 2510 nch.ncp = NULL;
a0d57516 2511 vrele(dvp);
a0d57516
MD
2512 }
2513 break;
2514 }
2515 vrele(dvp);
2516 dvp = pvp;
2517 }
33387738 2518 if (error == 0) {
cec73927
MD
2519 if (last_fromdvp_report != time_uptime) {
2520 last_fromdvp_report = time_uptime;
33387738
MD
2521 kprintf("Warning: extremely inefficient path "
2522 "resolution on %s\n",
2523 nch.ncp->nc_name);
2524 }
2525 error = cache_inefficient_scan(&nch, cred, dvp, fakename);
cc4c3b52 2526
33387738
MD
2527 /*
2528 * Hopefully dvp now has a namecache record associated with
2529 * it. Leave it referenced to prevent the kernel from
2530 * recycling the vnode. Otherwise extremely long directory
2531 * paths could result in endless recycling.
2532 */
2533 if (*saved_dvp)
2534 vrele(*saved_dvp);
2535 *saved_dvp = dvp;
1142bff7 2536 _cache_drop(nch.ncp);
33387738
MD
2537 }
2538 if (fakename)
2539 kfree(fakename, M_TEMP);
a0d57516
MD
2540 return (error);
2541}
2542
fad57d0e
MD
2543/*
2544 * Do an inefficient scan of the directory represented by ncp looking for
2545 * the directory vnode dvp. ncp must be held but not locked on entry and
2546 * will be held on return. dvp must be refd but not locked on entry and
2547 * will remain refd on return.
2548 *
2549 * Why do this at all? Well, due to its stateless nature the NFS server
2550 * converts file handles directly to vnodes without necessarily going through
2551 * the namecache ops that would otherwise create the namecache topology
2552 * leading to the vnode. We could either (1) Change the namecache algorithms
2553 * to allow disconnected namecache records that are re-merged opportunistically,
2554 * or (2) Make the NFS server backtrack and scan to recover a connected
2555 * namecache topology in order to then be able to issue new API lookups.
2556 *
2557 * It turns out that (1) is a huge mess. It takes a nice clean set of
2558 * namecache algorithms and introduces a lot of complication in every subsystem
2559 * that calls into the namecache to deal with the re-merge case, especially
2560 * since we are using the namecache to placehold negative lookups and the
2561 * vnode might not be immediately assigned. (2) is certainly far less
2562 * efficient than (1), but since we are only talking about directories here
2563 * (which are likely to remain cached), the case does not actually run all
2564 * that often and has the supreme advantage of not polluting the namecache
2565 * algorithms.
33387738
MD
2566 *
2567 * If a fakename is supplied just construct a namecache entry using the
2568 * fake name.
fad57d0e
MD
2569 */
2570static int
28623bf9 2571cache_inefficient_scan(struct nchandle *nch, struct ucred *cred,
33387738 2572 struct vnode *dvp, char *fakename)
fad57d0e
MD
2573{
2574 struct nlcomponent nlc;
28623bf9 2575 struct nchandle rncp;
fad57d0e
MD
2576 struct dirent *den;
2577 struct vnode *pvp;
2578 struct vattr vat;
2579 struct iovec iov;
2580 struct uio uio;
fad57d0e
MD
2581 int blksize;
2582 int eofflag;
4d22f42a 2583 int bytes;
fad57d0e
MD
2584 char *rbuf;
2585 int error;
fad57d0e
MD
2586
2587 vat.va_blocksize = 0;
87de5057 2588 if ((error = VOP_GETATTR(dvp, &vat)) != 0)
fad57d0e 2589 return (error);
2247fe02
MD
2590 cache_lock(nch);
2591 error = cache_vref(nch, cred, &pvp);
2592 cache_unlock(nch);
2593 if (error)
fad57d0e 2594 return (error);
973c11b9 2595 if (ncvp_debug) {
e16d1eb4 2596 kprintf("inefficient_scan of (%p,%s): directory iosize %ld "
973c11b9 2597 "vattr fileid = %lld\n",
e16d1eb4 2598 nch->ncp, nch->ncp->nc_name,
973c11b9
MD
2599 vat.va_blocksize,
2600 (long long)vat.va_fileid);
2601 }
33387738
MD
2602
2603 /*
2604 * Use the supplied fakename if not NULL. Fake names are typically
2605 * not in the actual filesystem hierarchy. This is used by HAMMER
2606 * to glue @@timestamp recursions together.
2607 */
2608 if (fakename) {
2609 nlc.nlc_nameptr = fakename;
2610 nlc.nlc_namelen = strlen(fakename);
2611 rncp = cache_nlookup(nch, &nlc);
2612 goto done;
2613 }
2614
fad57d0e
MD
2615 if ((blksize = vat.va_blocksize) == 0)
2616 blksize = DEV_BSIZE;
efda3bd0 2617 rbuf = kmalloc(blksize, M_TEMP, M_WAITOK);
28623bf9 2618 rncp.ncp = NULL;
fad57d0e
MD
2619
2620 eofflag = 0;
2621 uio.uio_offset = 0;
fad57d0e 2622again:
fad57d0e
MD
2623 iov.iov_base = rbuf;
2624 iov.iov_len = blksize;
2625 uio.uio_iov = &iov;
2626 uio.uio_iovcnt = 1;
2627 uio.uio_resid = blksize;
2628 uio.uio_segflg = UIO_SYSSPACE;
2629 uio.uio_rw = UIO_READ;
2630 uio.uio_td = curthread;
2631
fad57d0e 2632 if (ncvp_debug >= 2)
6ea70f76 2633 kprintf("cache_inefficient_scan: readdir @ %08x\n", (int)uio.uio_offset);
4d22f42a 2634 error = VOP_READDIR(pvp, &uio, cred, &eofflag, NULL, NULL);
fad57d0e 2635 if (error == 0) {
4d22f42a
MD
2636 den = (struct dirent *)rbuf;
2637 bytes = blksize - uio.uio_resid;
2638
2639 while (bytes > 0) {
2640 if (ncvp_debug >= 2) {
6ea70f76 2641 kprintf("cache_inefficient_scan: %*.*s\n",
4d22f42a
MD
2642 den->d_namlen, den->d_namlen,
2643 den->d_name);
2644 }
fad57d0e 2645 if (den->d_type != DT_WHT &&
01f31ab3 2646 den->d_ino == vat.va_fileid) {
4d22f42a 2647 if (ncvp_debug) {
6ea70f76 2648 kprintf("cache_inefficient_scan: "
50626622 2649 "MATCHED inode %lld path %s/%*.*s\n",
973c11b9
MD
2650 (long long)vat.va_fileid,
2651 nch->ncp->nc_name,
4d22f42a
MD
2652 den->d_namlen, den->d_namlen,
2653 den->d_name);
2654 }
fad57d0e
MD
2655 nlc.nlc_nameptr = den->d_name;
2656 nlc.nlc_namelen = den->d_namlen;
28623bf9
MD
2657 rncp = cache_nlookup(nch, &nlc);
2658 KKASSERT(rncp.ncp != NULL);
fad57d0e
MD
2659 break;
2660 }
01f31ab3
JS
2661 bytes -= _DIRENT_DIRSIZ(den);
2662 den = _DIRENT_NEXT(den);
fad57d0e 2663 }
28623bf9 2664 if (rncp.ncp == NULL && eofflag == 0 && uio.uio_resid != blksize)
fad57d0e
MD
2665 goto again;
2666 }
33387738
MD
2667 kfree(rbuf, M_TEMP);
2668done:
885ecb13 2669 vrele(pvp);
28623bf9
MD
2670 if (rncp.ncp) {
2671 if (rncp.ncp->nc_flag & NCF_UNRESOLVED) {
4b5bbb78 2672 _cache_setvp(rncp.mount, rncp.ncp, dvp);
fad57d0e 2673 if (ncvp_debug >= 2) {
6ea70f76 2674 kprintf("cache_inefficient_scan: setvp %s/%s = %p\n",
28623bf9 2675 nch->ncp->nc_name, rncp.ncp->nc_name, dvp);
fad57d0e
MD
2676 }
2677 } else {
2678 if (ncvp_debug >= 2) {
6ea70f76 2679 kprintf("cache_inefficient_scan: setvp %s/%s already set %p/%p\n",
28623bf9
MD
2680 nch->ncp->nc_name, rncp.ncp->nc_name, dvp,
2681 rncp.ncp->nc_vp);
fad57d0e
MD
2682 }
2683 }
28623bf9
MD
2684 if (rncp.ncp->nc_vp == NULL)
2685 error = rncp.ncp->nc_error;
1142bff7
MD
2686 /*
2687 * Release rncp after a successful nlookup. rncp was fully
2688 * referenced.
2689 */
2690 cache_put(&rncp);
fad57d0e 2691 } else {
6ea70f76 2692 kprintf("cache_inefficient_scan: dvp %p NOT FOUND in %s\n",
28623bf9 2693 dvp, nch->ncp->nc_name);
fad57d0e
MD
2694 error = ENOENT;
2695 }
fad57d0e
MD
2696 return (error);
2697}
2698
984263bc 2699/*
67773eb3 2700 * Zap a namecache entry. The ncp is unconditionally set to an unresolved
bf3f67a7 2701 * state, which disassociates it from its vnode or pcpu_ncache[n].neg_list.
7ea21ed1 2702 *
67773eb3 2703 * Then, if there are no additional references to the ncp and no children,
f63911bf 2704 * the ncp is removed from the topology and destroyed.
7ea21ed1 2705 *
67773eb3
MD
2706 * References and/or children may exist if the ncp is in the middle of the
2707 * topology, preventing the ncp from being destroyed.
7ea21ed1 2708 *
67773eb3
MD
2709 * This function must be called with the ncp held and locked and will unlock
2710 * and drop it during zapping.
f63911bf 2711 *
65870584
MD
2712 * If nonblock is non-zero and the parent ncp cannot be locked we give up.
2713 * This case can occur in the cache_drop() path.
2714 *
f63911bf
MD
2715 * This function may return a held (but NOT locked) parent node which the
2716 * caller must drop. We do this so _cache_drop() can loop, to avoid
2717 * blowing out the kernel stack.
2718 *
2719 * WARNING! For MPSAFE operation this routine must acquire up to three
2720 * spin locks to be able to safely test nc_refs. Lock order is
2721 * very important.
2722 *
2723 * hash spinlock if on hash list
2724 * parent spinlock if child of parent
2725 * (the ncp is unresolved so there is no vnode association)
984263bc 2726 */
f63911bf 2727static struct namecache *
65870584 2728cache_zap(struct namecache *ncp, int nonblock)
984263bc 2729{
7ea21ed1 2730 struct namecache *par;
f63911bf 2731 struct vnode *dropvp;
f0181d63 2732 struct nchash_head *nchpp;
f63911bf 2733 int refs;
7ea21ed1
MD
2734
2735 /*
2736 * Disassociate the vnode or negative cache ref and set NCF_UNRESOLVED.
2737 */
28623bf9 2738 _cache_setunresolved(ncp);
7ea21ed1
MD
2739
2740 /*
2741 * Try to scrap the entry and possibly tail-recurse on its parent.
2743 * We only scrap unref'd (other than our ref) unresolved entries,
2743 * we do not scrap 'live' entries.
f63911bf
MD
2744 *
2745 * Note that once the spinlocks are acquired if nc_refs == 1 no
2746 * other references are possible. If it isn't, however, we have
2747 * to decrement but also be sure to avoid a 1->0 transition.
7ea21ed1 2748 */
f63911bf
MD
2749 KKASSERT(ncp->nc_flag & NCF_UNRESOLVED);
2750 KKASSERT(ncp->nc_refs > 0);
7ea21ed1 2751
f63911bf 2752 /*
65870584
MD
2753 * Acquire locks. Note that the parent can't go away while we hold
2754 * a child locked.
f63911bf 2755 */
f0181d63 2756 nchpp = NULL;
2247fe02 2757 if ((par = ncp->nc_parent) != NULL) {
65870584
MD
2758 if (nonblock) {
2759 for (;;) {
2760 if (_cache_lock_nonblock(par) == 0)
2761 break;
65870584
MD
2762 refs = ncp->nc_refs;
2763 ncp->nc_flag |= NCF_DEFEREDZAP;
2764 ++numdefered; /* MP race ok */
2765 if (atomic_cmpset_int(&ncp->nc_refs,
2766 refs, refs - 1)) {
2767 _cache_unlock(ncp);
2768 return(NULL);
2769 }
2770 cpu_pause();
2771 }
2772 _cache_hold(par);
2773 } else {
2774 _cache_hold(par);
2775 _cache_lock(par);
2776 }
f0181d63
MD
2777 nchpp = ncp->nc_head;
2778 spin_lock(&nchpp->spin);
f63911bf 2779 }
7ea21ed1 2780
f63911bf 2781 /*
f0181d63
MD
2782 * At this point if we find refs == 1 it should not be possible for
2783 * anyone else to have access to the ncp. We are holding the only
2784 * possible access point left (nchpp) spin-locked.
2785 *
f63911bf
MD
2786 * If someone other than us has a ref or we have children
2787 * we cannot zap the entry. The 1->0 transition and any
2788 * further list operation is protected by the spinlocks
2789 * we have acquired but other transitions are not.
2790 */
2791 for (;;) {
2792 refs = ncp->nc_refs;
fda4c5f3 2793 cpu_ccfence();
f63911bf
MD
2794 if (refs == 1 && TAILQ_EMPTY(&ncp->nc_list))
2795 break;
2796 if (atomic_cmpset_int(&ncp->nc_refs, refs, refs - 1)) {
2247fe02 2797 if (par) {
f0181d63 2798 spin_unlock(&nchpp->spin);
2247fe02
MD
2799 _cache_put(par);
2800 }
f63911bf
MD
2801 _cache_unlock(ncp);
2802 return(NULL);
7ea21ed1 2803 }
2247fe02 2804 cpu_pause();
f63911bf 2805 }
67773eb3 2806
f63911bf
MD
2807 /*
2808 * We are the only ref and with the spinlocks held no further
2809 * refs can be acquired by others.
2810 *
2811 * Remove us from the hash list and parent list. We have to
2812 * drop a ref on the parent's vp if the parent's list becomes
2813 * empty.
2814 */
f63911bf 2815 dropvp = NULL;
2247fe02 2816 if (par) {
bf3f67a7
MD
2817 struct pcpu_ncache *pn = &pcpu_ncache[mycpu->gd_cpuid];
2818
f0181d63 2819 KKASSERT(nchpp == ncp->nc_head);
2247fe02
MD
2820 LIST_REMOVE(ncp, nc_hash);
2821 TAILQ_REMOVE(&par->nc_list, ncp, nc_entry);
bf3f67a7
MD
2822 atomic_add_long(&pn->vfscache_count, -1);
2823 if (TAILQ_EMPTY(&ncp->nc_list))
2824 atomic_add_long(&pn->vfscache_leafs, -1);
2825
2826 if (TAILQ_EMPTY(&par->nc_list)) {
2827 atomic_add_long(&pn->vfscache_leafs, 1);
2828 if (par->nc_vp)
2829 dropvp = par->nc_vp;
2830 }
2247fe02
MD
2831 ncp->nc_head = NULL;
2832 ncp->nc_parent = NULL;
287a8577 2833 spin_unlock(&nchpp->spin);
2247fe02
MD
2834 _cache_unlock(par);
2835 } else {
2836 KKASSERT(ncp->nc_head == NULL);
7ea21ed1 2837 }
f63911bf
MD
2838
2839 /*
2840 * ncp should not have picked up any refs. Physically
2841 * destroy the ncp.
2842 */
fda4c5f3
MD
2843 if (ncp->nc_refs != 1) {
2844 int save_refs = ncp->nc_refs;
2845 cpu_ccfence();
2846 panic("cache_zap: %p bad refs %d (%d)\n",
2847 ncp, save_refs, atomic_fetchadd_int(&ncp->nc_refs, 0));
2848 }
f63911bf 2849 KKASSERT(ncp->nc_refs == 1);
f63911bf
MD
2850 /* _cache_unlock(ncp) not required */
2851 ncp->nc_refs = -1; /* safety */
2852 if (ncp->nc_name)
2853 kfree(ncp->nc_name, M_VFSCACHE);
2854 kfree(ncp, M_VFSCACHE);
2855
2856 /*
2857 * Delayed drop (we had to release our spinlocks)
2858 *
2859 * The refed parent (if not NULL) must be dropped. The
2860 * caller is responsible for looping.
2861 */
2862 if (dropvp)
2863 vdrop(dropvp);
2864 return(par);
984263bc
MD
2865}
2866
65870584
MD
2867/*
2868 * Clean up dangling negative cache and deferred-drop entries in the
2869 * namecache.
6363f268
MD
2870 *
2871 * This routine is called in the critical path and also called from
2872 * vnlru(). When called from vnlru we use a lower limit to try to
2873 * deal with the negative cache before the critical path has to start
2874 * dealing with it.
65870584 2875 */
9e10d70b
MD
2876typedef enum { CHI_LOW, CHI_HIGH } cache_hs_t;
2877
6363f268
MD
2878static cache_hs_t neg_cache_hysteresis_state[2] = { CHI_LOW, CHI_LOW };
2879static cache_hs_t pos_cache_hysteresis_state[2] = { CHI_LOW, CHI_LOW };
62d0f1f0 2880
62d0f1f0 2881void
6363f268 2882cache_hysteresis(int critpath)
62d0f1f0 2883{
bf3f67a7
MD
2884 long poslimit;
2885 long neglimit = maxvnodes / ncnegfactor;
2886 long xnumcache = vfscache_leafs;
6363f268
MD
2887
2888 if (critpath == 0)
2889 neglimit = neglimit * 8 / 10;
9e10d70b 2890
62d0f1f0
MD
2891 /*
2892 * Don't cache too many negative hits. We use hysteresis to reduce
2893 * the impact on the critical path.
2894 */
6363f268 2895 switch(neg_cache_hysteresis_state[critpath]) {
62d0f1f0 2896 case CHI_LOW:
bf3f67a7 2897 if (vfscache_negs > MINNEG && vfscache_negs > neglimit) {
6363f268
MD
2898 if (critpath)
2899 _cache_cleanneg(ncnegflush);
2900 else
2901 _cache_cleanneg(ncnegflush +
bf3f67a7 2902 vfscache_negs - neglimit);
6363f268 2903 neg_cache_hysteresis_state[critpath] = CHI_HIGH;
62d0f1f0
MD
2904 }
2905 break;
2906 case CHI_HIGH:
bf3f67a7
MD
2907 if (vfscache_negs > MINNEG * 9 / 10 &&
2908 vfscache_negs * 9 / 10 > neglimit
62d0f1f0 2909 ) {
6363f268
MD
2910 if (critpath)
2911 _cache_cleanneg(ncnegflush);
2912 else
2913 _cache_cleanneg(ncnegflush +
bf3f67a7
MD
2914 vfscache_negs * 9 / 10 -
2915 neglimit);
62d0f1f0 2916 } else {
6363f268 2917 neg_cache_hysteresis_state[critpath] = CHI_LOW;
9e10d70b
MD
2918 }
2919 break;
2920 }
2921
2922 /*
2923 * Don't cache too many positive hits. We use hysteresis to reduce
2924 * the impact on the critical path.
2925 *
2926 * Excessive positive hits can accumulate due to large numbers of
2927 * hardlinks (the vnode cache will not prevent hl ncps from growing
2928 * into infinity).
2929 */
2930 if ((poslimit = ncposlimit) == 0)
9629eb35 2931 poslimit = maxvnodes * 2;
6363f268
MD
2932 if (critpath == 0)
2933 poslimit = poslimit * 8 / 10;
9e10d70b 2934
6363f268 2935 switch(pos_cache_hysteresis_state[critpath]) {
9e10d70b 2936 case CHI_LOW:
6363f268
MD
2937 if (xnumcache > poslimit && xnumcache > MINPOS) {
2938 if (critpath)
2939 _cache_cleanpos(ncposflush);
2940 else
2941 _cache_cleanpos(ncposflush +
2942 xnumcache - poslimit);
2943 pos_cache_hysteresis_state[critpath] = CHI_HIGH;
9e10d70b
MD
2944 }
2945 break;
2946 case CHI_HIGH:
6363f268
MD
2947 if (xnumcache > poslimit * 5 / 6 && xnumcache > MINPOS) {
2948 if (critpath)
2949 _cache_cleanpos(ncposflush);
2950 else
2951 _cache_cleanpos(ncposflush +
2952 xnumcache - poslimit * 5 / 6);
9e10d70b 2953 } else {
6363f268 2954 pos_cache_hysteresis_state[critpath] = CHI_LOW;
62d0f1f0
MD
2955 }
2956 break;
2957 }
65870584
MD
2958
2959 /*
2960 * Clean out dangling deferred-zap ncps which could not
2961 * be cleanly dropped if too many build up. Note
2962 * that numdefered is not an exact number as such ncps
2963 * can be reused and the counter is not handled in a MP
2964 * safe manner by design.
2965 */
6363f268 2966 if (numdefered > neglimit) {
65870584
MD
2967 _cache_cleandefered();
2968 }
62d0f1f0
MD
2969}
2970
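/*
 * Illustrative sketch (hypothetical code, not part of this file): the
 * two-state hysteresis pattern used above, reduced to its core.
 * Cleaning starts only once the count exceeds the limit and continues
 * until it drops well below it, which avoids flapping right at the
 * threshold.
 */
static long
example_hysteresis(long count, long limit)
{
	static int high;	/* sticky state: 1 while over the limit */
	long nclean = 0;

	if (high == 0) {
		if (count > limit) {
			high = 1;		/* crossed the upper threshold */
			nclean = count - limit;
		}
	} else {
		if (count * 9 / 10 > limit)
			nclean = count * 9 / 10 - limit;
		else
			high = 0;		/* dropped comfortably below */
	}
	return (nclean);	/* how many entries the caller should clean */
}
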
14c92d03
MD
2971/*
2972 * NEW NAMECACHE LOOKUP API
2973 *
2247fe02
MD
2974 * Lookup an entry in the namecache. The passed par_nch must be referenced
2975 * and unlocked. A referenced and locked nchandle with a non-NULL nch.ncp
2976 * is ALWAYS returned, even if the supplied component is illegal.
2977 *
fad57d0e 2978 * The resulting namecache entry should be returned to the system with
2247fe02 2979 * cache_put() or cache_unlock() + cache_drop().
14c92d03
MD
2980 *
2981 * namecache locks are recursive but care must be taken to avoid lock order
2247fe02
MD
2982 * reversals (hence why the passed par_nch must be unlocked). Locking
2983 * rules are to order for parent traversals, not for child traversals.
14c92d03
MD
2984 *
2985 * Nobody else will be able to manipulate the associated namespace (e.g.
2986 * create, delete, rename, rename-target) until the caller unlocks the
2987 * entry.
2988 *
2989 * The returned entry will be in one of three states: positive hit (non-null
2990 * vnode), negative hit (null vnode), or unresolved (NCF_UNRESOLVED is set).
2991 * Unresolved entries must be resolved through the filesystem to associate the
2992 * vnode and/or determine whether a positive or negative hit has occurred.
2993 *
2994 * It is not necessary to lock a directory in order to lock namespace under
2995 * that directory. In fact, it is explicitly not allowed to do that. A
2996 * directory is typically only locked when being created, renamed, or
2997 * destroyed.
2998 *
2999 * The directory (par) may be unresolved, in which case any returned child
3000 * will likely also be marked unresolved. Likely but not guaranteed. Since
fad57d0e
MD
3001 * the filesystem lookup requires a resolved directory vnode the caller is
3002 * responsible for resolving the namecache chain top-down. This API
14c92d03
MD
3003 * specifically allows whole chains to be created in an unresolved state.
3004 */
28623bf9
MD
3005struct nchandle
3006cache_nlookup(struct nchandle *par_nch, struct nlcomponent *nlc)
14c92d03 3007{
28623bf9 3008 struct nchandle nch;
690a3127
MD
3009 struct namecache *ncp;
3010 struct namecache *new_ncp;
f63911bf 3011 struct nchash_head *nchpp;
4b5bbb78 3012 struct mount *mp;
690a3127
MD
3013 u_int32_t hash;
3014 globaldata_t gd;
2247fe02 3015 int par_locked;
690a3127 3016
690a3127 3017 gd = mycpu;
4b5bbb78 3018 mp = par_nch->mount;
2247fe02
MD
3019 par_locked = 0;
3020
3021 /*
3022 * This is a good time to call it, no ncp's are locked by
3023 * the caller or us.
3024 */
6363f268 3025 cache_hysteresis(1);
690a3127 3026
690a3127
MD
3027 /*
3028 * Try to locate an existing entry
3029 */
3030 hash = fnv_32_buf(nlc->nlc_nameptr, nlc->nlc_namelen, FNV1_32_INIT);
28623bf9 3031 hash = fnv_32_buf(&par_nch->ncp, sizeof(par_nch->ncp), hash);
690a3127 3032 new_ncp = NULL;
f63911bf 3033 nchpp = NCHHASH(hash);
690a3127 3034restart:
b8cd5817
MD
3035 if (new_ncp)
3036 spin_lock(&nchpp->spin);
3037 else
3038 spin_lock_shared(&nchpp->spin);
3039
f63911bf 3040 LIST_FOREACH(ncp, &nchpp->list, nc_hash) {
690a3127
MD
3041 /*
3042 * Break out if we find a matching entry. Note that
e09206ba
MD
3043 * UNRESOLVED entries may match, but DESTROYED entries
3044 * do not.
690a3127 3045 */
28623bf9 3046 if (ncp->nc_parent == par_nch->ncp &&
690a3127 3047 ncp->nc_nlen == nlc->nlc_namelen &&
e09206ba
MD
3048 bcmp(ncp->nc_name, nlc->nlc_nameptr, ncp->nc_nlen) == 0 &&
3049 (ncp->nc_flag & NCF_DESTROYED) == 0
690a3127 3050 ) {
f63911bf 3051 _cache_hold(ncp);
b8cd5817
MD
3052 if (new_ncp)
3053 spin_unlock(&nchpp->spin);
3054 else
3055 spin_unlock_shared(&nchpp->spin);
2247fe02
MD
3056 if (par_locked) {
3057 _cache_unlock(par_nch->ncp);
3058 par_locked = 0;
3059 }
3060 if (_cache_lock_special(ncp) == 0) {
41b6212d
MD
3061 /*
3062 * Successfully locked but we must re-test
3063 * conditions that might have changed since
3064 * we did not have the lock before.
3065 */
fda4c5f3
MD
3066 if (ncp->nc_parent != par_nch->ncp ||
3067 ncp->nc_nlen != nlc->nlc_namelen ||
3068 bcmp(ncp->nc_name, nlc->nlc_nameptr,
3069 ncp->nc_nlen) ||
3070 (ncp->nc_flag & NCF_DESTROYED)) {
41b6212d
MD
3071 _cache_put(ncp);
3072 goto restart;
3073 }
4b5bbb78 3074 _cache_auto_unresolve(mp, ncp);
67773eb3 3075 if (new_ncp)
28623bf9 3076 _cache_free(new_ncp);
67773eb3
MD
3077 goto found;
3078 }
41b6212d 3079 _cache_get(ncp); /* cycle the lock to block */
28623bf9 3080 _cache_put(ncp);
f63911bf 3081 _cache_drop(ncp);
67773eb3 3082 goto restart;
690a3127
MD
3083 }
3084 }
3085
3086 /*
3087 * We failed to locate an entry, create a new entry and add it to
2247fe02
MD
3088 * the cache. The parent ncp must also be locked so we
3089 * can link into it.
3090 *
3091 * We have to relookup after possibly blocking in kmalloc or
3092 * when locking par_nch.
3093 *
3094 * NOTE: nlc_namelen can be 0 and nlc_nameptr NULL as a special
3095 * mount case, in which case nc_name will be NULL.
690a3127
MD
3096 */
3097 if (new_ncp == NULL) {
b8cd5817 3098 spin_unlock_shared(&nchpp->spin);
524c845c 3099 new_ncp = cache_alloc(nlc->nlc_namelen);
2247fe02
MD
3100 if (nlc->nlc_namelen) {
3101 bcopy(nlc->nlc_nameptr, new_ncp->nc_name,
3102 nlc->nlc_namelen);
3103 new_ncp->nc_name[nlc->nlc_namelen] = 0;
3104 }
3105 goto restart;
3106 }
b8cd5817
MD
3107
3108 /*
3109 * NOTE! The spinlock is held exclusively here because new_ncp
3110 * is non-NULL.
3111 */
2247fe02 3112 if (par_locked == 0) {
287a8577 3113 spin_unlock(&nchpp->spin);
2247fe02
MD
3114 _cache_lock(par_nch->ncp);
3115 par_locked = 1;
690a3127
MD
3116 goto restart;
3117 }
690a3127
MD
3118
3119 /*
2247fe02 3120 * WARNING! We still hold the spinlock. We have to set the hash
668b43c5 3121 * table entry atomically.
690a3127 3122 */
2247fe02
MD
3123 ncp = new_ncp;
3124 _cache_link_parent(ncp, par_nch->ncp, nchpp);
287a8577 3125 spin_unlock(&nchpp->spin);
2247fe02
MD
3126 _cache_unlock(par_nch->ncp);
3127 /* par_locked = 0 - not used */
690a3127 3128found:
fad57d0e
MD
3129 /*
3130 * stats and namecache size management
3131 */
3132 if (ncp->nc_flag & NCF_UNRESOLVED)
3133 ++gd->gd_nchstats->ncs_miss;
3134 else if (ncp->nc_vp)
3135 ++gd->gd_nchstats->ncs_goodhits;
3136 else
3137 ++gd->gd_nchstats->ncs_neghits;
4b5bbb78 3138 nch.mount = mp;
28623bf9 3139 nch.ncp = ncp;
3536c341
MD
3140 _cache_mntref(nch.mount);
3141
28623bf9 3142 return(nch);
690a3127
MD
3143}
3144
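/*
 * Example usage sketch (hypothetical helper, not part of this file):
 * basic use of the cache_nlookup() API described above.  par_nch is
 * assumed to be referenced and unlocked.
 */
static struct nchandle
example_lookup_name(struct nchandle *par_nch, char *name, int namelen)
{
	struct nlcomponent nlc;
	struct nchandle nch;

	nlc.nlc_nameptr = name;
	nlc.nlc_namelen = namelen;

	/*
	 * Always returns a referenced, locked nchandle; it may be
	 * unresolved, a positive hit, or a negative hit.  Release it
	 * with cache_put(), or cache_unlock() plus cache_drop().
	 */
	nch = cache_nlookup(par_nch, &nlc);
	return (nch);
}
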
79fd1696
MD
3145/*
3146 * Attempt to lookup a namecache entry and return with a shared namecache
3147 * lock.
3148 */
3149int
3150cache_nlookup_maybe_shared(struct nchandle *par_nch, struct nlcomponent *nlc,
3151 int excl, struct nchandle *res_nch)
3152{
3153 struct namecache *ncp;
3154 struct nchash_head *nchpp;
3155 struct mount *mp;
3156 u_int32_t hash;
3157 globaldata_t gd;
3158
3159 /*
3160 * If exclusive requested or shared namecache locks are disabled,
3161 * return failure.
3162 */
3163 if (ncp_shared_lock_disable || excl)
3164 return(EWOULDBLOCK);
3165
79fd1696
MD
3166 gd = mycpu;
3167 mp = par_nch->mount;
3168
3169 /*
3170 * This is a good time to call it, no ncp's are locked by
3171 * the caller or us.
3172 */
6363f268 3173 cache_hysteresis(1);
79fd1696
MD
3174
3175 /*
3176 * Try to locate an existing entry
3177 */
3178 hash = fnv_32_buf(nlc->nlc_nameptr, nlc->nlc_namelen, FNV1_32_INIT);
3179 hash = fnv_32_buf(&par_nch->ncp, sizeof(par_nch->ncp), hash);
3180 nchpp = NCHHASH(hash);
3181
b8cd5817 3182 spin_lock_shared(&nchpp->spin);
79fd1696
MD
3183
3184 LIST_FOREACH(ncp, &nchpp->list, nc_hash) {
79fd1696
MD
3185 /*
3186 * Break out if we find a matching entry. Note that
3187 * UNRESOLVED entries may match, but DESTROYED entries
3188 * do not.
3189 */
3190 if (ncp->nc_parent == par_nch->ncp &&
3191 ncp->nc_nlen == nlc->nlc_namelen &&
3192 bcmp(ncp->nc_name, nlc->nlc_nameptr, ncp->nc_nlen) == 0 &&
3193 (ncp->nc_flag & NCF_DESTROYED) == 0
3194 ) {
3195 _cache_hold(ncp);
b8cd5817 3196 spin_unlock_shared(&nchpp->spin);
79fd1696 3197 if (_cache_lock_shared_special(ncp) == 0) {
fda4c5f3
MD
3198 if (ncp->nc_parent == par_nch->ncp &&
3199 ncp->nc_nlen == nlc->nlc_namelen &&
3200 bcmp(ncp->nc_name, nlc->nlc_nameptr,
3201 ncp->nc_nlen) == 0 &&
79fd1696 3202 (ncp->nc_flag & NCF_DESTROYED) == 0 &&
fda4c5f3 3203 (ncp->nc_flag & NCF_UNRESOLVED) == 0 &&
79fd1696
MD
3204 _cache_auto_unresolve_test(mp, ncp) == 0) {
3205 goto found;
3206 }
3207 _cache_unlock(ncp);
3208 }
3209 _cache_drop(ncp);
b8cd5817 3210 spin_lock_shared(&nchpp->spin);
79fd1696
MD
3211 break;
3212 }
3213 }
3214
3215 /*
3216 * Failure
3217 */
b8cd5817 3218 spin_unlock_shared(&nchpp->spin);
79fd1696
MD
3219 return(EWOULDBLOCK);
3220
3221 /*
3222 * Success
3223 *
3224 * Note that nc_error might be non-zero (e.g ENOENT).
3225 */
3226found:
3227 res_nch->mount = mp;
3228 res_nch->ncp = ncp;
3229 ++gd->gd_nchstats->ncs_goodhits;
3536c341 3230 _cache_mntref(res_nch->mount);
79fd1696
MD
3231
3232 KKASSERT(ncp->nc_error != EWOULDBLOCK);
3233 return(ncp->nc_error);
3234}
3235
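/*
 * Example usage sketch (hypothetical helper, not part of this file):
 * the fallback pattern implied by cache_nlookup_maybe_shared(): try
 * the cheap shared-lock lookup first and fall back to the full
 * cache_nlookup() when it returns EWOULDBLOCK.
 */
static struct nchandle
example_lookup_prefer_shared(struct nchandle *par_nch,
			     struct nlcomponent *nlc)
{
	struct nchandle nch;

	if (cache_nlookup_maybe_shared(par_nch, nlc, 0, &nch) !=
	    EWOULDBLOCK) {
		/* found with a shared lock; nc_error may still be set */
		return (nch);
	}
	/* slow path: may create the entry, returns it referenced and locked */
	return (cache_nlookup(par_nch, nlc));
}
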
668b43c5
MD
3236/*
3237 * This is a non-blocking version of cache_nlookup() used by
3238 * nfs_readdirplusrpc_uio(). It can fail for any reason and
3239 * will return nch.ncp == NULL in that case.
3240 */
3241struct nchandle
3242cache_nlookup_nonblock(struct nchandle *par_nch, struct nlcomponent *nlc)
3243{
3244 struct nchandle nch;
3245 struct namecache *ncp;
3246 struct namecache *new_ncp;
3247 struct nchash_head *nchpp;
3248 struct mount *mp;
3249 u_int32_t hash;
3250 globaldata_t gd;
3251 int par_locked;
3252
668b43c5
MD
3253 gd = mycpu;
3254 mp = par_nch->mount;
3255 par_locked = 0;
3256
3257 /*
3258 * Try to locate an existing entry
3259 */
3260 hash = fnv_32_buf(nlc->nlc_nameptr, nlc->nlc_namelen, FNV1_32_INIT);
3261 hash = fnv_32_buf(&par_nch->ncp, sizeof(par_nch->ncp), hash);
3262 new_ncp = NULL;
3263 nchpp = NCHHASH(hash);
3264restart:
287a8577 3265 spin_lock(&nchpp->spin);
668b43c5 3266 LIST_FOREACH(ncp, &nchpp->list, nc_hash) {
668b43c5
MD
3267 /*
3268 * Break out if we find a matching entry. Note that
3269 * UNRESOLVED entries may match, but DESTROYED entries
3270 * do not.
3271 */
3272 if (ncp->nc_parent == par_nch->ncp &&
3273 ncp->nc_nlen == nlc->nlc_namelen &&
3274 bcmp(ncp->nc_name, nlc->nlc_nameptr, ncp->nc_nlen) == 0 &&
3275 (ncp->nc_flag & NCF_DESTROYED) == 0
3276 ) {
3277 _cache_hold(ncp);
287a8577 3278 spin_unlock(&nchpp->spin);
668b43c5
MD
3279 if (par_locked) {
3280 _cache_unlock(par_nch->ncp);
3281 par_locked = 0;
3282 }
3283 if (_cache_lock_special(ncp) == 0) {
f0181d63
MD
3284 if (ncp->nc_parent != par_nch->ncp ||
3285 ncp->nc_nlen != nlc->nlc_namelen ||
3286 bcmp(ncp->nc_name, nlc->nlc_nameptr, ncp->nc_nlen) ||
3287 (ncp->nc_flag & NCF_DESTROYED)) {
3288 kprintf("cache_lookup_nonblock: "
3289 "ncp-race %p %*.*s\n",
3290 ncp,
3291 nlc->nlc_namelen,
3292 nlc->nlc_namelen,
3293 nlc->nlc_nameptr);
3294 _cache_unlock(ncp);
3295 _cache_drop(ncp);
3296 goto failed;
3297 }
668b43c5
MD
3298 _cache_auto_unresolve(mp, ncp);
3299 if (new_ncp) {
3300 _cache_free(new_ncp);
3301 new_ncp = NULL;
3302 }
3303 goto found;
3304 }
3305 _cache_drop(ncp);
3306 goto failed;
3307 }
3308 }
3309
3310 /*
3311 * We failed to locate an entry, create a new entry and add it to
3312 * the cache. The parent ncp must also be locked so we
3313 * can link into it.
3314 *
3315 * We have to relookup after possibly blocking in kmalloc or
3316 * when locking par_nch.
3317 *
3318 * NOTE: nlc_namelen can be 0 and nlc_nameptr NULL as a special
3319 * mount case, in which case nc_name will be NULL.
3320 */
3321 if (new_ncp == NULL) {
287a8577 3322 spin_unlock(&nchpp->spin);
668b43c5
MD
3323 new_ncp = cache_alloc(nlc->nlc_namelen);
3324 if (nlc->nlc_namelen) {
3325 bcopy(nlc->nlc_nameptr, new_ncp->nc_name,
3326 nlc->nlc_namelen);
3327 new_ncp->nc_name[nlc->nlc_namelen] = 0;
3328 }
3329 goto restart;
3330 }
3331 if (par_locked == 0) {
287a8577 3332 spin_unlock(&nchpp->spin);
668b43c5
MD
3333 if (_cache_lock_nonblock(par_nch->ncp) == 0) {
3334 par_locked = 1;
3335 goto restart;
3336 }
3337 goto failed;
3338 }
3339
3340 /*
3341 * WARNING! We still hold the spinlock. We have to set the hash
3342 * table entry atomically.
3343 */
3344 ncp = new_ncp;
3345 _cache_link_parent(ncp, par_nch->ncp, nchpp);
287a8577 3346 spin_unlock(&nchpp->spin);
668b43c5
MD
3347 _cache_unlock(par_nch->ncp);
3348 /* par_locked = 0 - not used */
3349found:
3350 /*
3351 * stats and namecache size management
3352 */
3353 if (ncp->nc_flag & NCF_UNRESOLVED)
3354 ++gd->gd_nchstats->ncs_miss;
3355 else if (ncp->nc_vp)
3356 ++gd->gd_nchstats->ncs_goodhits;
3357 else
3358 ++gd->gd_nchstats->ncs_neghits;
3359 nch.mount = mp;
3360 nch.ncp = ncp;
3536c341
MD
3361 _cache_mntref(nch.mount);
3362
668b43c5
MD
3363 return(nch);
3364failed:
3365 if (new_ncp) {
3366 _cache_free(new_ncp);
3367 new_ncp = NULL;
3368 }
3369 nch.mount = NULL;
3370 nch.ncp = NULL;
3371 return(nch);
3372}
3373
1d505369 3374/*
28623bf9 3375 * The namecache entry is marked as being used as a mount point.
07baed26
MD
3376 * Locate the mount if it is visible to the caller. The DragonFly
3377 * mount system allows arbitrary loops in the topology and disentangles
3378 * those loops by matching against (mp, ncp) rather than just (ncp).
3379 * This means any given ncp can dive any number of mounts, depending
3380 * on the relative mount (e.g. nullfs) the caller is at in the topology.
3381 *
3382 * We use a very simple frontend cache to reduce SMP conflicts,
3383 * which we have to do because the mountlist scan needs an exclusive
3384 * lock around its ripout info list. Not to mention that there might
3385 * be a lot of mounts.
1d505369 3386 */
28623bf9
MD
3387struct findmount_info {
3388 struct mount *result;
3389 struct mount *nch_mount;
3390 struct namecache *nch_ncp;
3391};
3392
a458ee25
MD
3393#define MNTCACHE_PRIME 66555444443333333ULL
3394
07baed26
MD
3395static
3396struct ncmount_cache *
3397ncmount_cache_lookup(struct mount *mp, struct namecache *ncp)
3398{
a458ee25
MD
3399 uintptr_t hash;
3400
3401 hash = (uintptr_t)mp + ((uintptr_t)mp >> 18);
3402 hash %= MNTCACHE_PRIME;
3403 hash ^= (uintptr_t)ncp + ((uintptr_t)ncp >> 18);
3404 hash %= MNTCACHE_PRIME;
3405 hash = hash % NCMOUNT_NUMCACHE;
07baed26 3406
07baed26
MD
3407 return (&ncmount_cache[hash]);
3408}
3409
28623bf9
MD
3410static
3411int
3412cache_findmount_callback(struct mount *mp, void *data)
1d505369 3413{
28623bf9 3414 struct findmount_info *info = data;
1d505369 3415
28623bf9
MD
3416 /*
3417 * Check the mount's mounted-on point against the passed nch.
3418 */
3419 if (mp->mnt_ncmounton.mount == info->nch_mount &&
3420 mp->mnt_ncmounton.ncp == info->nch_ncp
3421 ) {
3422 info->result = mp;
3536c341 3423 _cache_mntref(mp);
28623bf9 3424 return(-1);
1d505369 3425 }
28623bf9 3426 return(0);
1d505369
MD
3427}
3428
28623bf9
MD
3429struct mount *
3430cache_findmount(struct nchandle *nch)
9b1b3591 3431{
28623bf9 3432 struct findmount_info info;
07baed26
MD
3433 struct ncmount_cache *ncc;
3434 struct mount *mp;
3435
3436 /*
3437 * Fast
3438 */
7e9c94bd
MD
3439 if (ncmount_cache_enable == 0) {
3440 ncc = NULL;
3441 goto skip;
3442 }
07baed26
MD
3443 ncc = ncmount_cache_lookup(nch->mount, nch->ncp);
3444 if (ncc->ncp == nch->ncp) {
3445 spin_lock_shared(&ncc->spin);
9c105d5b
MD
3446 if (ncc->isneg == 0 &&
3447 ncc->ncp == nch->ncp && (mp = ncc->mp) != NULL) {
07baed26
MD
3448 if (mp->mnt_ncmounton.mount == nch->mount &&
3449 mp->mnt_ncmounton.ncp == nch->ncp) {
9c105d5b
MD
3450 /*
3451 * Cache hit (positive)
3452 */
3536c341 3453 _cache_mntref(mp);
07baed26
MD
3454 spin_unlock_shared(&ncc->spin);
3455 return(mp);
3456 }
9c105d5b
MD
3457 /* else cache miss */
3458 }
3459 if (ncc->isneg &&
3460 ncc->ncp == nch->ncp && ncc->mp == nch->mount) {
3461 /*
3462 * Cache hit (negative)
3463 */
3464 spin_unlock_shared(&ncc->spin);
9c105d5b 3465 return(NULL);
07baed26
MD
3466 }
3467 spin_unlock_shared(&ncc->spin);
3468 }
7e9c94bd 3469skip:
28623bf9 3470
07baed26
MD
3471 /*
3472 * Slow
3473 */
28623bf9
MD
3474 info.result = NULL;
3475 info.nch_mount = nch->mount;
3476 info.nch_ncp = nch->ncp;
3477 mountlist_scan(cache_findmount_callback, &info,
fb578eac 3478 MNTSCAN_FORWARD|MNTSCAN_NOBUSY);
07baed26 3479
9c105d5b
MD
3480 /*
3481 * Cache the result.
3482 *
3483 * Negative lookups: We cache the originating {ncp,mp}. (mp) is
3484 * only used for pointer comparisons and is not
3485 * referenced (otherwise there would be dangling
3486 * refs).
3487 *
3488 * Positive lookups: We cache the originating {ncp} and the target
3489 * (mp). (mp) is referenced.
3490 *
3491 * Indeterminate: If the match is undergoing an unmount we do
3492 * not cache it to avoid racing cache_unmounting(),
3493 * but still return the match.
3494 */
3495 if (ncc) {
07baed26 3496 spin_lock(&ncc->spin);
9c105d5b
MD
3497 if (info.result == NULL) {
3498 if (ncc->isneg == 0 && ncc->mp)
3536c341 3499 _cache_mntrel(ncc->mp);
9c105d5b
MD
3500 ncc->ncp = nch->ncp;
3501 ncc->mp = nch->mount;
3502 ncc->isneg = 1;
3503 spin_unlock(&ncc->spin);
9c105d5b
MD
3504 } else if ((info.result->mnt_kern_flag & MNTK_UNMOUNT) == 0) {
3505 if (ncc->isneg == 0 && ncc->mp)
3536c341
MD
3506 _cache_mntrel(ncc->mp);
3507 _cache_mntref(info.result);
07baed26 3508 ncc->ncp = nch->ncp;
9c105d5b
MD
3509 ncc->mp = info.result;
3510 ncc->isneg = 0;
07baed26
MD
3511 spin_unlock(&ncc->spin);
3512 } else {
3513 spin_unlock(&ncc->spin);
3514 }
3515 }