/*
 * Copyright (c) 2004,2013-2017 The DragonFly Project.  All rights reserved.
 *
 * This code is derived from software contributed to The DragonFly Project
 * by Matthew Dillon <dillon@backplane.com>
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 * 3. Neither the name of The DragonFly Project nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific, prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

/*
 * External lock/ref-related vnode functions
 *
 * vs_state transition locking requirements:
 *
 *	INACTIVE -> CACHED|DYING	vx_lock(excl) + vi->spin
 *	DYING    -> CACHED		vx_lock(excl)
 *	ACTIVE   -> INACTIVE		(none)       + v_spin + vi->spin
 *	INACTIVE -> ACTIVE		vn_lock(any) + v_spin + vi->spin
 *	CACHED   -> ACTIVE		vn_lock(any) + v_spin + vi->spin
 *
 * NOTE: Switching to/from ACTIVE/INACTIVE requires v_spin and vi->spin.
 *
 *	 Switching into ACTIVE also requires a vref and vnode lock, however
 *	 the vnode lock is allowed to be SHARED.
 *
 *	 Switching into a CACHED or DYING state requires an exclusive vnode
 *	 lock or vx_lock (which is almost the same thing).
 */
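
/*
 * Illustrative sketch (added, not part of the original source): per the
 * table above, a CACHED -> ACTIVE transition holds a vnode lock of any
 * type plus both spinlocks.  vget() below ends up doing effectively this
 * via _vactivate():
 *
 *	vn_lock(vp, LK_SHARED | LK_RETRY);	(any vnode lock type)
 *	spin_lock(&vp->v_spin);
 *	    (_vactivate() takes vi->spin, moves vp onto the per-cpu
 *	     active list, and sets vp->v_state = VS_ACTIVE)
 *	spin_unlock(&vp->v_spin);
 */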

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/malloc.h>
#include <sys/mount.h>
#include <sys/proc.h>
#include <sys/vnode.h>
#include <sys/spinlock2.h>
#include <sys/sysctl.h>

#include <machine/limits.h>

#include <vm/vm.h>
#include <vm/vm_object.h>

static void vnode_terminate(struct vnode *vp);

static MALLOC_DEFINE(M_VNODE, "vnodes", "vnode structures");

/*
 * The vnode free list holds inactive vnodes.  Aged inactive vnodes
 * are inserted prior to the mid point, and otherwise inserted
 * at the tail.
 *
 * The vnode code goes to great lengths to avoid moving vnodes between
 * lists, but sometimes it is unavoidable.  For this situation we try to
 * avoid lock contention but we do not try very hard to avoid cache line
 * congestion.  A modestly sized hash table is used.
 */
#define VLIST_PRIME2	123462047LU
#define VLIST_XOR	(uintptr_t)0xab4582fa8322fb71LLU

#define VLIST_HASH(vp)	(((uintptr_t)vp ^ VLIST_XOR) % \
			 VLIST_PRIME2 % (unsigned)ncpus)
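
/*
 * Added note (not in the original source): the double modulo in
 * VLIST_HASH() first spreads the XOR-scrambled vnode address across the
 * large prime VLIST_PRIME2 and only then folds the result onto ncpus.
 * Reducing a kmalloc()'d pointer directly with "% ncpus" would cluster
 * badly because such pointers share alignment and low-order bit patterns.
 */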

static struct vnode_index *vnode_list_hash;

int activevnodes = 0;
SYSCTL_INT(_debug, OID_AUTO, activevnodes, CTLFLAG_RD,
	&activevnodes, 0, "Number of active nodes");
int cachedvnodes = 0;
SYSCTL_INT(_debug, OID_AUTO, cachedvnodes, CTLFLAG_RD,
	&cachedvnodes, 0, "Number of total cached nodes");
int inactivevnodes = 0;
SYSCTL_INT(_debug, OID_AUTO, inactivevnodes, CTLFLAG_RD,
	&inactivevnodes, 0, "Number of inactive nodes");
static int batchfreevnodes = 5;
SYSCTL_INT(_debug, OID_AUTO, batchfreevnodes, CTLFLAG_RW,
	&batchfreevnodes, 0, "Number of vnodes to free at once");
#ifdef TRACKVNODE
static u_long trackvnode;
SYSCTL_ULONG(_debug, OID_AUTO, trackvnode, CTLFLAG_RW,
	&trackvnode, 0, "");
#endif
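
/*
 * Added usage note (not in the original source): the counters above are
 * visible from userland via sysctl(8), e.g. "sysctl debug.activevnodes",
 * and batchfreevnodes can be tuned the same way since it is CTLFLAG_RW.
 */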

/*
 * Called from vfsinit()
 */
void
vfs_lock_init(void)
{
	int i;

	kmalloc_raise_limit(M_VNODE, 0);	/* unlimited */
	vnode_list_hash = kmalloc(sizeof(*vnode_list_hash) * ncpus,
				  M_VNODE, M_ZERO | M_WAITOK);
	for (i = 0; i < ncpus; ++i) {
		struct vnode_index *vi = &vnode_list_hash[i];

		TAILQ_INIT(&vi->inactive_list);
		TAILQ_INIT(&vi->active_list);
		TAILQ_INSERT_TAIL(&vi->active_list, &vi->active_rover, v_list);
		spin_init(&vi->spin, "vfslock");
	}
}

/*
 * Misc functions
 */
static __inline
void
_vsetflags(struct vnode *vp, int flags)
{
	atomic_set_int(&vp->v_flag, flags);
}

static __inline
void
_vclrflags(struct vnode *vp, int flags)
{
	atomic_clear_int(&vp->v_flag, flags);
}

void
vsetflags(struct vnode *vp, int flags)
{
	_vsetflags(vp, flags);
}

void
vclrflags(struct vnode *vp, int flags)
{
	_vclrflags(vp, flags);
}

/*
 * Place the vnode on the active list.
 *
 * Caller must hold vp->v_spin
 */
static __inline
void
_vactivate(struct vnode *vp)
{
	struct vnode_index *vi = &vnode_list_hash[VLIST_HASH(vp)];

#ifdef TRACKVNODE
	if ((u_long)vp == trackvnode)
		kprintf("_vactivate %p %08x\n", vp, vp->v_flag);
#endif
	spin_lock(&vi->spin);

	switch(vp->v_state) {
	case VS_ACTIVE:
		spin_unlock(&vi->spin);
		panic("_vactivate: already active");
		/* NOT REACHED */
		return;
	case VS_INACTIVE:
		TAILQ_REMOVE(&vi->inactive_list, vp, v_list);
		atomic_add_int(&mycpu->gd_inactivevnodes, -1);
		break;
	case VS_CACHED:
	case VS_DYING:
		break;
	}
	TAILQ_INSERT_TAIL(&vi->active_list, vp, v_list);
	vp->v_state = VS_ACTIVE;
	spin_unlock(&vi->spin);
	atomic_add_int(&mycpu->gd_activevnodes, 1);
}

/*
 * Put a vnode on the inactive list.
 *
 * Caller must hold v_spin
 */
static __inline
void
_vinactive(struct vnode *vp)
{
	struct vnode_index *vi = &vnode_list_hash[VLIST_HASH(vp)];

#ifdef TRACKVNODE
	if ((u_long)vp == trackvnode) {
		kprintf("_vinactive %p %08x\n", vp, vp->v_flag);
		print_backtrace(-1);
	}
#endif
	spin_lock(&vi->spin);

	/*
	 * Remove from active list if it is sitting on it
	 */
	switch(vp->v_state) {
	case VS_ACTIVE:
		TAILQ_REMOVE(&vi->active_list, vp, v_list);
		atomic_add_int(&mycpu->gd_activevnodes, -1);
		break;
	case VS_INACTIVE:
		spin_unlock(&vi->spin);
		panic("_vinactive: already inactive");
		/* NOT REACHED */
		return;
	case VS_CACHED:
	case VS_DYING:
		break;
	}

	/*
	 * Distinguish between basically dead vnodes, vnodes with cached
	 * data, and vnodes without cached data.  A rover will shift the
	 * vnodes around as their cache status is lost.
	 */
	if (vp->v_flag & VRECLAIMED) {
		TAILQ_INSERT_HEAD(&vi->inactive_list, vp, v_list);
	} else {
		TAILQ_INSERT_TAIL(&vi->inactive_list, vp, v_list);
	}
	vp->v_state = VS_INACTIVE;
	spin_unlock(&vi->spin);
	atomic_add_int(&mycpu->gd_inactivevnodes, 1);
}

static __inline
void
_vinactive_tail(struct vnode *vp)
{
	struct vnode_index *vi = &vnode_list_hash[VLIST_HASH(vp)];

	spin_lock(&vi->spin);

	/*
	 * Remove from active list if it is sitting on it
	 */
	switch(vp->v_state) {
	case VS_ACTIVE:
		TAILQ_REMOVE(&vi->active_list, vp, v_list);
		atomic_add_int(&mycpu->gd_activevnodes, -1);
		break;
	case VS_INACTIVE:
		spin_unlock(&vi->spin);
		panic("_vinactive_tail: already inactive");
		/* NOT REACHED */
		return;
	case VS_CACHED:
	case VS_DYING:
		break;
	}

	TAILQ_INSERT_TAIL(&vi->inactive_list, vp, v_list);
	vp->v_state = VS_INACTIVE;
	spin_unlock(&vi->spin);
	atomic_add_int(&mycpu->gd_inactivevnodes, 1);
}

/*
 * Add a ref to an active vnode.  This function should never be called
 * with an inactive vnode (use vget() instead), but might be called
 * from within other parts of the deactivation/reactivation path.
 */
void
vref(struct vnode *vp)
{
	KASSERT((VREFCNT(vp) > 0 && vp->v_state != VS_INACTIVE),
		("vref: bad refcnt %08x %d", vp->v_refcnt, vp->v_state));
	atomic_add_int(&vp->v_refcnt, 1);
}
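
/*
 * Illustrative sketch (added, not part of the original source): a caller
 * that already guarantees the vnode is active can hand an extra ref to a
 * consumer; my_consume_ref() is a hypothetical function that is expected
 * to vrele() the vnode when it is done:
 *
 *	vref(vp);		(vp known to be active here)
 *	my_consume_ref(vp);	(eventually performs vrele(vp))
 */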

void
synchronizevnodecount(void)
{
	int nca = 0;
	int act = 0;
	int ina = 0;
	int i;

	for (i = 0; i < ncpus; ++i) {
		globaldata_t gd = globaldata_find(i);
		nca += gd->gd_cachedvnodes;
		act += gd->gd_activevnodes;
		ina += gd->gd_inactivevnodes;
	}
	cachedvnodes = nca;
	activevnodes = act;
	inactivevnodes = ina;
}

/*
 * Count number of cached vnodes.  This is middling expensive so be
 * careful not to make this call in the critical path.  Each cpu tracks
 * its own accumulator.  The individual accumulators must be summed
 * together to get an accurate value.
 */
int
countcachedvnodes(void)
{
	int i;
	int n = 0;

	for (i = 0; i < ncpus; ++i) {
		globaldata_t gd = globaldata_find(i);
		n += gd->gd_cachedvnodes;
	}
	return n;
}

int
countcachedandinactivevnodes(void)
{
	int i;
	int n = 0;

	for (i = 0; i < ncpus; ++i) {
		globaldata_t gd = globaldata_find(i);
		n += gd->gd_cachedvnodes + gd->gd_inactivevnodes;
	}
	return n;
}

/*
 * Release a ref on an active or inactive vnode.
 *
 * Caller has no other requirements.
 *
 * If VREF_FINALIZE is set this will deactivate the vnode on the 1->0
 * transition, otherwise we leave the vnode in the active list and
 * do a lockless transition to 0, which is very important for the
 * critical path.
 *
 * (vrele() is not called when a vnode is being destroyed w/kfree)
 */
void
vrele(struct vnode *vp)
{
	for (;;) {
		int count = vp->v_refcnt;
		cpu_ccfence();
		KKASSERT((count & VREF_MASK) > 0);
		KKASSERT(vp->v_state == VS_ACTIVE ||
			 vp->v_state == VS_INACTIVE);

		/*
		 * 2+ case
		 */
		if ((count & VREF_MASK) > 1) {
			if (atomic_cmpset_int(&vp->v_refcnt, count, count - 1))
				break;
			continue;
		}

		/*
		 * 1->0 transition case must handle possible finalization.
		 * When finalizing we transition 1->0x40000000.  Note that
		 * cachedvnodes is only adjusted on transitions to ->0.
		 *
		 * WARNING! VREF_TERMINATE can be cleared at any point
		 *	    when the refcnt is non-zero (by vget()) and
		 *	    the vnode has not been reclaimed.  Thus
		 *	    transitions out of VREF_TERMINATE do not have
		 *	    to mess with cachedvnodes.
		 */
		if (count & VREF_FINALIZE) {
			vx_lock(vp);
			if (atomic_cmpset_int(&vp->v_refcnt,
					      count, VREF_TERMINATE)) {
				vnode_terminate(vp);
				break;
			}
			vx_unlock(vp);
		} else {
			if (atomic_cmpset_int(&vp->v_refcnt, count, 0)) {
				atomic_add_int(&mycpu->gd_cachedvnodes, 1);
				break;
			}
		}
		/* retry */
	}
}

/*
 * Add an auxiliary data structure reference to the vnode.  Auxiliary
 * references do not change the state of the vnode or prevent deactivation
 * or reclamation of the vnode, but will prevent the vnode from being
 * destroyed (kfree()'d).
 *
 * WARNING!  vhold() must not acquire v_spin.  The spinlock may or may not
 *	     already be held by the caller.  vdrop() will clean up the
 *	     free list state.
 */
void
vhold(struct vnode *vp)
{
	atomic_add_int(&vp->v_auxrefs, 1);
}

/*
 * Remove an auxiliary reference from the vnode.
 */
void
vdrop(struct vnode *vp)
{
	atomic_add_int(&vp->v_auxrefs, -1);
}
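
/*
 * Illustrative sketch (added, not part of the original source): an
 * auxiliary structure caching a vnode pointer without a real ref might
 * bracket it with vhold()/vdrop().  The vnode can still be deactivated
 * and even reclaimed while held this way; it merely cannot be kfree()'d
 * out from under the pointer (ms_vp is a hypothetical field):
 *
 *	vhold(vp);
 *	ms->ms_vp = vp;
 *	...
 *	ms->ms_vp = NULL;
 *	vdrop(vp);
 */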

/*
 * This function is called on the 1->0 transition (which is actually
 * 1->VREF_TERMINATE) when VREF_FINALIZE is set, forcing deactivation
 * of the vnode.
 *
 * Additional vrefs are allowed to race but will not result in a reentrant
 * call to vnode_terminate() due to refcnt being VREF_TERMINATE.  This
 * prevents additional 1->0 transitions.
 *
 * ONLY A VGET() CAN REACTIVATE THE VNODE.
 *
 * Caller must hold the VX lock.
 *
 * NOTE: v_mount may be NULL due to assignment to dead_vnode_vops
 *
 * NOTE: The vnode may be marked inactive with dirty buffers
 *	 or dirty pages in its cached VM object still present.
 *
 * NOTE: VS_FREE should not be set on entry (the vnode was expected to
 *	 previously be active).  We lose control of the vnode the instant
 *	 it is placed on the free list.
 *
 *	 The VX lock is required when transitioning to VS_CACHED but is
 *	 not sufficient for the vshouldfree() interlocked test or when
 *	 transitioning away from VS_CACHED.  v_spin is also required for
 *	 those cases.
 */
static
void
vnode_terminate(struct vnode *vp)
{
	KKASSERT(vp->v_state == VS_ACTIVE);

	if ((vp->v_flag & VINACTIVE) == 0) {
		_vsetflags(vp, VINACTIVE);
		if (vp->v_mount)
			VOP_INACTIVE(vp);
	}
	spin_lock(&vp->v_spin);
	_vinactive(vp);
	spin_unlock(&vp->v_spin);

	vx_unlock(vp);
}

/****************************************************************
 *			VX LOCKING FUNCTIONS			*
 ****************************************************************
 *
 * These functions lock vnodes for reclamation and deactivation related
 * activities.  The caller must already be holding some sort of reference
 * on the vnode.
 */
void
vx_lock(struct vnode *vp)
{
	lockmgr(&vp->v_lock, LK_EXCLUSIVE);
}

void
vx_unlock(struct vnode *vp)
{
	lockmgr(&vp->v_lock, LK_RELEASE);
}
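
/*
 * Illustrative sketch (added, not part of the original source):
 * deactivation/reclamation code that already holds some form of ref
 * brackets the state transition with the VX lock:
 *
 *	vx_lock(vp);
 *	... transition vp to VS_CACHED or VS_DYING ...
 *	vx_unlock(vp);
 */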

/****************************************************************
 *		VNODE ACQUISITION FUNCTIONS			*
 ****************************************************************
 *
 * These functions must be used when accessing a vnode that has no
 * chance of being destroyed in a SMP race.  That means the caller will
 * usually either hold an auxiliary reference (such as the namecache)
 * or hold some other lock that ensures that the vnode cannot be destroyed.
 *
 * These functions are MANDATORY for any code chain accessing a vnode
 * whose activation state is not known.
 *
 * vget() can be called with LK_NOWAIT and will return EBUSY if the
 * lock cannot be immediately acquired.
 *
 * vget()/vput() are used when reactivation is desired.
 *
 * vx_get() and vx_put() are used when reactivation is not desired.
 */
int
vget(struct vnode *vp, int flags)
{
	int error;

	/*
	 * A lock type must be passed
	 */
	if ((flags & LK_TYPE_MASK) == 0) {
		panic("vget() called with no lock specified!");
		/* NOT REACHED */
	}

	/*
	 * Reference the structure and then acquire the lock.
	 *
	 * NOTE: The requested lock might be a shared lock and does
	 *	 not protect our access to the refcnt or other fields.
	 */
	if ((atomic_fetchadd_int(&vp->v_refcnt, 1) & VREF_MASK) == 0)
		atomic_add_int(&mycpu->gd_cachedvnodes, -1);

	if ((error = vn_lock(vp, flags | LK_FAILRECLAIM)) != 0) {
		/*
		 * The lock failed, undo and return an error.  This will not
		 * normally trigger a termination.
		 */
		vrele(vp);
	} else if (vp->v_flag & VRECLAIMED) {
		/*
		 * The node is being reclaimed and cannot be reactivated
		 * any more, undo and return ENOENT.
		 */
		vn_unlock(vp);
		vrele(vp);
		error = ENOENT;
	} else if (vp->v_state == VS_ACTIVE) {
		/*
		 * A VS_ACTIVE vnode coupled with the fact that we have
		 * a vnode lock (even if shared) prevents v_state from
		 * changing.  Since the vnode is not in a VRECLAIMED state,
		 * we can safely clear VINACTIVE.
		 *
		 * It is possible for a shared lock to cause a race with
		 * another thread that is also in the process of clearing
		 * VREF_TERMINATE, meaning that we might return with it still
		 * set and then assert in a later vref().  The solution is to
		 * unconditionally clear VREF_TERMINATE here as well.
		 *
		 * NOTE! Multiple threads may clear VINACTIVE if this is
		 *	 shared lock.  This race is allowed.
		 */
		_vclrflags(vp, VINACTIVE);	/* SMP race ok */
		vp->v_act += VACT_INC;
		if (vp->v_act > VACT_MAX)	/* SMP race ok */
			vp->v_act = VACT_MAX;
		atomic_clear_int(&vp->v_refcnt, VREF_TERMINATE);
	} else {
		/*
		 * If the vnode is not VS_ACTIVE it must be reactivated
		 * in addition to clearing VINACTIVE.  An exclusive spin_lock
		 * is needed to manipulate the vnode's list.
		 *
		 * Because the lockmgr lock might be shared, we might race
		 * another reactivation, which we handle.  In this situation,
		 * however, the refcnt prevents other v_state races.
		 *
		 * As with above, clearing VINACTIVE is allowed to race other
		 * clearings of VINACTIVE.
		 *
		 * VREF_TERMINATE and VREF_FINALIZE can only be cleared when
		 * the refcnt is non-zero and the vnode has not been
		 * reclaimed.  This also means that the transitions do
		 * not affect cachedvnodes.
		 *
		 * It is possible for a shared lock to cause a race with
		 * another thread that is also in the process of clearing
		 * VREF_TERMINATE, meaning that we might return with it still
		 * set and then assert in a later vref().  The solution is to
		 * unconditionally clear VREF_TERMINATE here as well.
		 */
		_vclrflags(vp, VINACTIVE);
		vp->v_act += VACT_INC;
		if (vp->v_act > VACT_MAX)	/* SMP race ok */
			vp->v_act = VACT_MAX;
		spin_lock(&vp->v_spin);

		switch(vp->v_state) {
		case VS_INACTIVE:
			_vactivate(vp);
			atomic_clear_int(&vp->v_refcnt, VREF_TERMINATE |
							VREF_FINALIZE);
			spin_unlock(&vp->v_spin);
			break;
		case VS_CACHED:
			_vactivate(vp);
			atomic_clear_int(&vp->v_refcnt, VREF_TERMINATE |
							VREF_FINALIZE);
			spin_unlock(&vp->v_spin);
			break;
		case VS_ACTIVE:
			atomic_clear_int(&vp->v_refcnt, VREF_FINALIZE |
							VREF_TERMINATE);
			spin_unlock(&vp->v_spin);
			break;
		case VS_DYING:
			spin_unlock(&vp->v_spin);
			panic("Impossible VS_DYING state");
			break;
		}
	}
	return error;
}
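
/*
 * Illustrative sketch (added, not part of the original source): a caller
 * holding an auxiliary reference (e.g. via the namecache) reactivates and
 * locks the vnode with vget(), then drops both the lock and the ref with
 * vput():
 *
 *	error = vget(vp, LK_SHARED);
 *	if (error == 0) {
 *		... use vp ...
 *		vput(vp);
 *	}
 */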

#ifdef DEBUG_VPUT

void
debug_vput(struct vnode *vp, const char *filename, int line)
{
	kprintf("vput(%p) %s:%d\n", vp, filename, line);
	vn_unlock(vp);
	vrele(vp);
}

#else

void
vput(struct vnode *vp)
{
	vn_unlock(vp);
	vrele(vp);
}

#endif

/*
 * Acquire the vnode lock unguarded.
 *
 * The non-blocking version also uses a slightly different mechanic.
 * This function will explicitly fail not only if it cannot acquire
 * the lock normally, but also if the caller already holds a lock.
 *
 * The adjusted mechanic is used to close a loophole where complex
 * VOP_RECLAIM code can circle around recursively and allocate the
 * same vnode it is trying to destroy from the freelist.
 *
 * Any filesystem (aka UFS) which puts LK_CANRECURSE in lk_flags can
 * cause the incorrect behavior to occur.  If not for that lockmgr()
 * would do the right thing.
 *
 * XXX The vx_*() locks should use auxrefs, not the main reference counter.
 */
void
vx_get(struct vnode *vp)
{
	if ((atomic_fetchadd_int(&vp->v_refcnt, 1) & VREF_MASK) == 0)
		atomic_add_int(&mycpu->gd_cachedvnodes, -1);
	lockmgr(&vp->v_lock, LK_EXCLUSIVE);
}

int
vx_get_nonblock(struct vnode *vp)
{
	int error;

	if (lockinuse(&vp->v_lock))
		return EBUSY;
	error = lockmgr(&vp->v_lock, LK_EXCLUSIVE | LK_NOWAIT);
	if (error == 0) {
		if ((atomic_fetchadd_int(&vp->v_refcnt, 1) & VREF_MASK) == 0)
			atomic_add_int(&mycpu->gd_cachedvnodes, -1);
	}
	return error;
}
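
/*
 * Illustrative sketch (added, not part of the original source): scanning
 * code uses the non-blocking form and skips contended vnodes rather than
 * stalling; on success the vnode is both VX locked and ref'd, so vx_put()
 * is the correct release:
 *
 *	if (vx_get_nonblock(vp) == 0) {
 *		... inspect or recycle vp ...
 *		vx_put(vp);
 *	}
 */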

/*
 * Release a VX lock that also held a ref on the vnode.  vrele() will handle
 * any needed state transitions.
 *
 * However, filesystems use this function to get rid of unwanted new vnodes
 * so try to get the vnode on the correct queue in that case.
 */
void
vx_put(struct vnode *vp)
{
	if (vp->v_type == VNON || vp->v_type == VBAD)
		atomic_set_int(&vp->v_refcnt, VREF_FINALIZE);
	lockmgr(&vp->v_lock, LK_RELEASE);
	vrele(vp);
}
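
/*
 * Illustrative sketch (added, not part of the original source): a
 * filesystem that obtained a fresh vnode but failed to initialize it
 * leaves v_type as VNON (or sets VBAD), so the vx_put() above flags the
 * vnode for finalization:
 *
 *	vp = allocvnode(0, 0);
 *	if (my_fs_init(vp) != 0) {	(hypothetical helper)
 *		vp->v_type = VBAD;
 *		vx_put(vp);
 *	}
 */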

/*
 * Try to reuse a vnode from the free list.  This function is somewhat
 * advisory in that NULL can be returned as a normal case, even if free
 * vnodes are present.
 *
 * The scan is limited because it can result in excessive CPU use during
 * periods of extreme vnode use.
 *
 * NOTE: The returned vnode is not completely initialized.
 */
static
struct vnode *
cleanfreevnode(int maxcount)
{
	struct vnode_index *vi;
	struct vnode *vp;
	int count;
	int trigger = (long)vmstats.v_page_count / (activevnodes * 2 + 1);
	int ri;
	int cpu_count;

	/*
	 * Try to deactivate some vnodes cached on the active list.
	 */
	if (countcachedvnodes() < inactivevnodes)
		goto skip;

	ri = vnode_list_hash[mycpu->gd_cpuid].deac_rover + 1;

	for (count = 0; count < maxcount * 2; ++count, ++ri) {
		vi = &vnode_list_hash[((unsigned)ri >> 4) % ncpus];

		spin_lock(&vi->spin);

		vp = TAILQ_NEXT(&vi->active_rover, v_list);
		TAILQ_REMOVE(&vi->active_list, &vi->active_rover, v_list);
		if (vp == NULL) {
			TAILQ_INSERT_HEAD(&vi->active_list,
					  &vi->active_rover, v_list);
		} else {
			TAILQ_INSERT_AFTER(&vi->active_list, vp,
					   &vi->active_rover, v_list);
		}
		if (vp == NULL) {
			spin_unlock(&vi->spin);
			continue;
		}
		if ((vp->v_refcnt & VREF_MASK) != 0) {
			spin_unlock(&vi->spin);
			vp->v_act += VACT_INC;
			if (vp->v_act > VACT_MAX)	/* SMP race ok */
				vp->v_act = VACT_MAX;
			continue;
		}

		/*
		 * decrement by less if the vnode's object has a lot of
		 * VM pages.  XXX possible SMP races.
		 */
		if (vp->v_act > 0) {
			vm_object_t obj;

			if ((obj = vp->v_object) != NULL &&
			    obj->resident_page_count >= trigger) {
				vp->v_act -= 1;
			} else {
				vp->v_act -= VACT_INC;
			}
			if (vp->v_act < 0)
				vp->v_act = 0;
			spin_unlock(&vi->spin);
			continue;
		}

		/*
		 * Try to deactivate the vnode.
		 */
		if ((atomic_fetchadd_int(&vp->v_refcnt, 1) & VREF_MASK) == 0)
			atomic_add_int(&mycpu->gd_cachedvnodes, -1);
		atomic_set_int(&vp->v_refcnt, VREF_FINALIZE);

		spin_unlock(&vi->spin);
		vrele(vp);
	}

	vnode_list_hash[mycpu->gd_cpuid].deac_rover = ri;

skip:
	/*
	 * Loop trying to lock the first vnode on the free list.
	 * Cycle if we can't.
	 */
	cpu_count = ncpus;
	ri = vnode_list_hash[mycpu->gd_cpuid].free_rover + 1;

	for (count = 0; count < maxcount; ++count, ++ri) {
		vi = &vnode_list_hash[((unsigned)ri >> 4) % ncpus];

		spin_lock(&vi->spin);

		vp = TAILQ_FIRST(&vi->inactive_list);
		if (vp == NULL) {
			spin_unlock(&vi->spin);
			if (--cpu_count == 0)
				break;
			ri = (ri + 16) & ~15;
			--ri;
			continue;
		}

		/*
		 * non-blocking vx_get will also ref the vnode on success.
		 */
		if (vx_get_nonblock(vp)) {
			KKASSERT(vp->v_state == VS_INACTIVE);
			TAILQ_REMOVE(&vi->inactive_list, vp, v_list);
			TAILQ_INSERT_TAIL(&vi->inactive_list, vp, v_list);
			spin_unlock(&vi->spin);
			continue;
		}

		/*
		 * Because we are holding vfs_spin the vnode should currently
		 * be inactive and VREF_TERMINATE should still be set.
		 *
		 * Once vfs_spin is released the vnode's state should remain
		 * unmodified due to both the lock and ref on it.
		 */
		KKASSERT(vp->v_state == VS_INACTIVE);
		spin_unlock(&vi->spin);
#ifdef TRACKVNODE
		if ((u_long)vp == trackvnode)
			kprintf("cleanfreevnode %p %08x\n", vp, vp->v_flag);
#endif

		/*
		 * Do not reclaim/reuse a vnode while auxiliary refs exist.
		 * This includes namecache refs due to a related ncp being
		 * locked or having children, a VM object association, or
		 * other hold users.
		 *
		 * Do not reclaim/reuse a vnode if someone else has a real
		 * ref on it.  This can occur if a filesystem temporarily
		 * releases the vnode lock during VOP_RECLAIM.
		 */
		if (vp->v_auxrefs ||
		    (vp->v_refcnt & ~VREF_FINALIZE) != VREF_TERMINATE + 1) {
failed:
			if (vp->v_state == VS_INACTIVE) {
				spin_lock(&vi->spin);
				if (vp->v_state == VS_INACTIVE) {
					TAILQ_REMOVE(&vi->inactive_list,
						     vp, v_list);
					TAILQ_INSERT_TAIL(&vi->inactive_list,
							  vp, v_list);
				}
				spin_unlock(&vi->spin);
			}
			vx_put(vp);
			continue;
		}

		/*
		 * VINACTIVE and VREF_TERMINATE are expected to both be set
		 * for vnodes pulled from the inactive list, and cannot be
		 * changed while we hold the vx lock.
		 *
		 * Try to reclaim the vnode.
		 */
		KKASSERT(vp->v_flag & VINACTIVE);
		KKASSERT(vp->v_refcnt & VREF_TERMINATE);

		if ((vp->v_flag & VRECLAIMED) == 0) {
			if (cache_inval_vp_nonblock(vp))
				goto failed;
			vgone_vxlocked(vp);
			/* vnode is still VX locked */
		}

		/*
		 * At this point if there are no other refs or auxrefs on
		 * the vnode with the inactive list locked, and we remove
		 * the vnode from the inactive list, it should not be
		 * possible for anyone else to access the vnode any more.
		 *
		 * Since the vnode is in a VRECLAIMED state, no new
		 * namecache associations could have been made and the
		 * vnode should have already been removed from its mountlist.
		 *
		 * Since we hold a VX lock on the vnode it cannot have been
		 * reactivated (moved out of the inactive list).
		 */
		KKASSERT(TAILQ_EMPTY(&vp->v_namecache));
		spin_lock(&vi->spin);
		if (vp->v_auxrefs ||
		    (vp->v_refcnt & ~VREF_FINALIZE) != VREF_TERMINATE + 1) {
			spin_unlock(&vi->spin);
			goto failed;
		}
		KKASSERT(vp->v_state == VS_INACTIVE);
		TAILQ_REMOVE(&vi->inactive_list, vp, v_list);
		atomic_add_int(&mycpu->gd_inactivevnodes, -1);
		vp->v_state = VS_DYING;
		spin_unlock(&vi->spin);

		/*
		 * Nothing should have been able to access this vp.  Only
		 * our ref should remain now.
		 */
		atomic_clear_int(&vp->v_refcnt, VREF_TERMINATE|VREF_FINALIZE);
		KASSERT(vp->v_refcnt == 1,
			("vp %p badrefs %08x", vp, vp->v_refcnt));

		/*
		 * Return a VX locked vnode suitable for reuse.
		 */
		vnode_list_hash[mycpu->gd_cpuid].free_rover = ri;
		return vp;
	}
	vnode_list_hash[mycpu->gd_cpuid].free_rover = ri;
	return NULL;
}

/*
 * Obtain a new vnode.  The returned vnode is VX locked & vrefd.
 *
 * All new vnodes set the VAGE flags.  An open() of the vnode will
 * decrement the (2-bit) flags.  Vnodes which are opened several times
 * are thus retained in the cache over vnodes which are merely stat()d.
 *
 * We attempt to reuse an already-recycled vnode from our pcpu inactive
 * queue first, and allocate otherwise.  Attempting to recycle inactive
 * vnodes here can lead to numerous deadlocks, particularly with
 * softupdates.
 */
struct vnode *
allocvnode(int lktimeout, int lkflags)
{
	struct vnode *vp;
	struct vnode_index *vi;

	/*
	 * lktimeout only applies when LK_TIMELOCK is used, and only
	 * the pageout daemon uses it.  The timeout may not be zero
	 * or the pageout daemon can deadlock in low-VM situations.
	 */
	if (lktimeout == 0)
		lktimeout = hz / 10;

	/*
	 * Do not flag for synchronous recyclement unless there are enough
	 * freeable vnodes to recycle and the number of vnodes has
	 * significantly exceeded our target.  We want the normal vnlru
	 * process to handle the cleaning (at 9/10's) before we are forced
	 * to flag it here at 11/10's for userexit path processing.
	 */
	if (numvnodes >= maxvnodes * 11 / 10 &&
	    cachedvnodes + inactivevnodes >= maxvnodes * 5 / 10) {
		struct thread *td = curthread;
		if (td->td_lwp)
			atomic_set_int(&td->td_lwp->lwp_mpflags, LWP_MP_VNLRU);
	}

	/*
	 * Try to trivially reuse a reclaimed vnode from the head of the
	 * inactive list for this cpu.  Any vnode cycling which occurs
	 * which terminates the vnode will cause it to be returned to the
	 * same pcpu structure (e.g. unlink calls).
	 */
	vi = &vnode_list_hash[mycpuid];
	spin_lock(&vi->spin);

	vp = TAILQ_FIRST(&vi->inactive_list);
	if (vp && (vp->v_flag & VRECLAIMED)) {
		/*
		 * non-blocking vx_get will also ref the vnode on success.
		 */
		if (vx_get_nonblock(vp)) {
			KKASSERT(vp->v_state == VS_INACTIVE);
			TAILQ_REMOVE(&vi->inactive_list, vp, v_list);
			TAILQ_INSERT_TAIL(&vi->inactive_list, vp, v_list);
			spin_unlock(&vi->spin);
			goto slower;
		}

		/*
		 * Because we are holding vfs_spin the vnode should currently
		 * be inactive and VREF_TERMINATE should still be set.
		 *
		 * Once vfs_spin is released the vnode's state should remain
		 * unmodified due to both the lock and ref on it.
		 */
		KKASSERT(vp->v_state == VS_INACTIVE);
#ifdef TRACKVNODE
		if ((u_long)vp == trackvnode)
			kprintf("allocvnode %p %08x\n", vp, vp->v_flag);
#endif

		/*
		 * Do not reclaim/reuse a vnode while auxiliary refs exist.
		 * This includes namecache refs due to a related ncp being
		 * locked or having children, a VM object association, or
		 * other hold users.
		 *
		 * Do not reclaim/reuse a vnode if someone else has a real
		 * ref on it.  This can occur if a filesystem temporarily
		 * releases the vnode lock during VOP_RECLAIM.
		 */
		if (vp->v_auxrefs ||
		    (vp->v_refcnt & ~VREF_FINALIZE) != VREF_TERMINATE + 1) {
			if (vp->v_state == VS_INACTIVE) {
				TAILQ_REMOVE(&vi->inactive_list,
					     vp, v_list);
				TAILQ_INSERT_TAIL(&vi->inactive_list,
						  vp, v_list);
			}
			spin_unlock(&vi->spin);
			vx_put(vp);
			goto slower;
		}

		/*
		 * VINACTIVE and VREF_TERMINATE are expected to both be set
		 * for vnodes pulled from the inactive list, and cannot be
		 * changed while we hold the vx lock.
		 *
		 * Try to reclaim the vnode.
		 */
		KKASSERT(vp->v_flag & VINACTIVE);
		KKASSERT(vp->v_refcnt & VREF_TERMINATE);

		if ((vp->v_flag & VRECLAIMED) == 0) {
			spin_unlock(&vi->spin);
			vx_put(vp);
			goto slower;
		}

		/*
		 * At this point if there are no other refs or auxrefs on
		 * the vnode with the inactive list locked, and we remove
		 * the vnode from the inactive list, it should not be
		 * possible for anyone else to access the vnode any more.
		 *
		 * Since the vnode is in a VRECLAIMED state, no new
		 * namecache associations could have been made and the
		 * vnode should have already been removed from its mountlist.
		 *
		 * Since we hold a VX lock on the vnode it cannot have been
		 * reactivated (moved out of the inactive list).
		 */
		KKASSERT(TAILQ_EMPTY(&vp->v_namecache));
		KKASSERT(vp->v_state == VS_INACTIVE);
		TAILQ_REMOVE(&vi->inactive_list, vp, v_list);
		atomic_add_int(&mycpu->gd_inactivevnodes, -1);
		vp->v_state = VS_DYING;
		spin_unlock(&vi->spin);

		/*
		 * Nothing should have been able to access this vp.  Only
		 * our ref should remain now.
		 *
		 * At this point we can kfree() the vnode if we want to.
		 * Instead, we reuse it for the allocation.
		 */
		atomic_clear_int(&vp->v_refcnt, VREF_TERMINATE|VREF_FINALIZE);
		KASSERT(vp->v_refcnt == 1,
			("vp %p badrefs %08x", vp, vp->v_refcnt));
		bzero(vp, sizeof(*vp));
	} else {
		spin_unlock(&vi->spin);
slower:
		vp = kmalloc(sizeof(*vp), M_VNODE, M_ZERO | M_WAITOK);
		atomic_add_int(&numvnodes, 1);
	}

	lwkt_token_init(&vp->v_token, "vnode");
	lockinit(&vp->v_lock, "vnode", lktimeout, lkflags);
	TAILQ_INIT(&vp->v_namecache);
	RB_INIT(&vp->v_rbclean_tree);
	RB_INIT(&vp->v_rbdirty_tree);
	RB_INIT(&vp->v_rbhash_tree);
	spin_init(&vp->v_spin, "allocvnode");

	lockmgr(&vp->v_lock, LK_EXCLUSIVE);
	vp->v_refcnt = 1;
	vp->v_flag = VAGE0 | VAGE1;
	vp->v_pbuf_count = nswbuf_kva / NSWBUF_SPLIT;

	KKASSERT(TAILQ_EMPTY(&vp->v_namecache));
	/* exclusive lock still held */

	vp->v_filesize = NOOFFSET;
	vp->v_type = VNON;
	vp->v_tag = 0;
	vp->v_state = VS_CACHED;
	_vactivate(vp);

	return vp;
}
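
/*
 * Illustrative sketch (added, not part of the original source): a
 * filesystem get-node path might consume allocvnode() roughly as follows;
 * in practice this is normally reached through higher-level helpers.
 * ip is a hypothetical fs-private inode:
 *
 *	vp = allocvnode(0, 0);
 *	vp->v_type = VREG;
 *	vp->v_data = ip;
 *	... vp is returned VX locked and ref'd; use vx_put(vp) to
 *	    discard it if initialization fails ...
 */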

/*
 * Called after a process has allocated a vnode via allocvnode()
 * and we detected that too many vnodes were present.
 *
 * This function is called just prior to a return to userland if the
 * process at some point had to allocate a new vnode during the last
 * system call and the vnode count was found to be excessive.
 *
 * This is a synchronous path that we do not normally want to execute.
 *
 * Flagged at >= 11/10's, runs if >= 10/10, vnlru runs at 9/10.
 *
 * WARNING: Sometimes numvnodes can blow out due to children being
 *	    present under directory vnodes in the namecache.  For the
 *	    moment use an if() instead of a while() and note that if
 *	    we were to use a while() we would still have to break out
 *	    if freesomevnodes() returned 0.  vnlru will also be trying
 *	    hard to free vnodes at the same time (with a lower trigger
 *	    pointer).
 */
void
allocvnode_gc(void)
{
	if (numvnodes >= maxvnodes &&
	    countcachedandinactivevnodes() >= maxvnodes * 5 / 10) {
		freesomevnodes(batchfreevnodes);
	}
}

int
freesomevnodes(int n)
{
	struct vnode *vp;
	int count = 0;

	while (n) {
		if ((vp = cleanfreevnode(n)) == NULL)
			break;
		--n;
		++count;
		kfree(vp, M_VNODE);
		atomic_add_int(&numvnodes, -1);
	}
	return count;
}