/*
 * Copyright (c) 2004 The DragonFly Project.  All rights reserved.
 *
 * This code is derived from software contributed to The DragonFly Project
 * by Matthew Dillon <dillon@backplane.com>
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 * 3. Neither the name of The DragonFly Project nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific, prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * $DragonFly: src/sys/kern/vfs_lock.c,v 1.24 2006/09/05 00:55:45 dillon Exp $
 */
/*
 * External virtual filesystem routines
 */
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/malloc.h>
#include <sys/mount.h>
#include <sys/vnode.h>
#include <sys/sysctl.h>

#include <machine/limits.h>

#include <vm/vm_object.h>

#include <sys/thread2.h>
static MALLOC_DEFINE(M_VNODE, "vnodes", "vnode structures");

static TAILQ_HEAD(freelst, vnode) vnode_free_list;	/* vnode free list */

int freevnodes = 0;
SYSCTL_INT(_debug, OID_AUTO, freevnodes, CTLFLAG_RD,
	&freevnodes, 0, "");
static int wantfreevnodes = 25;
SYSCTL_INT(_debug, OID_AUTO, wantfreevnodes, CTLFLAG_RW,
	&wantfreevnodes, 0, "");
static int minvnodes;
SYSCTL_INT(_kern, OID_AUTO, minvnodes, CTLFLAG_RW,
	&minvnodes, 0, "Minimum number of vnodes");
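
/*
 * Illustrative sketch, not part of this file: the knobs above are visible
 * from userland via sysctl(3).  A hypothetical monitoring snippet (userland
 * code, so it is shown inside this comment) could poll them like this:
 *
 *	#include <sys/types.h>
 *	#include <sys/sysctl.h>
 *	#include <stdio.h>
 *
 *	int
 *	main(void)
 *	{
 *		int freecnt, wantcnt;
 *		size_t len = sizeof(int);
 *
 *		if (sysctlbyname("debug.freevnodes", &freecnt, &len, NULL, 0))
 *			return (1);
 *		if (sysctlbyname("debug.wantfreevnodes", &wantcnt, &len, NULL, 0))
 *			return (1);
 *		printf("free %d want %d\n", freecnt, wantcnt);
 *		return (0);
 *	}
 */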
/*
 * Called from vfsinit()
 */
void
vfs_lock_init(void)
{
	minvnodes = desiredvnodes / 4;
	TAILQ_INIT(&vnode_free_list);
}
/*
 * Inline helper functions.  vbusy() and vfree() must be called while in a
 * critical section.
 *
 * Warning: must be callable if the caller holds a read spinlock to something
 * else, meaning we can't use read spinlocks here.
 */
static __inline void
__vbusy(struct vnode *vp)
{
	TAILQ_REMOVE(&vnode_free_list, vp, v_freelist);
	freevnodes--;
	vp->v_flag &= ~(VFREE|VAGE);
}
static __inline void
__vfree(struct vnode *vp)
{
	if (vp->v_flag & (VAGE|VRECLAIMED))
		TAILQ_INSERT_HEAD(&vnode_free_list, vp, v_freelist);
	else
		TAILQ_INSERT_TAIL(&vnode_free_list, vp, v_freelist);
	freevnodes++;
	vp->v_flag |= VFREE;
}
/*
 * Return 1 if we can immediately place the vnode on the freelist.
 */
static __inline int
vshouldfree(struct vnode *vp, int usecount)
{
	if (vp->v_flag & VFREE)
		return (0);		/* already free */
	if (vp->v_holdcnt != 0 || vp->v_usecount != usecount)
		return (0);		/* other holders */
	if (vp->v_object &&
	    (vp->v_object->ref_count || vp->v_object->resident_page_count)) {
		return (0);		/* VM object still holds state */
	}
	return (1);
}
/*
 * Add another ref to a vnode.  The vnode must already have at least one
 * ref.
 *
 * NOTE: The vnode may continue to reside on the free list.
 */
void
vref(struct vnode *vp)
{
	KKASSERT(vp->v_usecount > 0 && (vp->v_flag & VINACTIVE) == 0);
	atomic_add_int(&vp->v_usecount, 1);
}
/*
 * Add a ref to a vnode which may not have any refs.  This routine is called
 * from the namecache and vx_get().  If requested, the vnode will be
 * reactivated.
 *
 * Removal of the vnode from the free list is optional.  Since most vnodes
 * are temporary in nature we opt not to do it.  This also means we don't have
 * to deal with lock ordering issues between the freelist and vnode
 * spinlocks.
 *
 * We must acquire the vnode's spinlock to interlock against vrele().
 *
 * vget(), cache_vget(), and cache_vref() reactivate vnodes.  vx_get() does
 * not.
 */
void
vref_initial(struct vnode *vp, int reactivate)
{
	spin_lock_wr(&vp->v_spinlock);
	atomic_add_int(&vp->v_usecount, 1);
	if (reactivate)
		vp->v_flag &= ~VINACTIVE;
	spin_unlock_wr(&vp->v_spinlock);
}
/*
 * Release a ref on the vnode.  Since 0->1 transitions can only be made
 * by vref_initial(), 1->0 transitions will be protected by the spinlock.
 *
 * When handling a 1->0 transition the vnode is guaranteed to not be locked
 * and we can set the exclusive lock atomically while interlocked with our
 * spinlock.  A panic will occur if the lock is held.
 */
void
vrele(struct vnode *vp)
{
	spin_lock_wr(&vp->v_spinlock);
	if (vp->v_usecount > 1) {
		atomic_subtract_int(&vp->v_usecount, 1);
		spin_unlock_wr(&vp->v_spinlock);
		return;
	}
	KKASSERT(vp->v_usecount == 1);

	/*
	 * This is roughly equivalent to obtaining an exclusive
	 * lock, but the spinlock is already held (and remains held
	 * on return) and the lock must be obtainable without
	 * blocking, which it is in a 1->0 transition.
	 */
	lockmgr_setexclusive_interlocked(&vp->v_lock);

	/*
	 * VINACTIVE is interlocked by the spinlock, so we have to re-check
	 * the bit if we release and reacquire the spinlock even though
	 * we are holding the exclusive lockmgr lock throughout.
	 *
	 * VOP_INACTIVE can race other VOPs even though we hold an exclusive
	 * lock.  This is ok.  The ref count of 1 must remain intact through
	 * the VOP_INACTIVE call to avoid a recursion.
	 */
	while ((vp->v_flag & VINACTIVE) == 0 && vp->v_usecount == 1) {
		vp->v_flag |= VINACTIVE;
		spin_unlock_wr(&vp->v_spinlock);
		VOP_INACTIVE(vp);
		spin_lock_wr(&vp->v_spinlock);
	}

	/*
	 * NOTE: v_usecount might no longer be 1
	 */
	atomic_subtract_int(&vp->v_usecount, 1);
	if (vshouldfree(vp, 0))
		__vfree(vp);
	lockmgr_clrexclusive_interlocked(&vp->v_lock);
	/* spinlock unlocked */
}
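
/*
 * Illustrative sketch, not from the original file: a typical 0->N->0
 * reference cycle as a caller such as the namecache might perform it.
 * example_ref_cycle() and its choice to reactivate are assumptions made
 * for the example only.
 */
static __unused void
example_ref_cycle(struct vnode *vp)
{
	vref_initial(vp, 1);	/* take a ref, clear VINACTIVE */
	/* ... use the vnode ... */
	vrele(vp);		/* possible 1->0, may deactivate/free */
}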
/*
 * Hold a vnode, preventing it from being recycled (unless it is already
 * undergoing recycling or has already been recycled).
 *
 * Opting not to remove a vnode from the freelist simply means that
 * allocvnode must do it for us if it finds an unsuitable vnode.
 */
void
vhold(struct vnode *vp)
{
	spin_lock_wr(&vp->v_spinlock);
	atomic_add_int(&vp->v_holdcnt, 1);
	spin_unlock_wr(&vp->v_spinlock);
}
/*
 * Like vrele(), we must atomically place the vnode on the free list if
 * it becomes suitable.  vhold()/vdrop() do not mess with VINACTIVE.
 */
void
vdrop(struct vnode *vp)
{
	KKASSERT(vp->v_holdcnt > 0);
	spin_lock_wr(&vp->v_spinlock);
	atomic_subtract_int(&vp->v_holdcnt, 1);
	if (vshouldfree(vp, 0))
		__vfree(vp);
	spin_unlock_wr(&vp->v_spinlock);
}
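
/*
 * Illustrative sketch, not from the original file: vhold() pins a vnode's
 * identity across a blocking operation without taking a usecount ref.
 * example_hold_across_block() and the elided blocking operation are
 * hypothetical.
 */
static __unused void
example_hold_across_block(struct vnode *vp)
{
	vhold(vp);		/* keep allocvnode() from recycling vp */
	/* ... block here; vp cannot be reused out from under us ... */
	vdrop(vp);		/* may place vp back on the freelist */
}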
/****************************************************************
 *			VX LOCKING FUNCTIONS			*
 ****************************************************************
 *
 * These functions lock vnodes for reclamation and deactivation related
 * activities.  Only vp->v_lock, the top layer of the VFS, is locked.
 * You must be holding a normal reference in order to be able to safely
 * call vx_lock() and vx_unlock().
 *
 * vx_get() also differs from vget() in that it does not clear the
 * VINACTIVE bit on a vnode.
 */
void
vx_lock(struct vnode *vp)
{
	lockmgr(&vp->v_lock, LK_EXCLUSIVE);
}

void
vx_unlock(struct vnode *vp)
{
	lockmgr(&vp->v_lock, LK_RELEASE);
}

void
vx_get(struct vnode *vp)
{
	vref_initial(vp, 0);
	lockmgr(&vp->v_lock, LK_EXCLUSIVE);
}

int
vx_get_nonblock(struct vnode *vp)
{
	int error;

	vref_initial(vp, 0);
	error = lockmgr(&vp->v_lock, LK_EXCLUSIVE | LK_NOWAIT);
	if (error)
		vrele(vp);
	return (error);
}

void
vx_put(struct vnode *vp)
{
	lockmgr(&vp->v_lock, LK_RELEASE);
	vrele(vp);
}
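
/*
 * Illustrative sketch, not from the original file: using the VX interface
 * to inspect a vnode without reactivating it.  vx_get() refs and
 * exclusively locks vp but leaves VINACTIVE alone; vx_put() reverses both.
 * example_vx_inspect() is a hypothetical caller.
 */
static __unused int
example_vx_inspect(struct vnode *vp)
{
	int reclaimed;

	vx_get(vp);
	reclaimed = (vp->v_flag & VRECLAIMED) != 0;
	vx_put(vp);
	return (reclaimed);
}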
/****************************************************************
 *			VNODE ACQUISITION FUNCTIONS		*
 ****************************************************************
 *
 * vget() and vput() access a vnode for the intent of executing an
 * operation other than a reclamation or deactivation.  vget() will ref
 * and lock the vnode, vput() will unlock and deref the vnode.
 * The VOP_*() locking functions are used.
 *
 * CALLING VGET IS MANDATORY PRIOR TO ANY MODIFYING OPERATION ON A VNODE.
 * This is because vget handles the VINACTIVE interlock and is responsible
 * for clearing the bit.  If the bit is not cleared inode updates may not
 * occur.
 *
 * Special cases: If vget()'s locking operation fails the vrele() call may
 * cause the vnode to be deactivated (VOP_INACTIVE called).  However, this
 * never occurs if the vnode is in a reclaimed state.  Vnodes in reclaimed
 * states always return an error code of ENOENT.
 *
 * Special cases: vput() will unlock and, if it is the last reference,
 * deactivate the vnode.  The deactivation uses a separate non-layered
 * VX lock after the normal unlock.  XXX make it more efficient.
 */
int
vget(struct vnode *vp, int flags)
{
	int error;

	vref_initial(vp, 0);
	if (flags & LK_TYPE_MASK) {
		if ((error = vn_lock(vp, flags)) != 0) {
			vrele(vp);
		} else if (vp->v_flag & VRECLAIMED) {
			vput(vp);
			error = ENOENT;
		} else {
			vp->v_flag &= ~VINACTIVE;	/* XXX not MP safe */
			error = 0;
		}
	} else {
		panic("vget() called with no lock specified!");
		error = ENOENT;	/* not reached, compiler opt */
	}
	return (error);
}
void
vput(struct vnode *vp)
{
	vn_unlock(vp);
	vrele(vp);
}
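
/*
 * Illustrative sketch, not from the original file: the mandatory
 * vget()/vput() bracket around a modifying operation.  A reclaimed vnode
 * causes vget() to fail with ENOENT.  example_modify() and the elided VOP
 * calls are hypothetical.
 */
static __unused int
example_modify(struct vnode *vp)
{
	int error;

	if ((error = vget(vp, LK_EXCLUSIVE)) != 0)
		return (error);		/* e.g. ENOENT if reclaimed */
	/* ... issue modifying VOP operations here ... */
	vput(vp);			/* unlock and drop the ref */
	return (0);
}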
/*
 * Set and clear vnode flags.  The vnode spinlock, which interlocks v_flag
 * updates elsewhere in this file, is assumed to be the interlock here too.
 */
void
vsetflags(struct vnode *vp, int flags)
{
	spin_lock_wr(&vp->v_spinlock);
	vp->v_flag |= flags;
	spin_unlock_wr(&vp->v_spinlock);
}

void
vclrflags(struct vnode *vp, int flags)
{
	spin_lock_wr(&vp->v_spinlock);
	vp->v_flag &= ~flags;
	spin_unlock_wr(&vp->v_spinlock);
}
/*
 * Obtain a new vnode from the freelist, allocating more if necessary.
 * The returned vnode is VX locked & refd.
 */
struct vnode *
allocvnode(int lktimeout, int lkflags)
{
	struct vnode *vp;
	int count;

	/*
	 * Try to reuse vnodes if we hit the max.  This situation only
	 * occurs in certain large-memory (2G+) situations.  We cannot
	 * attempt to directly reclaim vnodes due to nasty recursion
	 * problems.
	 */
	while (numvnodes - freevnodes > desiredvnodes)
		vnlru_proc_wait();	/* let the vnlru thread catch up */
	/*
	 * Attempt to reuse a vnode already on the free list, allocating
	 * a new vnode if we can't find one or if we have not reached a
	 * good minimum for good LRU performance.
	 */
	if (freevnodes >= wantfreevnodes && numvnodes >= minvnodes) {
		for (count = 0; count < freevnodes; count++) {
			/*
			 * Pull the next vnode off the free list and do some
			 * sanity checks.  Note that regardless of how we
			 * block, if freevnodes is non-zero there had better
			 * be something on the list.
			 */
			vp = TAILQ_FIRST(&vnode_free_list);
			if (vp == NULL)
				panic("getnewvnode: free vnode isn't");
			KKASSERT(vp->v_flag & VFREE);
			/*
			 * Handle the case where the vnode was pulled off
			 * the free list while we were waiting for the
			 * spinlock.
			 */
			spin_lock_wr(&vp->v_spinlock);
			if ((vp->v_flag & VFREE) == 0) {
				spin_unlock_wr(&vp->v_spinlock);
				continue;
			}

			/*
			 * Lazy removal of the vnode from the freelist if
			 * the vnode has references.
			 */
			if (vp->v_usecount || vp->v_holdcnt) {
				__vbusy(vp);
				spin_unlock_wr(&vp->v_spinlock);
				continue;
			}
			/*
			 * vx_get() equivalent, but atomic with the
			 * spinlock held.  Since 0->1 transitions and the
			 * lockmgr are protected by the spinlock we must
			 * be able to get an exclusive lock without blocking
			 * here.
			 *
			 * Also take the vnode off of the free list and
			 * assert that it is inactive.
			 */
			atomic_add_int(&vp->v_usecount, 1);
			lockmgr_setexclusive_interlocked(&vp->v_lock);
			__vbusy(vp);
			KKASSERT(vp->v_flag & VINACTIVE);

			/*
			 * Reclaim the vnode.  VRECLAIMED will be set
			 * atomically before the spinlock is released
			 * by vgone_interlocked().
			 */
			if ((vp->v_flag & VRECLAIMED) == 0) {
				vgone_interlocked(vp);
				/* spinlock unlocked */
			} else {
				spin_unlock_wr(&vp->v_spinlock);
			}
			/*
			 * We reclaimed the vnode but other claimants may
			 * have referenced it while we were blocked.  We
			 * cannot reuse a vnode until all refs are gone and
			 * the vnode has completed reclamation.
			 */
			KKASSERT(vp->v_flag & VRECLAIMED);
			if (vp->v_usecount != 1 || vp->v_holdcnt) {
				vx_put(vp);
				continue;
			}

			/*
			 * There are no more structural references to the
			 * vnode, referenced or otherwise.  We have a vnode!
			 *
			 * The vnode may have been placed on the free list
			 * while we were blocked.
			 */
			if (vp->v_flag & VFREE)
				__vbusy(vp);
			KKASSERT(vp->v_flag & VINACTIVE);
			break;
		}
		if (count == freevnodes)
			vp = NULL;	/* scan found no suitable vnode */
	} else {
		vp = NULL;
	}
	/*
	 * If we have a vp it will be refd and VX locked.
	 */
	if (vp) {
		if (vp->v_usecount != 1 || vp->v_holdcnt)
			panic("cleaned vnode isn't");
		if (vp->v_track_read.bk_active + vp->v_track_write.bk_active)
			panic("Clean vnode has pending I/O's");
		KKASSERT(vp->v_mount == NULL);
		vp->v_writecount = 0;	/* XXX */
		lockreinit(&vp->v_lock, "vnode", lktimeout, lkflags);
		KKASSERT(TAILQ_FIRST(&vp->v_namecache) == NULL);
	} else {
		/*
		 * A brand-new vnode (we could use malloc() here I think) XXX
		 */
		vp = kmalloc(sizeof(struct vnode), M_VNODE, M_WAITOK|M_ZERO);
		lwkt_token_init(&vp->v_pollinfo.vpi_token);
		lockinit(&vp->v_lock, "vnode", lktimeout, lkflags);
		ccms_dataspace_init(&vp->v_ccms);
		TAILQ_INIT(&vp->v_namecache);

		/*
		 * short cut around vfreeing it and looping, just set it up
		 * as if we had pulled a reclaimed vnode off the freelist
		 * and reinitialized it.
		 */
		vp->v_usecount = 1;
		lockmgr(&vp->v_lock, LK_EXCLUSIVE);
		numvnodes++;
	}
	RB_INIT(&vp->v_rbclean_tree);
	RB_INIT(&vp->v_rbdirty_tree);
	RB_INIT(&vp->v_rbhash_tree);
	vp->v_filesize = NOOFFSET;
	KKASSERT(vp->v_mount == NULL);
	return (vp);
}
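
/*
 * Illustrative sketch, not from the original file: how a filesystem might
 * obtain a fresh vnode and then release it.  example_alloc_release() and
 * the zero lktimeout/lkflags parameters are assumptions for the example.
 */
static __unused void
example_alloc_release(void)
{
	struct vnode *vp;

	vp = allocvnode(0, 0);	/* returned refd + VX locked */
	/* ... set up v_type, v_ops, v_data, attach to a mount ... */
	vx_put(vp);		/* unlock and drop the ref */
}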