/*
 * Copyright (c) 2004 The DragonFly Project.  All rights reserved.
 *
 * This code is derived from software contributed to The DragonFly Project
 * by Matthew Dillon <dillon@backplane.com>
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 * 3. Neither the name of The DragonFly Project nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific, prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * $DragonFly: src/sys/kern/vfs_lock.c,v 1.30 2008/06/30 03:57:41 dillon Exp $
 */

/*
 * External virtual filesystem routines
 */
#include "opt_ddb.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/malloc.h>
#include <sys/mount.h>
#include <sys/proc.h>
#include <sys/vnode.h>
#include <sys/buf.h>
#include <sys/sysctl.h>

#include <machine/limits.h>

#include <vm/vm.h>
#include <vm/vm_object.h>

#include <sys/buf2.h>
#include <sys/thread2.h>
#include <sys/sysref2.h>
#include <sys/mplock2.h>

static void vnode_terminate(struct vnode *vp);
static boolean_t vnode_ctor(void *obj, void *private, int ocflags);
static void vnode_dtor(void *obj, void *private);

static MALLOC_DEFINE(M_VNODE, "vnodes", "vnode structures");
static struct sysref_class vnode_sysref_class = {
	.name =		"vnode",
	.mtype =	M_VNODE,
	.proto =	SYSREF_PROTO_VNODE,
	.offset =	offsetof(struct vnode, v_sysref),
	.objsize =	sizeof(struct vnode),
	.mag_capacity =	256,
	.flags =	SRC_MANAGEDINIT,
	.ctor =		vnode_ctor,
	.dtor =		vnode_dtor,
	.ops = {
		.terminate = (sysref_terminate_func_t)vnode_terminate,
		.lock = (sysref_terminate_func_t)vx_lock,
		.unlock = (sysref_terminate_func_t)vx_unlock
	}
};

/*
 * The vnode free list holds inactive vnodes.  Aged inactive vnodes
 * are inserted prior to the mid point, and otherwise inserted
 * at the tail.
 */
static TAILQ_HEAD(freelst, vnode) vnode_free_list;
static struct vnode	vnode_free_mid1;
static struct vnode	vnode_free_mid2;
static struct vnode	vnode_free_rover;
static struct spinlock	vfs_spin = SPINLOCK_INITIALIZER(vfs_spin);
static enum { ROVER_MID1, ROVER_MID2 } rover_state = ROVER_MID2;
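
/*
 * Illustrative freelist layout (a sketch derived from __vfree() and
 * vnode_rover_locked() below, not a normative diagram):
 *
 *	HEAD -> [reclaimed] .. [no cached data] MID1 [swap-only] MID2
 *		[resident pages] -> TAIL
 *
 * Vnodes are reused from the head, so the vnodes with the least
 * valuable cached state are recycled first.
 */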

int freevnodes = 0;
SYSCTL_INT(_debug, OID_AUTO, freevnodes, CTLFLAG_RD,
	&freevnodes, 0, "Number of free vnodes");
static int wantfreevnodes = 25;
SYSCTL_INT(_debug, OID_AUTO, wantfreevnodes, CTLFLAG_RW,
	&wantfreevnodes, 0, "Desired number of free vnodes");
#ifdef TRACKVNODE
static ulong trackvnode;
SYSCTL_ULONG(_debug, OID_AUTO, trackvnode, CTLFLAG_RW,
	&trackvnode, 0, "Track a specific vnode address for debugging");
#endif

/*
 * Called from vfsinit()
 */
void
vfs_lock_init(void)
{
	TAILQ_INIT(&vnode_free_list);
	TAILQ_INSERT_TAIL(&vnode_free_list, &vnode_free_mid1, v_freelist);
	TAILQ_INSERT_TAIL(&vnode_free_list, &vnode_free_mid2, v_freelist);
	TAILQ_INSERT_TAIL(&vnode_free_list, &vnode_free_rover, v_freelist);
	spin_init(&vfs_spin);
	kmalloc_raise_limit(M_VNODE, 0);	/* unlimited */
}

/*
 * Misc functions
 */
static __inline
void
_vsetflags(struct vnode *vp, int flags)
{
	atomic_set_int(&vp->v_flag, flags);
}

static __inline
void
_vclrflags(struct vnode *vp, int flags)
{
	atomic_clear_int(&vp->v_flag, flags);
}

void
vsetflags(struct vnode *vp, int flags)
{
	_vsetflags(vp, flags);
}

void
vclrflags(struct vnode *vp, int flags)
{
	_vclrflags(vp, flags);
}
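
/*
 * Example (illustrative, not part of the original file): because v_flag
 * is manipulated with atomic ops, independent bits may be set and
 * cleared concurrently without holding v_spinlock:
 *
 *	vsetflags(vp, VAGE0 | VAGE1);		(thread A)
 *	vclrflags(vp, VONWORKLST);		(thread B)
 *
 * A plain read-modify-write (vp->v_flag |= flags) could lose one of
 * the two updates.
 */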

/*
 * Inline helper functions.
 *
 * WARNING: vbusy() may only be called while the vnode lock or VX lock
 *	    is held.  The vnode spinlock need not be held.
 *
 * MPSAFE
 */
static __inline
void
__vbusy_interlocked(struct vnode *vp)
{
	KKASSERT(vp->v_flag & VFREE);
	TAILQ_REMOVE(&vnode_free_list, vp, v_freelist);
	freevnodes--;
	_vclrflags(vp, VFREE);
}

static __inline
void
__vbusy(struct vnode *vp)
{
#ifdef TRACKVNODE
	if ((ulong)vp == trackvnode)
		kprintf("__vbusy %p %08x\n", vp, vp->v_flag);
#endif
	spin_lock(&vfs_spin);
	__vbusy_interlocked(vp);
	spin_unlock(&vfs_spin);
}

/*
 * Put a vnode on the free list.  The caller has cleared VCACHED or owns the
 * implied sysref related to having removed the vnode from the freelist
 * (and VCACHED is already clear in that case).
 *
 * MPSAFE
 */
static __inline
void
__vfree(struct vnode *vp)
{
#ifdef TRACKVNODE
	if ((ulong)vp == trackvnode) {
		kprintf("__vfree %p %08x\n", vp, vp->v_flag);
		print_backtrace(-1);
	}
#endif
	spin_lock(&vfs_spin);
	KKASSERT((vp->v_flag & VFREE) == 0);

	/*
	 * Distinguish between basically dead vnodes, vnodes with cached
	 * data, and vnodes without cached data.  A rover will shift the
	 * vnodes around as their cache status is lost.
	 */
	if (vp->v_flag & VRECLAIMED) {
		TAILQ_INSERT_HEAD(&vnode_free_list, vp, v_freelist);
	} else if (vp->v_object && vp->v_object->resident_page_count) {
		TAILQ_INSERT_TAIL(&vnode_free_list, vp, v_freelist);
	} else if (vp->v_object && vp->v_object->swblock_count) {
		TAILQ_INSERT_BEFORE(&vnode_free_mid2, vp, v_freelist);
	} else {
		TAILQ_INSERT_BEFORE(&vnode_free_mid1, vp, v_freelist);
	}
	freevnodes++;
	_vsetflags(vp, VFREE);
	spin_unlock(&vfs_spin);
}

/*
 * Put a vnode at the tail of the free list.  The caller has cleared
 * VCACHED or owns the implied sysref related to having removed the
 * vnode from the freelist (and VCACHED is already clear in that case).
 *
 * MPSAFE
 */
static __inline
void
__vfreetail(struct vnode *vp)
{
#ifdef TRACKVNODE
	if ((ulong)vp == trackvnode)
		kprintf("__vfreetail %p %08x\n", vp, vp->v_flag);
#endif
	spin_lock(&vfs_spin);
	KKASSERT((vp->v_flag & VFREE) == 0);
	TAILQ_INSERT_TAIL(&vnode_free_list, vp, v_freelist);
	freevnodes++;
	_vsetflags(vp, VFREE);
	spin_unlock(&vfs_spin);
}

/*
 * Return a C boolean if we should put the vnode on the freelist (VFREE),
 * or leave it / mark it as VCACHED.
 *
 * This routine is only valid if the vnode is already either VFREE or
 * VCACHED, or if it can become VFREE or VCACHED via vnode_terminate().
 *
 * WARNING! This function is typically called with v_spinlock held.
 *
 * MPSAFE
 */
static __inline boolean_t
vshouldfree(struct vnode *vp)
{
	return (vp->v_auxrefs == 0 &&
	    (vp->v_object == NULL || vp->v_object->resident_page_count == 0));
}

/*
 * Add a ref to an active vnode.  This function should never be called
 * with an inactive vnode (use vget() instead).
 *
 * MPSAFE
 */
void
vref(struct vnode *vp)
{
	KKASSERT(vp->v_sysref.refcnt > 0 &&
		 (vp->v_flag & (VFREE|VINACTIVE)) == 0);
	sysref_get(&vp->v_sysref);
}

/*
 * Release a ref on an active or inactive vnode.  The sysref termination
 * function will be called when the last active reference is released,
 * and the vnode is returned to the objcache when the last inactive
 * reference is released.
 */
void
vrele(struct vnode *vp)
{
	sysref_put(&vp->v_sysref);
}
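
/*
 * Usage sketch (illustrative): a caller holding an active vnode may hand
 * a counted reference to another subsystem:
 *
 *	vref(vp);			add our own active ref
 *	some_subsystem_take(vp);	subsystem now owes a vrele()
 *	...
 *	vrele(vp);			subsystem's matching release
 *
 * some_subsystem_take() is a hypothetical consumer, not a kernel API.
 */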

/*
 * Add an auxiliary data structure reference to the vnode.  Auxiliary
 * references do not change the state of the vnode or prevent it
 * from being deactivated, reclaimed, or placed on or removed from
 * the free list.
 *
 * An auxiliary reference DOES prevent the vnode from being destroyed,
 * allowing you to vx_lock() it, test state, etc.
 *
 * An auxiliary reference DOES NOT move a vnode out of the VFREE state
 * once it has entered it.
 *
 * WARNING!  vhold() and vhold_interlocked() must not acquire v_spinlock.
 *	     The spinlock may or may not already be held by the caller.
 *	     vdrop() will clean up the free list state.
 *
 * MPSAFE
 */
void
vhold(struct vnode *vp)
{
	KKASSERT(vp->v_sysref.refcnt != 0);
	atomic_add_int(&vp->v_auxrefs, 1);
}

void
vhold_interlocked(struct vnode *vp)
{
	atomic_add_int(&vp->v_auxrefs, 1);
}
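
/*
 * Usage sketch (illustrative): auxiliary structures such as namecache
 * entries pin the vnode's memory without keeping it active:
 *
 *	vhold(vp);	vp may be deactivated but not destroyed
 *	... stash vp, possibly across blocking operations ...
 *	vdrop(vp);	may move a VCACHED vnode to the freelist
 */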

/*
 * Remove an auxiliary reference from the vnode.
 *
 * vdrop() needs to check for a VCACHED->VFREE transition to catch cases
 * where a vnode is held past its reclamation.  We use v_spinlock to
 * interlock VCACHED -> !VCACHED transitions.
 *
 * MPSAFE
 */
void
vdrop(struct vnode *vp)
{
	KKASSERT(vp->v_sysref.refcnt != 0 && vp->v_auxrefs > 0);
	spin_lock(&vp->v_spinlock);
	atomic_subtract_int(&vp->v_auxrefs, 1);
	if ((vp->v_flag & VCACHED) && vshouldfree(vp)) {
		_vclrflags(vp, VCACHED);
		__vfree(vp);
	}
	spin_unlock(&vp->v_spinlock);
}

/*
 * This function is called when the last active reference on the vnode
 * is released, typically via vrele().  SYSREF will VX lock the vnode
 * and then give the vnode a negative ref count, indicating that it is
 * undergoing termination or is being set aside for the cache, and one
 * final sysref_put() is required to actually return it to the memory
 * subsystem.
 *
 * Additional inactive sysrefs may race us but that's ok.  Reactivations
 * cannot race us because the sysref code is interlocked with the VX lock
 * (which is held on call).
 *
 * MPSAFE
 */
void
vnode_terminate(struct vnode *vp)
{
	/*
	 * We own the VX lock, it should not be possible for someone else
	 * to have reactivated the vp.
	 */
	KKASSERT(sysref_isinactive(&vp->v_sysref));

	/*
	 * Deactivate the vnode by marking it VFREE or VCACHED.
	 * The vnode can be reactivated from either state until
	 * reclaimed.  These states inherit the 'last' sysref on the
	 * vnode.
	 *
	 * NOTE: There may be additional inactive references from
	 *	 other entities blocking on the VX lock while we hold it,
	 *	 but this does not prevent us from changing the vnode's
	 *	 state.
	 *
	 * NOTE: The vnode could already be marked inactive.  XXX
	 *	 how?
	 *
	 * NOTE: v_mount may be NULL due to assignment to
	 *	 dead_vnode_vops
	 *
	 * NOTE: The vnode may be marked inactive with dirty buffers
	 *	 or dirty pages in its cached VM object still present.
	 *
	 * NOTE: VCACHED should not be set on entry.  We lose control
	 *	 of the sysref the instant the vnode is placed on the
	 *	 free list or when VCACHED is set.
	 *
	 *	 The VX lock is required when transitioning to
	 *	 +VCACHED but is not sufficient for the vshouldfree()
	 *	 interlocked test or when transitioning to -VCACHED.
	 */
	if ((vp->v_flag & VINACTIVE) == 0) {
		_vsetflags(vp, VINACTIVE);
		if (vp->v_mount)
			VOP_INACTIVE(vp);
	}
	spin_lock(&vp->v_spinlock);
	KKASSERT((vp->v_flag & (VFREE|VCACHED)) == 0);
	if (vshouldfree(vp))
		__vfree(vp);
	else
		_vsetflags(vp, VCACHED);	/* inactive but not yet free */
	spin_unlock(&vp->v_spinlock);
	vx_unlock(vp);
}
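
/*
 * Vnode lifecycle sketch (an illustrative summary of the above, not a
 * complete state machine):
 *
 *	active --(last vrele)--> vnode_terminate()
 *	    --> VINACTIVE + VCACHED	(still has cached pages / auxrefs)
 *	    --> VINACTIVE + VFREE	(on the freelist)
 *	VFREE/VCACHED --(vget)--> active again
 *	VFREE --(allocfreevnode)--> reclaimed and reused
 */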

/*
 * Physical vnode constructor / destructor.  These are only executed on
 * the backend of the objcache.  They are NOT executed on every vnode
 * allocation or deallocation.
 *
 * MPSAFE
 */
boolean_t
vnode_ctor(void *obj, void *private, int ocflags)
{
	struct vnode *vp = obj;

	lwkt_token_init(&vp->v_token, 1, "vnode");
	lockinit(&vp->v_lock, "vnode", 0, 0);
	ccms_dataspace_init(&vp->v_ccms);
	TAILQ_INIT(&vp->v_namecache);
	RB_INIT(&vp->v_rbclean_tree);
	RB_INIT(&vp->v_rbdirty_tree);
	RB_INIT(&vp->v_rbhash_tree);
	return(TRUE);
}

/*
 * MPSAFE
 */
void
vnode_dtor(void *obj, void *private)
{
	struct vnode *vp = obj;

	KKASSERT((vp->v_flag & (VCACHED|VFREE)) == 0);
	ccms_dataspace_destroy(&vp->v_ccms);
}

/****************************************************************
 *			VX LOCKING FUNCTIONS			*
 ****************************************************************
 *
 * These functions lock vnodes for reclamation and deactivation related
 * activities.  The caller must already be holding some sort of reference
 * on the vnode.
 *
 * MPSAFE
 */
void
vx_lock(struct vnode *vp)
{
	lockmgr(&vp->v_lock, LK_EXCLUSIVE);
}

/*
 * The non-blocking version also uses a slightly different mechanic.
 * This function will explicitly fail not only if it cannot acquire
 * the lock normally, but also if the caller already holds a lock.
 *
 * The adjusted mechanic is used to close a loophole where complex
 * VOP_RECLAIM code can circle around recursively and allocate the
 * same vnode it is trying to destroy from the freelist.
 *
 * Any filesystem (aka UFS) which puts LK_CANRECURSE in lk_flags can
 * cause the incorrect behavior to occur.  If not for that lockmgr()
 * would do the right thing.
 */
static int
vx_lock_nonblock(struct vnode *vp)
{
	if (lockcountnb(&vp->v_lock))
		return(EBUSY);
	return(lockmgr(&vp->v_lock, LK_EXCLUSIVE | LK_NOWAIT | LK_NOSPINWAIT));
}

void
vx_unlock(struct vnode *vp)
{
	lockmgr(&vp->v_lock, LK_RELEASE);
}

/****************************************************************
 *		     VNODE ACQUISITION FUNCTIONS		*
 ****************************************************************
 *
 * These functions must be used when accessing a vnode via an auxiliary
 * reference such as the namecache or free list, or when you wish to
 * do a combo ref+lock sequence.
 *
 * These functions are MANDATORY for any code chain accessing a vnode
 * whose activation state is not known.
 *
 * vget() can be called with LK_NOWAIT and will return EBUSY if the
 * lock cannot be immediately acquired.
 *
 * vget()/vput() are used when reactivation is desired.
 *
 * vx_get() and vx_put() are used when reactivation is not desired.
 */
int
vget(struct vnode *vp, int flags)
{
	int error;

	/*
	 * A lock type must be passed
	 */
	if ((flags & LK_TYPE_MASK) == 0) {
		panic("vget() called with no lock specified!");
		/* NOT REACHED */
	}

	/*
	 * Reference the structure and then acquire the lock.  0->1
	 * transitions and refs during termination are allowed here so
	 * call sysref directly.
	 *
	 * NOTE: The requested lock might be a shared lock and does
	 *	 not protect our access to the refcnt or other fields.
	 */
	sysref_get(&vp->v_sysref);
	if ((error = vn_lock(vp, flags)) != 0) {
		/*
		 * The lock failed, undo and return an error.
		 */
		sysref_put(&vp->v_sysref);
	} else if (vp->v_flag & VRECLAIMED) {
		/*
		 * The vnode is being reclaimed and cannot be reactivated
		 * any more, undo and return ENOENT.
		 */
		vn_unlock(vp);
		vrele(vp);
		error = ENOENT;
	} else {
		/*
		 * If the vnode is marked VFREE or VCACHED it needs to be
		 * reactivated, otherwise it had better already be active.
		 * VINACTIVE must also be cleared.
		 *
		 * In the VFREE/VCACHED case we have to throw away the
		 * sysref that was earmarking those cases and preventing
		 * the vnode from being destroyed.  Our sysref is still held.
		 *
		 * We are allowed to reactivate the vnode while we hold
		 * the VX lock, assuming it can be reactivated.
		 */
		spin_lock(&vp->v_spinlock);
		if (vp->v_flag & VFREE) {
			__vbusy(vp);
			sysref_activate(&vp->v_sysref);
			spin_unlock(&vp->v_spinlock);
			sysref_put(&vp->v_sysref);
		} else if (vp->v_flag & VCACHED) {
			_vclrflags(vp, VCACHED);
			sysref_activate(&vp->v_sysref);
			spin_unlock(&vp->v_spinlock);
			sysref_put(&vp->v_sysref);
		} else {
			if (sysref_isinactive(&vp->v_sysref)) {
				sysref_activate(&vp->v_sysref);
				kprintf("Warning vp %p reactivation race\n",
					vp);
			}
			spin_unlock(&vp->v_spinlock);
		}
		_vclrflags(vp, VINACTIVE);
		error = 0;
	}
	return(error);
}
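
/*
 * Usage sketch (illustrative): resolving a vnode obtained via an
 * auxiliary reference (e.g. the namecache), where its activation state
 * is unknown:
 *
 *	if (vget(vp, LK_SHARED) == 0) {
 *		... vp is active, locked, and referenced ...
 *		vput(vp);	unlock + vrele in one call
 *	} else {
 *		... vp was reclaimed out from under us (ENOENT) or
 *		    the lock could not be acquired ...
 *	}
 */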

/*
 * MPSAFE
 */
void
vput(struct vnode *vp)
{
	vn_unlock(vp);
	vrele(vp);
}

/*
 * XXX The vx_*() locks should use auxrefs, not the main reference counter.
 *
 * MPSAFE
 */
void
vx_get(struct vnode *vp)
{
	sysref_get(&vp->v_sysref);
	lockmgr(&vp->v_lock, LK_EXCLUSIVE);
}

/*
 * MPSAFE
 */
int
vx_get_nonblock(struct vnode *vp)
{
	int error;

	sysref_get(&vp->v_sysref);
	error = lockmgr(&vp->v_lock, LK_EXCLUSIVE | LK_NOWAIT);
	if (error)
		sysref_put(&vp->v_sysref);
	return(error);
}

/*
 * Release a VX lock that also held a ref on the vnode.
 *
 * vx_put() needs to check for a VCACHED->VFREE transition to catch the
 * case where e.g. vnlru issues a vgone*().
 *
 * MPSAFE
 */
void
vx_put(struct vnode *vp)
{
	spin_lock(&vp->v_spinlock);
	if ((vp->v_flag & VCACHED) && vshouldfree(vp)) {
		_vclrflags(vp, VCACHED);
		__vfree(vp);
	}
	spin_unlock(&vp->v_spinlock);
	lockmgr(&vp->v_lock, LK_RELEASE);
	sysref_put(&vp->v_sysref);
}
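
/*
 * Usage sketch (illustrative): examining or tearing down a vnode without
 * reactivating it, e.g. from reclamation-related code:
 *
 *	vx_get(vp);	ref + exclusive VX lock, regardless of state
 *	... inspect state, or vgone_vxlocked(vp) ...
 *	vx_put(vp);	may return the vnode to the freelist
 */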

/*
 * The rover looks for vnodes past the midline with no cached data and
 * moves them to before the midline.  If we do not do this the midline
 * can wind up in a degenerate state.
 */
static
void
vnode_rover_locked(void)
{
	struct vnode *vp;

	/*
	 * Get the vnode after the rover.  The rover roves between mid1 and
	 * the end so the only special vnode it can encounter is mid2.
	 */
	vp = TAILQ_NEXT(&vnode_free_rover, v_freelist);
	if (vp == &vnode_free_mid2) {
		vp = TAILQ_NEXT(vp, v_freelist);
		rover_state = ROVER_MID2;
	}
	KKASSERT(vp != &vnode_free_mid1);

	/*
	 * Start over if we finished the scan.
	 */
	TAILQ_REMOVE(&vnode_free_list, &vnode_free_rover, v_freelist);
	if (vp == NULL) {
		TAILQ_INSERT_AFTER(&vnode_free_list, &vnode_free_mid1,
				   &vnode_free_rover, v_freelist);
		rover_state = ROVER_MID1;
		return;
	}
	TAILQ_INSERT_AFTER(&vnode_free_list, vp, &vnode_free_rover, v_freelist);

	/*
	 * Shift vp if appropriate.
	 */
	if (vp->v_object && vp->v_object->resident_page_count) {
		/*
		 * Promote vnode with resident pages to section 3.
		 * (This case shouldn't happen).
		 */
		if (rover_state == ROVER_MID1) {
			TAILQ_REMOVE(&vnode_free_list, vp, v_freelist);
			TAILQ_INSERT_TAIL(&vnode_free_list, vp, v_freelist);
		}
	} else if (vp->v_object && vp->v_object->swblock_count) {
		/*
		 * Demote vnode with only swap pages to section 2
		 */
		if (rover_state == ROVER_MID2) {
			TAILQ_REMOVE(&vnode_free_list, vp, v_freelist);
			TAILQ_INSERT_BEFORE(&vnode_free_mid2, vp, v_freelist);
		}
	} else {
		/*
		 * Demote vnode with no cached data to section 1
		 */
		TAILQ_REMOVE(&vnode_free_list, vp, v_freelist);
		TAILQ_INSERT_BEFORE(&vnode_free_mid1, vp, v_freelist);
	}
}

/*
 * Try to reuse a vnode from the free list.
 *
 * NOTE: The returned vnode is not completely initialized.
 *
 * WARNING: The freevnodes count can race, NULL can be returned even if
 *	    freevnodes != 0.
 *
 * MPSAFE
 */
static
struct vnode *
allocfreevnode(void)
{
	struct vnode *vp;
	int count;

	for (count = 0; count < freevnodes; count++) {
		/*
		 * Try to lock the first vnode on the free list.
		 * Cycle if we can't.
		 *
		 * We use a bad hack in vx_lock_nonblock() which avoids
		 * the lock order reversal between vfs_spin and v_spinlock.
		 * This is very fragile code and I don't want to use
		 * vhold here.
		 */
		spin_lock(&vfs_spin);
		vnode_rover_locked();
		vnode_rover_locked();
		vp = TAILQ_FIRST(&vnode_free_list);
		while (vp == &vnode_free_mid1 || vp == &vnode_free_mid2 ||
		       vp == &vnode_free_rover) {
			vp = TAILQ_NEXT(vp, v_freelist);
		}
		if (vp == NULL)
			break;
		if (vx_lock_nonblock(vp)) {
			KKASSERT(vp->v_flag & VFREE);
			TAILQ_REMOVE(&vnode_free_list, vp, v_freelist);
			TAILQ_INSERT_TAIL(&vnode_free_list,
					  vp, v_freelist);
			spin_unlock(&vfs_spin);
			continue;
		}

		/*
		 * We inherit the sysref associated with the vnode on the
		 * free list.  Because VCACHED is clear the vnode will not
		 * be placed back on the free list.  We own the sysref
		 * free and clear and thus control the disposition of
		 * the vnode.
		 */
		__vbusy_interlocked(vp);
		spin_unlock(&vfs_spin);
#ifdef TRACKVNODE
		if ((ulong)vp == trackvnode)
			kprintf("allocfreevnode %p %08x\n", vp, vp->v_flag);
#endif
		/*
		 * Do not reclaim/reuse a vnode while auxiliary refs exist.
		 * This includes namecache refs due to a related ncp being
		 * locked or having children.
		 *
		 * We will make this test several times as auxrefs can
		 * get incremented on us without any spinlocks being held
		 * until we have removed all namecache and inode references
		 * to the vnode.
		 *
		 * Because VCACHED is already in the correct state (cleared)
		 * we cannot race other vdrop()s occurring at the same time
		 * and can safely place vp on the free list.
		 *
		 * The free list association reinherits the sysref.
		 */
		if (vp->v_auxrefs) {
			__vfreetail(vp);
			vx_unlock(vp);
			continue;
		}

		/*
		 * We inherit the reference that was previously associated
		 * with the vnode being on the free list.  VCACHED had better
		 * not be set because the reference and VX lock prevents
		 * the sysref from transitioning to an active state.
		 */
		KKASSERT((vp->v_flag & (VINACTIVE|VCACHED)) == VINACTIVE);
		KKASSERT(sysref_isinactive(&vp->v_sysref));

		/*
		 * Holding the VX lock on an inactive vnode prevents it
		 * from being reactivated or reused.  New namecache
		 * associations can only be made using active vnodes.
		 *
		 * Another thread may be blocked on our vnode lock while
		 * holding a namecache lock.  We can only reuse this vnode
		 * if we can clear all namecache associations without
		 * blocking.
		 *
		 * Because VCACHED is already in the correct state (cleared)
		 * we cannot race other vdrop()s occurring at the same time
		 * and can safely place vp on the free list.
		 */
		if ((vp->v_flag & VRECLAIMED) == 0) {
			if (cache_inval_vp_nonblock(vp)) {
				__vfreetail(vp);
				vx_unlock(vp);
				continue;
			}
			vgone_vxlocked(vp);
			/* vnode is still VX locked */
		}

		/*
		 * We can reuse the vnode if no primary or auxiliary
		 * references remain other than ours, else put it
		 * back on the free list and keep looking.
		 *
		 * Either the free list inherits the last reference
		 * or we fall through and sysref_activate() the last
		 * reference.
		 *
		 * Since the vnode is in a VRECLAIMED state, no new
		 * namecache associations could have been made.
		 */
		KKASSERT(TAILQ_EMPTY(&vp->v_namecache));
		if (vp->v_auxrefs ||
		    !sysref_islastdeactivation(&vp->v_sysref)) {
			__vfreetail(vp);
			vx_unlock(vp);
			continue;
		}

		/*
		 * Return a VX locked vnode suitable for reuse.  The caller
		 * inherits the sysref.
		 */
		return(vp);
	}
	return(NULL);
}

/*
 * Obtain a new vnode from the freelist, allocating more if necessary.
 * The returned vnode is VX locked & refd.
 *
 * All new vnodes set the VAGE flags.  An open() of the vnode will
 * decrement the (2-bit) flags.  Vnodes which are opened several times
 * are thus retained in the cache over vnodes which are merely stat()d.
 *
 * MPSAFE
 */
struct vnode *
allocvnode(int lktimeout, int lkflags)
{
	struct vnode *vp;

	/*
	 * Try to reuse vnodes if we hit the max.  This situation only
	 * occurs in certain large-memory (2G+) situations.  We cannot
	 * attempt to directly reclaim vnodes due to nasty recursion
	 * problems.
	 */
	while (numvnodes - freevnodes > desiredvnodes)
		vnlru_proc_wait();

	/*
	 * Try to build up as many vnodes as we can before reallocating
	 * from the free list.  A vnode on the free list simply means
	 * that it is inactive with no resident pages.  It may or may not
	 * have been reclaimed and could have valuable information associated
	 * with it that we shouldn't throw away unless we really need to.
	 *
	 * HAMMER NOTE: Re-establishing a vnode is a fairly expensive
	 * operation for HAMMER but this should benefit UFS as well.
	 */
	if (freevnodes >= wantfreevnodes && numvnodes >= desiredvnodes)
		vp = allocfreevnode();
	else
		vp = NULL;
	if (vp == NULL) {
		vp = sysref_alloc(&vnode_sysref_class);
		KKASSERT((vp->v_flag & (VCACHED|VFREE)) == 0);
		lockmgr(&vp->v_lock, LK_EXCLUSIVE);
		numvnodes++;
	}

	/*
	 * We are using a managed sysref class, vnode fields are only
	 * zeroed on initial allocation from the backing store, not
	 * on reallocation.  Thus we have to clear these fields for both
	 * reallocation and reuse.
	 */
#ifdef INVARIANTS
	if (vp->v_data)
		panic("cleaned vnode isn't");
	if (bio_track_active(&vp->v_track_read) ||
	    bio_track_active(&vp->v_track_write)) {
		panic("Clean vnode has pending I/O's");
	}
	if (vp->v_flag & VONWORKLST)
		panic("Clean vnode still pending on syncer worklist!");
	if (!RB_EMPTY(&vp->v_rbdirty_tree))
		panic("Clean vnode still has dirty buffers!");
	if (!RB_EMPTY(&vp->v_rbclean_tree))
		panic("Clean vnode still has clean buffers!");
	if (!RB_EMPTY(&vp->v_rbhash_tree))
		panic("Clean vnode still on hash tree!");
	KKASSERT(vp->v_mount == NULL);
#endif
	vp->v_flag = VAGE0 | VAGE1;
	vp->v_lastw = 0;
	vp->v_lasta = 0;
	vp->v_cstart = 0;
	vp->v_clen = 0;
	vp->v_socket = 0;
	vp->v_opencount = 0;
	vp->v_writecount = 0;	/* XXX */

	/*
	 * lktimeout only applies when LK_TIMELOCK is used, and only
	 * the pageout daemon uses it.  The timeout may not be zero
	 * or the pageout daemon can deadlock in low-VM situations.
	 */
	if (lktimeout == 0)
		lktimeout = hz / 10;
	lockreinit(&vp->v_lock, "vnode", lktimeout, lkflags);
	KKASSERT(TAILQ_EMPTY(&vp->v_namecache));
	/* exclusive lock still held */

	/*
	 * Note: sysref needs to be activated to convert -0x40000000 to +1.
	 * The -0x40000000 comes from the last ref on reuse, and from
	 * sysref_init() on allocate.
	 */
	sysref_activate(&vp->v_sysref);
	vp->v_filesize = NOOFFSET;
	vp->v_type = VNON;
	vp->v_tag = 0;
	vp->v_ops = NULL;
	vp->v_data = NULL;
	KKASSERT(vp->v_mount == NULL);

	return (vp);
}
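
/*
 * Usage sketch (illustrative, with hypothetical helper names; real
 * filesystems go through their VFS-specific wrappers): instantiating
 * an inode as a vnode might look like:
 *
 *	vp = allocvnode(0, 0);		VX locked, refd, type VNON
 *	vp->v_type = VREG;
 *	vp->v_data = my_inode;		hypothetical private data
 *	... associate mount point, vop vector, etc ...
 *	vx_unlock(vp);			vnode becomes generally usable
 */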

/*
 * MPSAFE
 */
int
freesomevnodes(int n)
{
	struct vnode *vp;
	int count = 0;

	while (n) {
		--n;
		if ((vp = allocfreevnode()) == NULL)
			break;
		++count;	/* tally the vnode we are about to free */
		vx_put(vp);
		--numvnodes;
	}
	return(count);
}