/*
 * Copyright (c) 2004,2013-2022 The DragonFly Project.  All rights reserved.
 *
 * This code is derived from software contributed to The DragonFly Project
 * by Matthew Dillon <dillon@backplane.com>
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 * 3. Neither the name of The DragonFly Project nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific, prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

/*
 * External lock/ref-related vnode functions
 *
 * vs_state transition locking requirements:
 *
 *      INACTIVE -> CACHED|DYING        vx_lock(excl) + vi->spin
 *      DYING    -> CACHED              vx_lock(excl)
 *      ACTIVE   -> INACTIVE            (none)       + v_spin + vi->spin
 *      INACTIVE -> ACTIVE              vn_lock(any) + v_spin + vi->spin
 *      CACHED   -> ACTIVE              vn_lock(any) + v_spin + vi->spin
 *
 * NOTE: Switching to/from ACTIVE/INACTIVE requires v_spin and vi->spin.
 *
 *       Switching into ACTIVE also requires a vref and vnode lock, however
 *       the vnode lock is allowed to be SHARED.
 *
 *       Switching into a CACHED or DYING state requires an exclusive vnode
 *       lock or vx_lock (which is almost the same thing but not quite).
 */
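
/*
 * Illustrative sketch (not from the original sources): the CACHED -> ACTIVE
 * transition above corresponds to the sequence vget() uses further below:
 *
 *      vn_lock(vp, LK_SHARED);         (any vnode lock suffices)
 *      spin_lock(&vp->v_spin);
 *      _vactivate(vp);                 (acquires vi->spin internally, moves
 *                                       vp to the per-cpu active list and
 *                                       sets VS_ACTIVE)
 *      spin_unlock(&vp->v_spin);
 *
 * This is a simplified sketch; the real vget() also manages the refcnt and
 * the VINACTIVE/VREF_TERMINATE flags.
 */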

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/malloc.h>
#include <sys/mount.h>
#include <sys/proc.h>
#include <sys/vnode.h>
#include <sys/spinlock2.h>
#include <sys/sysctl.h>

#include <machine/limits.h>

#include <vm/vm.h>
#include <vm/vm_object.h>

#define VACT_MAX        10
#define VACT_INC        2
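
/*
 * v_act is a small per-vnode aging counter: vget() bumps it by VACT_INC
 * (clamped to VACT_MAX) each time the vnode is reused, and the active-list
 * rover in cleanfreevnode() decays it.  A vnode only becomes a deactivation
 * candidate once v_act reaches 0.
 */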

static void vnode_terminate(struct vnode *vp);

static MALLOC_DEFINE_OBJ(M_VNODE, sizeof(struct vnode), "vnodes", "vnodes");
static MALLOC_DEFINE(M_VNODE_HASH, "vnodelsthash", "vnode list hash");

/*
 * The vnode free list holds inactive vnodes.  Aged inactive vnodes
 * are inserted prior to the mid point, and otherwise inserted
 * at the tail.
 *
 * The vnode code goes to great lengths to avoid moving vnodes between
 * lists, but sometimes it is unavoidable.  For this situation we try to
 * avoid lock contention but we do not try very hard to avoid cache line
 * congestion.  A modestly sized hash table is used.
 */
#define VLIST_PRIME2    123462047LU
#define VLIST_XOR       (uintptr_t)0xab4582fa8322fb71LLU

#define VLIST_HASH(vp)  (((uintptr_t)vp ^ VLIST_XOR) % \
                         VLIST_PRIME2 % (unsigned)ncpus)
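
/*
 * A minimal sketch of what VLIST_HASH() computes (illustrative only): the
 * vnode's address is whitened with an xor constant and a large prime
 * modulus, then folded onto a per-cpu bucket index in [0, ncpus):
 *
 *      struct vnode_index *
 *      vnode_bucket(struct vnode *vp)          (hypothetical helper)
 *      {
 *              return (&vnode_list_hash[VLIST_HASH(vp)]);
 *      }
 *
 * The real code simply expands the macro inline, as in _vactivate() below.
 */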

static struct vnode_index *vnode_list_hash;

int activevnodes = 0;
SYSCTL_INT(_debug, OID_AUTO, activevnodes, CTLFLAG_RD,
        &activevnodes, 0, "Number of active vnodes");
int cachedvnodes = 0;
SYSCTL_INT(_debug, OID_AUTO, cachedvnodes, CTLFLAG_RD,
        &cachedvnodes, 0, "Number of total cached vnodes");
int inactivevnodes = 0;
SYSCTL_INT(_debug, OID_AUTO, inactivevnodes, CTLFLAG_RD,
        &inactivevnodes, 0, "Number of inactive vnodes");
static int batchfreevnodes = 5;
SYSCTL_INT(_debug, OID_AUTO, batchfreevnodes, CTLFLAG_RW,
        &batchfreevnodes, 0, "Number of vnodes to free at once");

static long auxrecovervnodes1;
SYSCTL_LONG(_debug, OID_AUTO, auxrecovervnodes1, CTLFLAG_RW,
        &auxrecovervnodes1, 0, "vnlru auxiliary vnodes recovered (active scan)");
static long auxrecovervnodes2;
SYSCTL_LONG(_debug, OID_AUTO, auxrecovervnodes2, CTLFLAG_RW,
        &auxrecovervnodes2, 0, "vnlru auxiliary vnodes recovered (inactive scan)");

#ifdef TRACKVNODE
static u_long trackvnode;
SYSCTL_ULONG(_debug, OID_AUTO, trackvnode, CTLFLAG_RW,
        &trackvnode, 0, "");
#endif

/*
 * Called from vfsinit()
 */
void
vfs_lock_init(void)
{
        int i;

        kmalloc_obj_raise_limit(M_VNODE, 0);    /* unlimited */
        vnode_list_hash = kmalloc(sizeof(*vnode_list_hash) * ncpus,
                                  M_VNODE_HASH, M_ZERO | M_WAITOK);
        for (i = 0; i < ncpus; ++i) {
                struct vnode_index *vi = &vnode_list_hash[i];

                TAILQ_INIT(&vi->inactive_list);
                TAILQ_INIT(&vi->active_list);
                TAILQ_INSERT_TAIL(&vi->active_list, &vi->active_rover, v_list);
                spin_init(&vi->spin, "vfslock");
        }
}

/*
 * Misc functions
 */
static __inline
void
_vsetflags(struct vnode *vp, int flags)
{
        atomic_set_int(&vp->v_flag, flags);
}

static __inline
void
_vclrflags(struct vnode *vp, int flags)
{
        atomic_clear_int(&vp->v_flag, flags);
}

void
vsetflags(struct vnode *vp, int flags)
{
        _vsetflags(vp, flags);
}

void
vclrflags(struct vnode *vp, int flags)
{
        _vclrflags(vp, flags);
}

/*
 * Place the vnode on the active list.
 *
 * Caller must hold vp->v_spin
 */
static __inline
void
_vactivate(struct vnode *vp)
{
        struct vnode_index *vi = &vnode_list_hash[VLIST_HASH(vp)];

#ifdef TRACKVNODE
        if ((u_long)vp == trackvnode)
                kprintf("_vactivate %p %08x\n", vp, vp->v_flag);
#endif
        spin_lock(&vi->spin);

        switch(vp->v_state) {
        case VS_ACTIVE:
                spin_unlock(&vi->spin);
                panic("_vactivate: already active");
                /* NOT REACHED */
                return;
        case VS_INACTIVE:
                TAILQ_REMOVE(&vi->inactive_list, vp, v_list);
                atomic_add_int(&mycpu->gd_inactivevnodes, -1);
                break;
        case VS_CACHED:
        case VS_DYING:
                break;
        }
        TAILQ_INSERT_TAIL(&vi->active_list, vp, v_list);
        vp->v_state = VS_ACTIVE;
        spin_unlock(&vi->spin);
        atomic_add_int(&mycpu->gd_activevnodes, 1);
}

/*
 * Put a vnode on the inactive list.
 *
 * Caller must hold v_spin
 */
static __inline
void
_vinactive(struct vnode *vp)
{
        struct vnode_index *vi = &vnode_list_hash[VLIST_HASH(vp)];

#ifdef TRACKVNODE
        if ((u_long)vp == trackvnode) {
                kprintf("_vinactive %p %08x\n", vp, vp->v_flag);
                print_backtrace(-1);
        }
#endif
        spin_lock(&vi->spin);

        /*
         * Remove from active list if it is sitting on it
         */
        switch(vp->v_state) {
        case VS_ACTIVE:
                TAILQ_REMOVE(&vi->active_list, vp, v_list);
                atomic_add_int(&mycpu->gd_activevnodes, -1);
                break;
        case VS_INACTIVE:
                spin_unlock(&vi->spin);
                panic("_vinactive: already inactive");
                /* NOT REACHED */
                return;
        case VS_CACHED:
        case VS_DYING:
                break;
        }

        /*
         * Distinguish between basically dead vnodes, vnodes with cached
         * data, and vnodes without cached data.  A rover will shift the
         * vnodes around as their cache status is lost.
         */
        if (vp->v_flag & VRECLAIMED) {
                TAILQ_INSERT_HEAD(&vi->inactive_list, vp, v_list);
        } else {
                TAILQ_INSERT_TAIL(&vi->inactive_list, vp, v_list);
        }
        vp->v_state = VS_INACTIVE;
        spin_unlock(&vi->spin);
        atomic_add_int(&mycpu->gd_inactivevnodes, 1);
}

/*
 * Add a ref to an active vnode.  This function should never be called
 * with an inactive vnode (use vget() instead), but might be called
 * with other states.
 */
void
vref(struct vnode *vp)
{
        KASSERT((VREFCNT(vp) > 0 && vp->v_state != VS_INACTIVE),
                ("vref: bad refcnt %08x %d", vp->v_refcnt, vp->v_state));
        atomic_add_int(&vp->v_refcnt, 1);
}

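/*
 * Usage sketch (illustrative): a caller that knows the vnode is active,
 * e.g. because it already holds a ref or a locked namecache entry, may
 * take an additional ref directly:
 *
 *      vref(vp);
 *      ... use vp ...
 *      vrele(vp);
 *
 * If the vnode may be inactive, vget()/vput() must be used instead.
 */

/*
 * As vref(), but also handles vnodes sitting at ref count 0: the 0->1
 * transition removes the vnode from the cached-vnode accounting, mirroring
 * the ->0 adjustments made in vrele() and vget().
 */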
void
vref_special(struct vnode *vp)
{
        if ((atomic_fetchadd_int(&vp->v_refcnt, 1) & VREF_MASK) == 0)
                atomic_add_int(&mycpu->gd_cachedvnodes, -1);
}

void
synchronizevnodecount(void)
{
        int nca = 0;
        int act = 0;
        int ina = 0;
        int i;

        for (i = 0; i < ncpus; ++i) {
                globaldata_t gd = globaldata_find(i);
                nca += gd->gd_cachedvnodes;
                act += gd->gd_activevnodes;
                ina += gd->gd_inactivevnodes;
        }
        cachedvnodes = nca;
        activevnodes = act;
        inactivevnodes = ina;
}

/*
 * Count number of cached vnodes.  This is moderately expensive so be
 * careful not to make this call in the critical path.  Each cpu tracks
 * its own accumulator.  The individual accumulators must be summed
 * together to get an accurate value.
 */
int
countcachedvnodes(void)
{
        int i;
        int n = 0;

        for (i = 0; i < ncpus; ++i) {
                globaldata_t gd = globaldata_find(i);
                n += gd->gd_cachedvnodes;
        }
        return n;
}

int
countcachedandinactivevnodes(void)
{
        int i;
        int n = 0;

        for (i = 0; i < ncpus; ++i) {
                globaldata_t gd = globaldata_find(i);
                n += gd->gd_cachedvnodes + gd->gd_inactivevnodes;
        }
        return n;
}

/*
 * Release a ref on an active or inactive vnode.
 *
 * Caller has no other requirements.
 *
 * If VREF_FINALIZE is set this will deactivate the vnode on the 1->0
 * transition, otherwise we leave the vnode in the active list and
 * do a lockless transition to 0, which is very important for the
 * critical path.
 *
 * (vrele() is not called when a vnode is being destroyed w/kfree)
 */
void
vrele(struct vnode *vp)
{
        int count;

#if 1
        count = vp->v_refcnt;
        cpu_ccfence();

        for (;;) {
                KKASSERT((count & VREF_MASK) > 0);
                KKASSERT(vp->v_state == VS_ACTIVE ||
                         vp->v_state == VS_INACTIVE);

                /*
                 * 2+ case
                 */
                if ((count & VREF_MASK) > 1) {
                        if (atomic_fcmpset_int(&vp->v_refcnt,
                                               &count, count - 1)) {
                                break;
                        }
                        continue;
                }

                /*
                 * 1->0 transition case must handle possible finalization.
                 * When finalizing we transition 1->0x40000000.  Note that
                 * cachedvnodes is only adjusted on transitions to ->0.
                 *
                 * WARNING! VREF_TERMINATE can be cleared at any point
                 *          when the refcnt is non-zero (by vget()) and
                 *          the vnode has not been reclaimed.  Thus
                 *          transitions out of VREF_TERMINATE do not have
                 *          to mess with cachedvnodes.
                 */
                if (count & VREF_FINALIZE) {
                        vx_lock(vp);
                        if (atomic_fcmpset_int(&vp->v_refcnt,
                                               &count, VREF_TERMINATE)) {
                                vnode_terminate(vp);
                                break;
                        }
                        vx_unlock(vp);
                } else {
                        if (atomic_fcmpset_int(&vp->v_refcnt, &count, 0)) {
                                atomic_add_int(&mycpu->gd_cachedvnodes, 1);
                                break;
                        }
                }
                cpu_pause();
                /* retry */
        }
#else
        /*
         * XXX NOT YET WORKING!  Multiple threads can reference the vnode
         * after dropping their count, racing destruction, because this
         * code is not directly transitioning from 1->VREF_FINALIZE.
         */
        /*
         * Drop the ref-count.  On the 1->0 transition we check VREF_FINALIZE
         * and attempt to acquire VREF_TERMINATE if set.  It is possible for
         * concurrent vref/vrele to race and bounce 0->1, 1->0, etc, but
         * only one will be able to transition the vnode into the
         * VREF_TERMINATE state.
         *
         * NOTE: VREF_TERMINATE is *in* VREF_MASK, so the vnode may only enter
         *       this state once.
         */
        count = atomic_fetchadd_int(&vp->v_refcnt, -1);
        if ((count & VREF_MASK) == 1) {
                atomic_add_int(&mycpu->gd_cachedvnodes, 1);
                --count;
                while ((count & (VREF_MASK | VREF_FINALIZE)) == VREF_FINALIZE) {
                        vx_lock(vp);
                        if (atomic_fcmpset_int(&vp->v_refcnt,
                                               &count, VREF_TERMINATE)) {
                                atomic_add_int(&mycpu->gd_cachedvnodes, -1);
                                vnode_terminate(vp);
                                break;
                        }
                        vx_unlock(vp);
                }
        }
#endif
}

/*
 * Add an auxiliary data structure reference to the vnode.  Auxiliary
 * references do not change the state of the vnode or prevent deactivation
 * or reclamation of the vnode, but will prevent the vnode from being
 * destroyed (kfree()'d).
 *
 * WARNING!  vhold() must not acquire v_spin.  The spinlock may or may not
 *           already be held by the caller.  vdrop() will clean up the
 *           free list state.
 */
void
vhold(struct vnode *vp)
{
        atomic_add_int(&vp->v_auxrefs, 1);
}

/*
 * Remove an auxiliary reference from the vnode.
 */
void
vdrop(struct vnode *vp)
{
        atomic_add_int(&vp->v_auxrefs, -1);
}
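
/*
 * Usage sketch (illustrative): an auxiliary structure caching a vnode
 * pointer can pin the memory without pinning the vnode's state:
 *
 *      vhold(vp);              (vp can no longer be kfree()'d)
 *      ... vp may still be deactivated or even reclaimed here ...
 *      vdrop(vp);
 *
 * Code that needs the vnode usable, not merely allocated, must hold a
 * real ref via vget()/vref() instead.
 */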

/*
 * Set VREF_FINALIZE to request that the vnode be inactivated
 * as soon as possible (on the 1->0 transition of its refs).
 *
 * Caller must have a ref on the vnode.
 *
 * This function has no effect if the vnode is already in termination
 * processing.
 */
void
vfinalize(struct vnode *vp)
{
        if ((vp->v_refcnt & VREF_MASK) > 0)
                atomic_set_int(&vp->v_refcnt, VREF_FINALIZE);
}
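
/*
 * Usage sketch (illustrative): forcing a vnode through deactivation when
 * the caller's ref is dropped, rather than leaving it cached:
 *
 *      vfinalize(vp);          (request finalization on the last ref)
 *      vrele(vp);              (1->0 transition runs vnode_terminate())
 *
 * vx_put() uses the same mechanism for VNON/VBAD vnodes further below.
 */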

/*
 * This function is called on the 1->0 transition (which is actually
 * 1->VREF_TERMINATE) when VREF_FINALIZE is set, forcing deactivation
 * of the vnode.
 *
 * Additional vrefs are allowed to race but will not result in a reentrant
 * call to vnode_terminate() due to refcnt being VREF_TERMINATE.  This
 * prevents additional 1->0 transitions.
 *
 * ONLY A VGET() CAN REACTIVATE THE VNODE.
 *
 * Caller must hold the VX lock.
 *
 * NOTE: v_mount may be NULL due to assignment to dead_vnode_vops
 *
 * NOTE: The vnode may be marked inactive with dirty buffers
 *       or dirty pages in its cached VM object still present.
 *
 * NOTE: VS_FREE should not be set on entry (the vnode was expected to
 *       previously be active).  We lose control of the vnode the instant
 *       it is placed on the free list.
 *
 *       The VX lock is required when transitioning to VS_CACHED but is
 *       not sufficient for the vshouldfree() interlocked test or when
 *       transitioning away from VS_CACHED.  v_spin is also required for
 *       those cases.
 */
static
void
vnode_terminate(struct vnode *vp)
{
        KKASSERT(vp->v_state == VS_ACTIVE);

        if ((vp->v_flag & VINACTIVE) == 0) {
                _vsetflags(vp, VINACTIVE);
                if (vp->v_mount)
                        VOP_INACTIVE(vp);
        }
        spin_lock(&vp->v_spin);
        _vinactive(vp);
        spin_unlock(&vp->v_spin);

        vx_unlock(vp);
}

/****************************************************************
 *                      VX LOCKING FUNCTIONS                    *
 ****************************************************************
 *
 * These functions lock vnodes for reclamation and deactivation related
 * activities.  The caller must already be holding some sort of reference
 * on the vnode.
 */
void
vx_lock(struct vnode *vp)
{
        lockmgr(&vp->v_lock, LK_EXCLUSIVE);
        spin_lock_update_only(&vp->v_spin);
}

void
vx_unlock(struct vnode *vp)
{
        spin_unlock_update_only(&vp->v_spin);
        lockmgr(&vp->v_lock, LK_RELEASE);
}

/*
 * Downgrades a VX lock to a normal VN lock.  The lock remains EXCLUSIVE.
 *
 * Generally required after calling getnewvnode() if the intention is
 * to return a normal locked vnode to the caller.
 */
void
vx_downgrade(struct vnode *vp)
{
        spin_unlock_update_only(&vp->v_spin);
}
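
/*
 * Usage sketch (illustrative): a filesystem returning a freshly allocated
 * vnode typically converts the VX lock into a normal exclusive vnode lock:
 *
 *      vp = allocvnode(0, 0);          (placeholder arguments; returns
 *                                       a VX locked, refd vnode)
 *      ... initialize vp->v_type, vp->v_data, etc ...
 *      vx_downgrade(vp);               (now a normal LK_EXCLUSIVE vn lock)
 *      return (vp);                    (caller eventually vput()'s it)
 */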

/****************************************************************
 *                 VNODE ACQUISITION FUNCTIONS                  *
 ****************************************************************
 *
 * These functions must be used when accessing a vnode that has no
 * chance of being destroyed in a SMP race.  That means the caller will
 * usually either hold an auxiliary reference (such as the namecache)
 * or hold some other lock that ensures that the vnode cannot be destroyed.
 *
 * These functions are MANDATORY for any code chain accessing a vnode
 * whose activation state is not known.
 *
 * vget() can be called with LK_NOWAIT and will return EBUSY if the
 * lock cannot be immediately acquired.
 *
 * vget()/vput() are used when reactivation is desired.
 *
 * vx_get() and vx_put() are used when reactivation is not desired.
 */
int
vget(struct vnode *vp, int flags)
{
        int error;

        /*
         * A lock type must be passed
         */
        if ((flags & LK_TYPE_MASK) == 0) {
                panic("vget() called with no lock specified!");
                /* NOT REACHED */
        }

        /*
         * Reference the structure and then acquire the lock.
         *
         * NOTE: The requested lock might be a shared lock and does
         *       not protect our access to the refcnt or other fields.
         */
        if ((atomic_fetchadd_int(&vp->v_refcnt, 1) & VREF_MASK) == 0)
                atomic_add_int(&mycpu->gd_cachedvnodes, -1);

        if ((error = vn_lock(vp, flags | LK_FAILRECLAIM)) != 0) {
                /*
                 * The lock failed, undo and return an error.  This will not
                 * normally trigger a termination.
                 */
                vrele(vp);
        } else if (vp->v_flag & VRECLAIMED) {
                /*
                 * The vnode is being reclaimed and cannot be reactivated
                 * any more, undo and return ENOENT.
                 */
                vn_unlock(vp);
                vrele(vp);
                error = ENOENT;
        } else if (vp->v_state == VS_ACTIVE) {
                /*
                 * A VS_ACTIVE vnode coupled with the fact that we have
                 * a vnode lock (even if shared) prevents v_state from
                 * changing.  Since the vnode is not in a VRECLAIMED state,
                 * we can safely clear VINACTIVE.
                 *
                 * It is possible for a shared lock to cause a race with
                 * another thread that is also in the process of clearing
                 * VREF_TERMINATE, meaning that we might return with it still
                 * set and then assert in a later vref().  The solution is to
                 * unconditionally clear VREF_TERMINATE here as well.
                 *
                 * NOTE! Multiple threads may clear VINACTIVE if this is
                 *       shared lock.  This race is allowed.
                 */
                if (vp->v_flag & VINACTIVE)
                        _vclrflags(vp, VINACTIVE);      /* SMP race ok */
                if (vp->v_act < VACT_MAX) {
                        vp->v_act += VACT_INC;
                        if (vp->v_act > VACT_MAX)       /* SMP race ok */
                                vp->v_act = VACT_MAX;
                }
                error = 0;
                if (vp->v_refcnt & VREF_TERMINATE)      /* SMP race ok */
                        atomic_clear_int(&vp->v_refcnt, VREF_TERMINATE);
        } else {
                /*
                 * If the vnode is not VS_ACTIVE it must be reactivated
                 * in addition to clearing VINACTIVE.  An exclusive spin_lock
                 * is needed to manipulate the vnode's list.
                 *
                 * Because the lockmgr lock might be shared, we might race
                 * another reactivation, which we handle.  In this situation,
                 * however, the refcnt prevents other v_state races.
                 *
                 * As with above, clearing VINACTIVE is allowed to race other
                 * clearings of VINACTIVE.
                 *
                 * VREF_TERMINATE and VREF_FINALIZE can only be cleared when
                 * the refcnt is non-zero and the vnode has not been
                 * reclaimed.  This also means that the transitions do
                 * not affect cachedvnodes.
                 *
                 * It is possible for a shared lock to cause a race with
                 * another thread that is also in the process of clearing
                 * VREF_TERMINATE, meaning that we might return with it still
                 * set and then assert in a later vref().  The solution is to
                 * unconditionally clear VREF_TERMINATE here as well.
                 */
                _vclrflags(vp, VINACTIVE);
                vp->v_act += VACT_INC;
                if (vp->v_act > VACT_MAX)       /* SMP race ok */
                        vp->v_act = VACT_MAX;
                spin_lock(&vp->v_spin);

                switch(vp->v_state) {
                case VS_INACTIVE:
                        _vactivate(vp);
                        atomic_clear_int(&vp->v_refcnt, VREF_TERMINATE |
                                                        VREF_FINALIZE);
                        spin_unlock(&vp->v_spin);
                        break;
                case VS_CACHED:
                        _vactivate(vp);
                        atomic_clear_int(&vp->v_refcnt, VREF_TERMINATE |
                                                        VREF_FINALIZE);
                        spin_unlock(&vp->v_spin);
                        break;
                case VS_ACTIVE:
                        atomic_clear_int(&vp->v_refcnt, VREF_FINALIZE |
                                                        VREF_TERMINATE);
                        spin_unlock(&vp->v_spin);
                        break;
                case VS_DYING:
                        spin_unlock(&vp->v_spin);
                        panic("Impossible VS_DYING state");
                        break;
                }
                error = 0;
        }
        return(error);
}
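
/*
 * Usage sketch (illustrative): safely activating a vnode found via an
 * auxiliary reference such as a namecache entry:
 *
 *      if (vget(vp, LK_SHARED) == 0) {
 *              ... vp is refd, locked, and VS_ACTIVE ...
 *              vput(vp);               (unlock + vrele in one call)
 *      }
 *
 * A non-zero return (e.g. ENOENT for a reclaimed vnode) means vget()
 * already dropped its ref and the caller must not touch vp further.
 */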

#ifdef DEBUG_VPUT

void
debug_vput(struct vnode *vp, const char *filename, int line)
{
        kprintf("vput(%p) %s:%d\n", vp, filename, line);
        vn_unlock(vp);
        vrele(vp);
}

#else

void
vput(struct vnode *vp)
{
        vn_unlock(vp);
        vrele(vp);
}

#endif

/*
 * Acquire the vnode lock unguarded.
 *
 * The non-blocking version also uses a slightly different mechanic.
 * This function will explicitly fail not only if it cannot acquire
 * the lock normally, but also if the caller already holds a lock.
 *
 * The adjusted mechanic is used to close a loophole where complex
 * VOP_RECLAIM code can circle around recursively and allocate the
 * same vnode it is trying to destroy from the freelist.
 *
 * Any filesystem (aka UFS) which puts LK_CANRECURSE in lk_flags can
 * cause the incorrect behavior to occur.  If not for that lockmgr()
 * would do the right thing.
 *
 * XXX The vx_*() locks should use auxrefs, not the main reference counter.
 */
void
vx_get(struct vnode *vp)
{
        if ((atomic_fetchadd_int(&vp->v_refcnt, 1) & VREF_MASK) == 0)
                atomic_add_int(&mycpu->gd_cachedvnodes, -1);
        lockmgr(&vp->v_lock, LK_EXCLUSIVE);
        spin_lock_update_only(&vp->v_spin);
}

int
vx_get_nonblock(struct vnode *vp)
{
        int error;

        if (lockinuse(&vp->v_lock))
                return(EBUSY);
        error = lockmgr(&vp->v_lock, LK_EXCLUSIVE | LK_NOWAIT);
        if (error == 0) {
                spin_lock_update_only(&vp->v_spin);
                if ((atomic_fetchadd_int(&vp->v_refcnt, 1) & VREF_MASK) == 0)
                        atomic_add_int(&mycpu->gd_cachedvnodes, -1);
        }
        return(error);
}

/*
 * Release a VX lock that also held a ref on the vnode.  vrele() will handle
 * any needed state transitions.
 *
 * However, filesystems use this function to get rid of unwanted new vnodes
 * so try to get the vnode on the correct queue in that case.
 */
void
vx_put(struct vnode *vp)
{
        if (vp->v_type == VNON || vp->v_type == VBAD)
                atomic_set_int(&vp->v_refcnt, VREF_FINALIZE);
        spin_unlock_update_only(&vp->v_spin);
        lockmgr(&vp->v_lock, LK_RELEASE);
        vrele(vp);
}
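
/*
 * Usage sketch (illustrative): the reclaim scans below use the non-blocking
 * form so a contended vnode is simply skipped instead of stalling the scan:
 *
 *      if (vx_get_nonblock(vp) == 0) {
 *              ... vp is VX locked and refd; safe to examine ...
 *              vx_put(vp);
 *      }
 *
 * vx_get_nonblock() only takes its ref when the lock succeeds, so no
 * cleanup is required on failure.
 */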

/*
 * Try to reuse a vnode from the free list.  This function is somewhat
 * advisory in that NULL can be returned as a normal case, even if free
 * vnodes are present.
 *
 * The scan is limited because it can result in excessive CPU use during
 * periods of extreme vnode use.
 *
 * NOTE: The returned vnode is not completely initialized.
 *       The returned vnode will be VX locked.
 */
static
struct vnode *
cleanfreevnode(int maxcount)
{
        struct vnode_index *vi;
        struct vnode *vp;
        int count;
        int trigger = (long)vmstats.v_page_count / (activevnodes * 2 + 1);
        int ri;
        int cpu_count;
        int cachedvnodes;

        /*
         * Try to deactivate some vnodes cached on the active list.  We
         * generally want a 50-50 balance active vs inactive.
         */
        cachedvnodes = countcachedvnodes();
        if (cachedvnodes < inactivevnodes)
                goto skip;

        ri = vnode_list_hash[mycpu->gd_cpuid].deac_rover + 1;

        for (count = 0; count < maxcount * 2; ++count, ++ri) {
                vi = &vnode_list_hash[((unsigned)ri >> 4) % ncpus];

                spin_lock(&vi->spin);

                vp = TAILQ_NEXT(&vi->active_rover, v_list);
                TAILQ_REMOVE(&vi->active_list, &vi->active_rover, v_list);
                if (vp == NULL) {
                        TAILQ_INSERT_HEAD(&vi->active_list,
                                          &vi->active_rover, v_list);
                } else {
                        TAILQ_INSERT_AFTER(&vi->active_list, vp,
                                           &vi->active_rover, v_list);
                }
                if (vp == NULL) {
                        spin_unlock(&vi->spin);
                        continue;
                }

                /*
                 * Don't try to deactivate if someone has the vp referenced.
                 */
                if ((vp->v_refcnt & VREF_MASK) != 0) {
                        spin_unlock(&vi->spin);
                        vp->v_act += VACT_INC;
                        if (vp->v_act > VACT_MAX)       /* SMP race ok */
                                vp->v_act = VACT_MAX;
                        continue;
                }

                /*
                 * Calculate the deactivation weight.  Reduce v_act less
                 * if the vnode's object has a lot of VM pages.
                 *
                 * XXX obj race
                 */
                if (vp->v_act > 0) {
                        vm_object_t obj;

                        if ((obj = vp->v_object) != NULL &&
                            obj->resident_page_count >= trigger)
                        {
                                vp->v_act -= 1;
                        } else {
                                vp->v_act -= VACT_INC;
                        }
                        if (vp->v_act < 0)
                                vp->v_act = 0;
                        spin_unlock(&vi->spin);
                        continue;
                }

                /*
                 * If v_auxrefs is not the expected value the vnode might
                 * reside in the namecache topology on an internal node and
                 * not at a leaf.  v_auxrefs can be wrong for other reasons,
                 * but this is the most likely.
                 *
                 * Such vnodes will not be recycled by vnlru later on in
                 * its inactive scan, so try to make the vnode presentable
                 * and only move it to the inactive queue if we can.
                 *
                 * On success, the vnode is disconnected from the namecache
                 * topology entirely, making vnodes above it in the topology
                 * recycleable.  This will allow the active scan to continue
                 * to make progress in balancing the active and inactive
                 * lists.
                 */
                if (vp->v_auxrefs != vp->v_namecache_count) {
                        if (vx_get_nonblock(vp) == 0) {
                                spin_unlock(&vi->spin);
                                if ((vp->v_refcnt & VREF_MASK) == 1)
                                        cache_inval_vp_quick(vp);
                                if (vp->v_auxrefs == vp->v_namecache_count)
                                        ++auxrecovervnodes1;
                                vx_put(vp);
                        } else {
                                spin_unlock(&vi->spin);
                        }
                        continue;
                }

                /*
                 * Try to deactivate the vnode.  It is ok if v_auxrefs
                 * races every once in a while, we just don't want an
                 * excess of unreclaimable vnodes on the inactive list.
                 */
                if ((atomic_fetchadd_int(&vp->v_refcnt, 1) & VREF_MASK) == 0)
                        atomic_add_int(&mycpu->gd_cachedvnodes, -1);
                atomic_set_int(&vp->v_refcnt, VREF_FINALIZE);

                spin_unlock(&vi->spin);
                vrele(vp);
        }

        vnode_list_hash[mycpu->gd_cpuid].deac_rover = ri;

skip:
        /*
         * Loop trying to lock the first vnode on the free list.
         * Cycle if we can't.
         */
        cpu_count = ncpus;
        ri = vnode_list_hash[mycpu->gd_cpuid].free_rover + 1;

        for (count = 0; count < maxcount; ++count, ++ri) {
                vi = &vnode_list_hash[((unsigned)ri >> 4) % ncpus];

                spin_lock(&vi->spin);

                vp = TAILQ_FIRST(&vi->inactive_list);
                if (vp == NULL) {
                        spin_unlock(&vi->spin);
                        if (--cpu_count == 0)
                                break;
                        ri = (ri + 16) & ~15;
                        --ri;
                        continue;
                }

                /*
                 * non-blocking vx_get will also ref the vnode on success.
                 */
                if (vx_get_nonblock(vp)) {
                        KKASSERT(vp->v_state == VS_INACTIVE);
                        TAILQ_REMOVE(&vi->inactive_list, vp, v_list);
                        TAILQ_INSERT_TAIL(&vi->inactive_list, vp, v_list);
                        spin_unlock(&vi->spin);
                        continue;
                }

                /*
                 * Because we are holding vi->spin the vnode should currently
                 * be inactive and VREF_TERMINATE should still be set.
                 *
                 * Once vi->spin is released the vnode's state should remain
                 * unmodified due to both the lock and ref on it.
                 */
                KKASSERT(vp->v_state == VS_INACTIVE);
                spin_unlock(&vi->spin);
#ifdef TRACKVNODE
                if ((u_long)vp == trackvnode)
                        kprintf("cleanfreevnode %p %08x\n", vp, vp->v_flag);
#endif

                /*
                 * The active scan already did this, but some leakage can
                 * happen.  Don't let an easily recycleable vnode go to
                 * waste!
                 */
                if (vp->v_auxrefs != vp->v_namecache_count &&
                    (vp->v_refcnt & ~VREF_FINALIZE) == VREF_TERMINATE + 1)
                {
                        cache_inval_vp_quick(vp);
                        if (vp->v_auxrefs == vp->v_namecache_count)
                                ++auxrecovervnodes2;
                }

                /*
                 * Do not reclaim/reuse a vnode while auxiliary refs exist.
                 * This includes namecache refs due to a related ncp being
                 * locked or having children, a VM object association, or
                 * other hold users.
                 *
                 * Do not reclaim/reuse a vnode if someone else has a real
                 * ref on it.  This can occur if a filesystem temporarily
                 * releases the vnode lock during VOP_RECLAIM.
                 */
                if (vp->v_auxrefs != vp->v_namecache_count ||
                    (vp->v_refcnt & ~VREF_FINALIZE) != VREF_TERMINATE + 1) {
failed:
                        if (vp->v_state == VS_INACTIVE) {
                                spin_lock(&vi->spin);
                                if (vp->v_state == VS_INACTIVE) {
                                        TAILQ_REMOVE(&vi->inactive_list,
                                                     vp, v_list);
                                        TAILQ_INSERT_TAIL(&vi->inactive_list,
                                                          vp, v_list);
                                }
                                spin_unlock(&vi->spin);
                        }
                        vx_put(vp);
                        continue;
                }

                /*
                 * VINACTIVE and VREF_TERMINATE are expected to both be set
                 * for vnodes pulled from the inactive list, and cannot be
                 * changed while we hold the vx lock.
                 *
                 * Try to reclaim the vnode.
                 *
                 * The cache_inval_vp() can fail if any of the namecache
                 * elements are actively locked, preventing the vnode from
                 * being reclaimed.  This is the desired operation as it
                 * gives the namecache code certain guarantees just by
                 * holding a ncp.
                 */
                KKASSERT(vp->v_flag & VINACTIVE);
                KKASSERT(vp->v_refcnt & VREF_TERMINATE);

                if ((vp->v_flag & VRECLAIMED) == 0) {
                        if (cache_inval_vp_nonblock(vp))
                                goto failed;
                        vgone_vxlocked(vp);
                        /* vnode is still VX locked */
                }

                /*
                 * At this point if there are no other refs or auxrefs on
                 * the vnode with the inactive list locked, and we remove
                 * the vnode from the inactive list, it should not be
                 * possible for anyone else to access the vnode any more.
                 *
                 * Since the vnode is in a VRECLAIMED state, no new
                 * namecache associations could have been made and the
                 * vnode should have already been removed from its mountlist.
                 *
                 * Since we hold a VX lock on the vnode it cannot have been
                 * reactivated (moved out of the inactive list).
                 */
                KKASSERT(TAILQ_EMPTY(&vp->v_namecache));
                spin_lock(&vi->spin);
                if (vp->v_auxrefs ||
                    (vp->v_refcnt & ~VREF_FINALIZE) != VREF_TERMINATE + 1) {
                        spin_unlock(&vi->spin);
                        goto failed;
                }
                KKASSERT(vp->v_state == VS_INACTIVE);
                TAILQ_REMOVE(&vi->inactive_list, vp, v_list);
                atomic_add_int(&mycpu->gd_inactivevnodes, -1);
                vp->v_state = VS_DYING;
                spin_unlock(&vi->spin);

                /*
                 * Nothing should have been able to access this vp.  Only
                 * our ref should remain now.
                 */
                atomic_clear_int(&vp->v_refcnt, VREF_TERMINATE|VREF_FINALIZE);
                KASSERT(vp->v_refcnt == 1,
                        ("vp %p badrefs %08x", vp, vp->v_refcnt));

                /*
                 * Return a VX locked vnode suitable for reuse.
                 */
                vnode_list_hash[mycpu->gd_cpuid].free_rover = ri;
                return(vp);
        }
        vnode_list_hash[mycpu->gd_cpuid].free_rover = ri;
        return(NULL);
}

/*
 * Obtain a new vnode.  The returned vnode is VX locked & vrefd.
 *
 * All new vnodes set the VAGE flags.  An open() of the vnode will
 * decrement the (2-bit) flags.  Vnodes which are opened several times
 * are thus retained in the cache over vnodes which are merely stat()d.
 *
 * We attempt to reuse an already-recycled vnode from our pcpu inactive
 * queue first, and allocate otherwise.  Attempting to recycle inactive
 * vnodes here can lead to numerous deadlocks, particularly with
 * softupdates.
 */
struct vnode *
allocvnode(int lktimeout, int lkflags)
{
        struct vnode *vp;
        struct vnode_index *vi;

        /*
         * lktimeout only applies when LK_TIMELOCK is used, and only
         * the pageout daemon uses it.  The timeout may not be zero
         * or the pageout daemon can deadlock in low-VM situations.
         */
        if (lktimeout == 0)
                lktimeout = hz / 10;

        /*
         * Do not flag for synchronous recyclement unless there are enough
         * freeable vnodes to recycle and the number of vnodes has
         * significantly exceeded our target.  We want the normal vnlru
         * process to handle the cleaning (at 9/10's) before we are forced
         * to flag it here at 11/10's for userexit path processing.
         */
        if (numvnodes >= maxvnodes * 11 / 10 &&
            cachedvnodes + inactivevnodes >= maxvnodes * 5 / 10) {
                struct thread *td = curthread;
                if (td->td_lwp)
                        atomic_set_int(&td->td_lwp->lwp_mpflags, LWP_MP_VNLRU);
        }

        /*
         * Try to trivially reuse a reclaimed vnode from the head of the
         * inactive list for this cpu.  Any vnode cycling which occurs
         * which terminates the vnode will cause it to be returned to the
         * same pcpu structure (e.g. unlink calls).
         */
        vi = &vnode_list_hash[mycpuid];
        spin_lock(&vi->spin);

        vp = TAILQ_FIRST(&vi->inactive_list);
        if (vp && (vp->v_flag & VRECLAIMED)) {
                /*
                 * non-blocking vx_get will also ref the vnode on success.
                 */
                if (vx_get_nonblock(vp)) {
                        KKASSERT(vp->v_state == VS_INACTIVE);
                        TAILQ_REMOVE(&vi->inactive_list, vp, v_list);
                        TAILQ_INSERT_TAIL(&vi->inactive_list, vp, v_list);
                        spin_unlock(&vi->spin);
                        goto slower;
                }

                /*
                 * Because we are holding vi->spin the vnode should currently
                 * be inactive and VREF_TERMINATE should still be set.
                 *
                 * Once vi->spin is released the vnode's state should remain
                 * unmodified due to both the lock and ref on it.
                 */
                KKASSERT(vp->v_state == VS_INACTIVE);
#ifdef TRACKVNODE
                if ((u_long)vp == trackvnode)
                        kprintf("allocvnode %p %08x\n", vp, vp->v_flag);
#endif

                /*
                 * Do not reclaim/reuse a vnode while auxiliary refs exist.
                 * This includes namecache refs due to a related ncp being
                 * locked or having children, a VM object association, or
                 * other hold users.
                 *
                 * Do not reclaim/reuse a vnode if someone else has a real
                 * ref on it.  This can occur if a filesystem temporarily
                 * releases the vnode lock during VOP_RECLAIM.
                 */
                if (vp->v_auxrefs ||
                    (vp->v_refcnt & ~VREF_FINALIZE) != VREF_TERMINATE + 1) {
                        if (vp->v_state == VS_INACTIVE) {
                                TAILQ_REMOVE(&vi->inactive_list,
                                             vp, v_list);
                                TAILQ_INSERT_TAIL(&vi->inactive_list,
                                                  vp, v_list);
                        }
                        spin_unlock(&vi->spin);
                        vx_put(vp);
                        goto slower;
                }

                /*
                 * VINACTIVE and VREF_TERMINATE are expected to both be set
                 * for vnodes pulled from the inactive list, and cannot be
                 * changed while we hold the vx lock.
                 *
                 * Try to reclaim the vnode.
                 */
                KKASSERT(vp->v_flag & VINACTIVE);
                KKASSERT(vp->v_refcnt & VREF_TERMINATE);

                if ((vp->v_flag & VRECLAIMED) == 0) {
                        spin_unlock(&vi->spin);
                        vx_put(vp);
                        goto slower;
                }

                /*
                 * At this point if there are no other refs or auxrefs on
                 * the vnode with the inactive list locked, and we remove
                 * the vnode from the inactive list, it should not be
                 * possible for anyone else to access the vnode any more.
                 *
                 * Since the vnode is in a VRECLAIMED state, no new
                 * namecache associations could have been made and the
                 * vnode should have already been removed from its mountlist.
                 *
                 * Since we hold a VX lock on the vnode it cannot have been
                 * reactivated (moved out of the inactive list).
                 */
                KKASSERT(TAILQ_EMPTY(&vp->v_namecache));
                KKASSERT(vp->v_state == VS_INACTIVE);
                TAILQ_REMOVE(&vi->inactive_list, vp, v_list);
                atomic_add_int(&mycpu->gd_inactivevnodes, -1);
                vp->v_state = VS_DYING;
                spin_unlock(&vi->spin);

                /*
                 * Nothing should have been able to access this vp.  Only
                 * our ref should remain now.
                 *
                 * At this point we can kfree() the vnode if we want to.
                 * Instead, we reuse it for the allocation.
                 */
                atomic_clear_int(&vp->v_refcnt, VREF_TERMINATE|VREF_FINALIZE);
                KASSERT(vp->v_refcnt == 1,
                        ("vp %p badrefs %08x", vp, vp->v_refcnt));
                vx_unlock(vp);          /* safety: keep the API clean */
                bzero(vp, sizeof(*vp));
        } else {
                spin_unlock(&vi->spin);
slower:
                vp = kmalloc_obj(sizeof(*vp), M_VNODE, M_ZERO | M_WAITOK);
                atomic_add_int(&numvnodes, 1);
        }

        lwkt_token_init(&vp->v_token, "vnode");
        lockinit(&vp->v_lock, "vnode", lktimeout, lkflags);
        TAILQ_INIT(&vp->v_namecache);
        RB_INIT(&vp->v_rbclean_tree);
        RB_INIT(&vp->v_rbdirty_tree);
        RB_INIT(&vp->v_rbhash_tree);
        spin_init(&vp->v_spin, "allocvnode");

        vx_lock(vp);
        vp->v_refcnt = 1;
        vp->v_flag = VAGE0 | VAGE1;
        vp->v_pbuf_count = nswbuf_kva / NSWBUF_SPLIT;

        KKASSERT(TAILQ_EMPTY(&vp->v_namecache));
        /* exclusive lock still held */

        vp->v_filesize = NOOFFSET;
        vp->v_type = VNON;
        vp->v_tag = 0;
        vp->v_state = VS_CACHED;
        _vactivate(vp);

        return (vp);
}
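
/*
 * Worked example (illustrative): with maxvnodes = 100000 the recycling
 * thresholds layer as follows: vnlru begins background recycling at
 * 90000 vnodes (9/10's), allocvnode() flags the lwp for userexit
 * processing at 110000 (11/10's), and allocvnode_gc() below frees
 * batchfreevnodes vnodes per call while numvnodes remains at or above
 * 100000 with at least 50000 cached+inactive vnodes available.
 */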

/*
 * Called after a process has allocated a vnode via allocvnode()
 * and we detected that too many vnodes were present.
 *
 * This function is called just prior to a return to userland if the
 * process at some point had to allocate a new vnode during the last
 * system call and the vnode count was found to be excessive.
 *
 * This is a synchronous path that we do not normally want to execute.
 *
 * Flagged at >= 11/10's, runs if >= 10/10, vnlru runs at 9/10.
 *
 * WARNING: Sometimes numvnodes can blow out due to children being
 *          present under directory vnodes in the namecache.  For the
 *          moment use an if() instead of a while() and note that if
 *          we were to use a while() we would still have to break out
 *          if freesomevnodes() returned 0.  vnlru will also be trying
 *          hard to free vnodes at the same time (with a lower trigger
 *          point).
 */
void
allocvnode_gc(void)
{
        if (numvnodes >= maxvnodes &&
            countcachedandinactivevnodes() >= maxvnodes * 5 / 10)
        {
                freesomevnodes(batchfreevnodes);
        }
}

int
freesomevnodes(int n)
{
        struct vnode *vp;
        int count = 0;

        while (n) {
                if ((vp = cleanfreevnode(n)) == NULL)
                        break;
                vx_unlock(vp);
                --n;
                ++count;
                kfree_obj(vp, M_VNODE);
                atomic_add_int(&numvnodes, -1);
        }
        return(count);
}