From: Matthew Dillon
Date: Sat, 8 Dec 2012 02:52:30 +0000 (-0800)
Subject: kernel - Change allocvnode() to not recursively block freeing vnodes
X-Git-Tag: v3.2.2~9
X-Git-Url: https://gitweb.dragonflybsd.org/dragonfly.git/commitdiff_plain/3dc54cbd317ca29e0cd0e5b0ed07668ca3307280

kernel - Change allocvnode() to not recursively block freeing vnodes

allocvnode() has caused many deadlock issues over the years, including
recent issues with softupdates, because it is often called from deep within
VFS modules and attempts to clean and free unrelated vnodes when the vnode
limit is reached, in order to make room for the new one.

* numvnodes is not protected by any locks and needs atomic ops.

* Change allocvnode() to always allocate and not attempt to free other
  vnodes.

* allocvnode() now flags the LWP to handle reducing the number of vnodes
  in the system when it returns to userland instead.  Consolidate several
  flags into a single conditional function call, lwpuserret().

  When triggered, this code will do a limited scan of the free list to try
  to find vnodes to free.

* The vnlru_proc_wait() code existed to handle a separate algorithm related
  to vnodes with cached buffers and VM pages, but represented a major
  bottleneck in the system.  Remove vnlru_proc_wait() and allow vnodes with
  buffers and/or non-empty VM objects to be placed on the free list.

  This also requires not vhold()ing the vnode for related buffer cache
  buffers, since the vnode will not go away until the related buffers have
  been cleaned out.  We shouldn't need those holds.

Testing-by: vsrinivas
---

diff --git a/sys/kern/kern_proc.c b/sys/kern/kern_proc.c
index 3af2cd30a6..3dea3c09a8 100644
--- a/sys/kern/kern_proc.c
+++ b/sys/kern/kern_proc.c
@@ -42,6 +42,7 @@
 #include
 #include
 #include
+#include
 #include
 #include
 #include
@@ -851,6 +852,26 @@ proc_remove_zombie(struct proc *p)
        lwkt_reltoken(&proc_token);
 }
 
+/*
+ * Handle various requirements prior to returning to usermode.  Called from
+ * platform trap and system call code.
+ */
+void
+lwpuserret(struct lwp *lp)
+{
+       struct proc *p = lp->lwp_proc;
+
+       if (lp->lwp_mpflags & LWP_MP_VNLRU) {
+               atomic_clear_int(&lp->lwp_mpflags, LWP_MP_VNLRU);
+               allocvnode_gc();
+       }
+       if (lp->lwp_mpflags & LWP_MP_WEXIT) {
+               lwkt_gettoken(&p->p_token);
+               lwp_exit(0);
+               lwkt_reltoken(&p->p_token);     /* NOT REACHED */
+       }
+}
+
 /*
  * Scan all processes on the allproc list.  The process is automatically
  * held for the callback.  A return value of -1 terminates the loop.
diff --git a/sys/kern/vfs_lock.c b/sys/kern/vfs_lock.c
index ea48949904..e5394fb739 100644
--- a/sys/kern/vfs_lock.c
+++ b/sys/kern/vfs_lock.c
@@ -95,6 +95,9 @@ SYSCTL_INT(_debug, OID_AUTO, freevnodes, CTLFLAG_RD,
 static int wantfreevnodes = 25;
 SYSCTL_INT(_debug, OID_AUTO, wantfreevnodes, CTLFLAG_RW,
                &wantfreevnodes, 0, "Desired number of free vnodes");
+static int batchfreevnodes = 5;
+SYSCTL_INT(_debug, OID_AUTO, batchfreevnodes, CTLFLAG_RW,
+               &batchfreevnodes, 0, "Number of vnodes to free at once");
 #ifdef TRACKVNODE
 static ulong trackvnode;
 SYSCTL_ULONG(_debug, OID_AUTO, trackvnode, CTLFLAG_RW,
@@ -244,6 +247,11 @@ __vfreetail(struct vnode *vp)
  * This routine is only valid if the vnode is already either VFREE or
  * VCACHED, or if it can become VFREE or VCACHED via vnode_terminate().
  *
+ * WARNING! We used to indicate FALSE if the vnode had an object with
+ *          resident pages but we no longer do that because it makes
+ *          managing kern.maxvnodes difficult.  Instead we rely on vfree()
+ *          to place the vnode properly on the list.
+ *
  * WARNING! This functions is typically called with v_spin held.
  *
  * MPSAFE
@@ -251,8 +259,10 @@ __vfreetail(struct vnode *vp)
  */
 static __inline boolean_t
 vshouldfree(struct vnode *vp)
 {
-       return (vp->v_auxrefs == 0 &&
-               (vp->v_object == NULL || vp->v_object->resident_page_count == 0));
+       return (vp->v_auxrefs == 0);
+#if 0
+        && (vp->v_object == NULL || vp->v_object->resident_page_count == 0));
+#endif
 }
 
 /*
@@ -646,7 +656,7 @@ vx_put(struct vnode *vp)
  */
 static void
-vnode_rover_locked(void)
+vnode_free_rover_scan_locked(void)
 {
        struct vnode *vp;
 
@@ -679,7 +689,6 @@ vnode_rover_locked(void)
        if (vp->v_object && vp->v_object->resident_page_count) {
                /*
                 * Promote vnode with resident pages to section 3.
-                * (This case shouldn't happen).
                 */
                if (rover_state == ROVER_MID1) {
                        TAILQ_REMOVE(&vnode_free_list, vp, v_freelist);
@@ -702,24 +711,37 @@ vnode_rover_locked(void)
        }
 }
 
+void
+vnode_free_rover_scan(int count)
+{
+       spin_lock(&vfs_spin);
+       while (count > 0) {
+               --count;
+               vnode_free_rover_scan_locked();
+       }
+       spin_unlock(&vfs_spin);
+}
+
 /*
- * Try to reuse a vnode from the free list.
+ * Try to reuse a vnode from the free list.  This function is somewhat
+ * advisory in that NULL can be returned as a normal case, even if free
+ * vnodes are present.
  *
- * NOTE: The returned vnode is not completely initialized.
+ * The scan is limited because it can result in excessive CPU use during
+ * periods of extreme vnode use.
  *
- * WARNING: The freevnodes count can race, NULL can be returned even if
- *          freevnodes != 0.
+ * NOTE: The returned vnode is not completely initialized.
  *
  * MPSAFE
  */
 static struct vnode *
-allocfreevnode(void)
+allocfreevnode(int maxcount)
 {
        struct vnode *vp;
        int count;
 
-       for (count = 0; count < freevnodes; count++) {
+       for (count = 0; count < maxcount; count++) {
                /*
                 * Try to lock the first vnode on the free list.
                 * Cycle if we can't.
@@ -730,15 +752,17 @@ allocfreevnode(void)
                 * vhold here.
                 */
                spin_lock(&vfs_spin);
-               vnode_rover_locked();
-               vnode_rover_locked();
+               vnode_free_rover_scan_locked();
+               vnode_free_rover_scan_locked();
                vp = TAILQ_FIRST(&vnode_free_list);
                while (vp == &vnode_free_mid1 || vp == &vnode_free_mid2 ||
                       vp == &vnode_free_rover) {
                        vp = TAILQ_NEXT(vp, v_freelist);
                }
-               if (vp == NULL)
+               if (vp == NULL) {
+                       spin_unlock(&vfs_spin);
                        break;
+               }
                if (vx_lock_nonblock(vp)) {
                        KKASSERT(vp->v_flag & VFREE);
                        TAILQ_REMOVE(&vnode_free_list, vp, v_freelist);
@@ -764,7 +788,8 @@ allocfreevnode(void)
                /*
                 * Do not reclaim/reuse a vnode while auxillary refs exists.
                 * This includes namecache refs due to a related ncp being
-                * locked or having children.
+                * locked or having children, a VM object association, or
+                * other hold users.
                 *
                 * We will make this test several times as auxrefs can
                 * get incremented on us without any spinlocks being held
@@ -846,14 +871,14 @@ allocfreevnode(void)
 }
 
 /*
- * Obtain a new vnode from the freelist, allocating more if necessary.
- * The returned vnode is VX locked & vrefd.
+ * Obtain a new vnode.  The returned vnode is VX locked & vrefd.
  *
  * All new vnodes set the VAGE flags.  An open() of the vnode will
  * decrement the (2-bit) flags.  Vnodes which are opened several times
  * are thus retained in the cache over vnodes which are merely stat()d.
  *
- * MPSAFE
+ * We always allocate the vnode.  Attempting to recycle existing vnodes
+ * here can lead to numerous deadlocks, particularly with softupdates.
  */
 struct vnode *
 allocvnode(int lktimeout, int lkflags)
@@ -861,34 +886,18 @@ allocvnode(int lktimeout, int lkflags)
        struct vnode *vp;
 
        /*
-        * Try to reuse vnodes if we hit the max.  This situation only
-        * occurs in certain large-memory (2G+) situations.  We cannot
-        * attempt to directly reclaim vnodes due to nasty recursion
-        * problems.
+        * Do not flag for recyclement unless there are enough free vnodes
+        * to recycle and the number of vnodes has exceeded our target.
         */
-       while (numvnodes - freevnodes > desiredvnodes)
-               vnlru_proc_wait();
-
-       /*
-        * Try to build up as many vnodes as we can before reallocating
-        * from the free list.  A vnode on the free list simply means
-        * that it is inactive with no resident pages.  It may or may not
-        * have been reclaimed and could have valuable information associated
-        * with it that we shouldn't throw away unless we really need to.
-        *
-        * HAMMER NOTE: Re-establishing a vnode is a fairly expensive
-        * operation for HAMMER but this should benefit UFS as well.
-        */
-       if (freevnodes >= wantfreevnodes && numvnodes >= desiredvnodes)
-               vp = allocfreevnode();
-       else
-               vp = NULL;
-       if (vp == NULL) {
-               vp = sysref_alloc(&vnode_sysref_class);
-               KKASSERT((vp->v_flag & (VCACHED|VFREE)) == 0);
-               lockmgr(&vp->v_lock, LK_EXCLUSIVE);
-               numvnodes++;
+       if (freevnodes >= wantfreevnodes && numvnodes >= desiredvnodes) {
+               struct thread *td = curthread;
+               if (td->td_lwp)
+                       atomic_set_int(&td->td_lwp->lwp_mpflags, LWP_MP_VNLRU);
        }
+       vp = sysref_alloc(&vnode_sysref_class);
+       KKASSERT((vp->v_flag & (VCACHED|VFREE)) == 0);
+       lockmgr(&vp->v_lock, LK_EXCLUSIVE);
+       atomic_add_int(&numvnodes, 1);
 
        /*
        * We are using a managed sysref class, vnode fields are only
@@ -950,6 +959,32 @@ allocvnode(int lktimeout, int lkflags)
        return (vp);
 }
 
+/*
+ * Called after a process has allocated a vnode via allocvnode()
+ * and we detected that too many vnodes were present.
+ *
+ * Try to reuse vnodes if we hit the max.  This situation only
+ * occurs in certain large-memory (2G+) situations on 32 bit systems,
+ * or if kern.maxvnodes is set to very low values.
+ *
+ * This function is called just prior to a return to userland if the
+ * process at some point had to allocate a new vnode during the last
+ * system call and the vnode count was found to be excessive.
+ *
+ * WARNING: Sometimes numvnodes can blow out due to children being
+ *          present under directory vnodes in the namecache.  For the
+ *          moment use an if() instead of a while() and note that if
+ *          we were to use a while() we would still have to break out
+ *          if freesomevnodes() returned 0.
+ */
+void
+allocvnode_gc(void)
+{
+       if (numvnodes > desiredvnodes && freevnodes > wantfreevnodes) {
+               freesomevnodes(batchfreevnodes);
+       }
+}
+
 /*
  * MPSAFE
  */
@@ -960,11 +995,12 @@ freesomevnodes(int n)
        int count = 0;
 
        while (n) {
-               --n;
-               if ((vp = allocfreevnode()) == NULL)
+               if ((vp = allocfreevnode(n * 2)) == NULL)
                        break;
+               --n;
+               ++count;
                vx_put(vp);
-               --numvnodes;
+               atomic_add_int(&numvnodes, -1);
        }
        return(count);
 }
diff --git a/sys/kern/vfs_mount.c b/sys/kern/vfs_mount.c
index e596cc3c38..11c2a1ef22 100644
--- a/sys/kern/vfs_mount.c
+++ b/sys/kern/vfs_mount.c
@@ -679,18 +679,6 @@ vlrureclaim(struct mount *mp, void *data)
  * interesting deadlock problems.
  */
 static struct thread *vnlruthread;
-static int vnlruproc_sig;
-
-void
-vnlru_proc_wait(void)
-{
-       tsleep_interlock(&vnlruproc_sig, 0);
-       if (vnlruproc_sig == 0) {
-               vnlruproc_sig = 1;      /* avoid unnecessary wakeups */
-               wakeup(vnlruthread);
-       }
-       tsleep(&vnlruproc_sig, PINTERLOCKED, "vlruwk", hz);
-}
 
 static void
 vnlru_proc(void)
@@ -705,6 +693,16 @@ vnlru_proc(void)
        for (;;) {
                kproc_suspend_loop();
 
+               /*
+                * Do some opportunistic roving.
+                */
+               if (numvnodes > 100000)
+                       vnode_free_rover_scan(50);
+               else if (numvnodes > 10000)
+                       vnode_free_rover_scan(20);
+               else
+                       vnode_free_rover_scan(5);
+
                /*
                 * Try to free some vnodes if we have too many
                 */
@@ -724,8 +722,6 @@ vnlru_proc(void)
                 * the free list.
                 */
                if (numvnodes - freevnodes <= desiredvnodes * 9 / 10) {
-                       vnlruproc_sig = 0;
-                       wakeup(&vnlruproc_sig);
                        tsleep(vnlruthread, 0, "vlruwt", hz);
                        continue;
                }
diff --git a/sys/kern/vfs_subr.c b/sys/kern/vfs_subr.c
index e6a0024746..f8ae7c32c0 100644
--- a/sys/kern/vfs_subr.c
+++ b/sys/kern/vfs_subr.c
@@ -919,7 +919,7 @@ bgetvp(struct vnode *vp, struct buf *bp, int testsize)
        bp->b_flags |= B_VNCLEAN;
        if (buf_rb_tree_RB_INSERT(&vp->v_rbclean_tree, bp))
                panic("reassignbuf: dup lblk/clean vp %p bp %p", vp, bp);
-       vhold(vp);
+       /*vhold(vp);*/
        lwkt_reltoken(&vp->v_token);
        return(0);
 }
@@ -958,7 +958,7 @@ brelvp(struct buf *bp)
 
        lwkt_reltoken(&vp->v_token);
 
-       vdrop(vp);
+       /*vdrop(vp);*/
 }
 
 /*
diff --git a/sys/platform/pc32/i386/trap.c b/sys/platform/pc32/i386/trap.c
index 5e429d81e0..c5ea096414 100644
--- a/sys/platform/pc32/i386/trap.c
+++ b/sys/platform/pc32/i386/trap.c
@@ -250,13 +250,11 @@ userret(struct lwp *lp, struct trapframe *frame, int sticks)
 recheck:
 
        /*
-        * If the jungle wants us dead, so be it.
+        * Specific on-return-to-usermode checks (LWP_MP_WEXIT,
+        * LWP_MP_VNLRU, etc).
         */
-       if (lp->lwp_mpflags & LWP_MP_WEXIT) {
-               lwkt_gettoken(&p->p_token);
-               lwp_exit(0);
-               lwkt_reltoken(&p->p_token);     /* NOT REACHED */
-       }
+       if (lp->lwp_mpflags & LWP_MP_URETMASK)
+               lwpuserret(lp);
 
        /*
        * Block here if we are in a stopped state.
diff --git a/sys/platform/pc64/x86_64/trap.c b/sys/platform/pc64/x86_64/trap.c
index 0e373cfd9b..90e83cfa76 100644
--- a/sys/platform/pc64/x86_64/trap.c
+++ b/sys/platform/pc64/x86_64/trap.c
@@ -239,13 +239,11 @@ userret(struct lwp *lp, struct trapframe *frame, int sticks)
 recheck:
 
        /*
-        * If the jungle wants us dead, so be it.
+        * Specific on-return-to-usermode checks (LWP_MP_WEXIT,
+        * LWP_MP_VNLRU, etc).
         */
-       if (lp->lwp_mpflags & LWP_MP_WEXIT) {
-               lwkt_gettoken(&p->p_token);
-               lwp_exit(0);
-               lwkt_reltoken(&p->p_token);     /* NOT REACHED */
-       }
+       if (lp->lwp_mpflags & LWP_MP_URETMASK)
+               lwpuserret(lp);
 
        /*
        * Block here if we are in a stopped state.
diff --git a/sys/platform/vkernel/i386/trap.c b/sys/platform/vkernel/i386/trap.c
index 40ac62d052..03bbb6d3ee 100644
--- a/sys/platform/vkernel/i386/trap.c
+++ b/sys/platform/vkernel/i386/trap.c
@@ -225,13 +225,11 @@ userret(struct lwp *lp, struct trapframe *frame, int sticks)
 recheck:
 
        /*
-        * If the jungle wants us dead, so be it.
+        * Specific on-return-to-usermode checks (LWP_MP_WEXIT,
+        * LWP_MP_VNLRU, etc).
         */
-       if (lp->lwp_mpflags & LWP_MP_WEXIT) {
-               lwkt_gettoken(&p->p_token);
-               lwp_exit(0);
-               lwkt_reltoken(&p->p_token);     /* NOT REACHED */
-       }
+       if (lp->lwp_mpflags & LWP_MP_URETMASK)
+               lwpuserret(lp);
 
        /*
        * Block here if we are in a stopped state.
diff --git a/sys/platform/vkernel64/x86_64/trap.c b/sys/platform/vkernel64/x86_64/trap.c
index a0cd7188e1..fd196a4a61 100644
--- a/sys/platform/vkernel64/x86_64/trap.c
+++ b/sys/platform/vkernel64/x86_64/trap.c
@@ -225,13 +225,11 @@ userret(struct lwp *lp, struct trapframe *frame, int sticks)
 recheck:
 
        /*
-        * If the jungle wants us dead, so be it.
+        * Specific on-return-to-usermode checks (LWP_MP_WEXIT,
+        * LWP_MP_VNLRU, etc).
         */
-       if (lp->lwp_mpflags & LWP_MP_WEXIT) {
-               lwkt_gettoken(&p->p_token);
-               lwp_exit(0);
-               lwkt_reltoken(&p->p_token);     /* NOT REACHED */
-       }
+       if (lp->lwp_mpflags & LWP_MP_URETMASK)
+               lwpuserret(lp);
 
        /*
        * Block here if we are in a stopped state.
diff --git a/sys/sys/proc.h b/sys/sys/proc.h
index 27d5cf503d..ed18513678 100644
--- a/sys/sys/proc.h
+++ b/sys/sys/proc.h
@@ -398,10 +398,13 @@ struct proc {
  * thread has actually exited.
  */
 #define LWP_MP_ONRUNQ  0x0000001 /* on a user scheduling run queue */
-#define LWP_MP_WEXIT   0x0000002 /* working on exiting */
+#define LWP_MP_WEXIT   0x0000002 /* working on exiting (lwpuserret()) */
 #define LWP_MP_WSTOP   0x0000004 /* working on stopping */
 #define LWP_MP_ULOAD   0x0000008 /* uload accounting for current cpu */
 #define LWP_MP_RRFORCE 0x0000010 /* forced resched due to rrcount */
+#define LWP_MP_VNLRU   0x0000020 /* check excessive vnode allocations (lwpuserret()) */
+
+#define LWP_MP_URETMASK        (LWP_MP_WEXIT | LWP_MP_VNLRU)
 
 #define FIRST_LWP_IN_PROC(p)           RB_FIRST(lwp_rb_tree, &(p)->p_lwp_tree)
 #define FOREACH_LWP_IN_PROC(lp, p)     \
@@ -566,6 +569,7 @@ void prele (struct proc *);
 int    pholdzomb (struct proc *);
 void   prelezomb (struct proc *);
 void   pstall (struct proc *, const char *, int);
+void   lwpuserret(struct lwp *);
 
 u_int32_t procrunnable (void);
 
diff --git a/sys/sys/vnode.h b/sys/sys/vnode.h
index 7a116319eb..c5ea644c35 100644
--- a/sys/sys/vnode.h
+++ b/sys/sys/vnode.h
@@ -401,6 +401,8 @@ int v_associate_rdev(struct vnode *vp, cdev_t dev);
 void   v_release_rdev(struct vnode *vp);
 int    bdevvp (cdev_t dev, struct vnode **vpp);
 struct vnode *allocvnode(int lktimeout, int lkflags);
+void   allocvnode_gc(void);
+void   vnode_free_rover_scan(int count);
 int    freesomevnodes(int count);
 int    getnewvnode (enum vtagtype tag, struct mount *mp,
                struct vnode **vpp, int timo, int lkflags);
@@ -550,8 +552,6 @@ void vn_syncer_thr_create(struct mount *);
 void   *vn_syncer_thr_getctx(struct mount *);
 void   vn_syncer_thr_stop(void *);
 
-void   vnlru_proc_wait(void);
-
 extern struct vop_ops default_vnode_vops;
 extern struct vop_ops dead_vnode_vops;
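
The scheme described in the commit message (allocate unconditionally, flag the LWP, and defer a bounded cleanup to the return-to-userland path) can be illustrated outside the kernel.  Below is a minimal standalone C sketch, not DragonFly code: the names numobjs, td_flags, DESIRED_OBJS, BATCH_FREE, FLAG_GC_PENDING, alloc_obj() and gc_hook() are illustrative stand-ins for numvnodes, lwp_mpflags, desiredvnodes, batchfreevnodes, LWP_MP_VNLRU, allocvnode() and the lwpuserret()/allocvnode_gc() hook.

/*
 * Standalone sketch (illustrative only) of deferred reclamation:
 * the allocation path never recurses into freeing; it only sets a
 * per-thread flag, and a bounded cleanup runs later at a safe point.
 */
#include <stdatomic.h>
#include <stdio.h>

#define DESIRED_OBJS    8       /* stand-in for desiredvnodes   */
#define BATCH_FREE      2       /* stand-in for batchfreevnodes */
#define FLAG_GC_PENDING 0x1     /* stand-in for LWP_MP_VNLRU    */

static atomic_int numobjs;              /* like numvnodes: atomic, no lock */
static _Thread_local int td_flags;      /* like lwp_mpflags: per-thread    */

/* Allocate unconditionally; if over the target, only flag the thread. */
static void
alloc_obj(void)
{
        if (atomic_load(&numobjs) >= DESIRED_OBJS)
                td_flags |= FLAG_GC_PENDING;
        atomic_fetch_add(&numobjs, 1);
}

/* Bounded, deferred cleanup; the analogue of allocvnode_gc(). */
static void
gc_hook(void)
{
        int i;

        if ((td_flags & FLAG_GC_PENDING) == 0)
                return;
        td_flags &= ~FLAG_GC_PENDING;
        for (i = 0; i < BATCH_FREE && atomic_load(&numobjs) > DESIRED_OBJS; ++i)
                atomic_fetch_sub(&numobjs, 1);  /* stands in for freesomevnodes() */
}

int
main(void)
{
        int i;

        for (i = 0; i < 20; ++i) {
                alloc_obj();    /* never blocks or recurses to reclaim */
                gc_hook();      /* the "return to userland" safe point */
        }
        printf("after 20 allocations, numobjs = %d\n", atomic_load(&numobjs));
        return 0;
}

As in the commit, all freeing happens in small batches at a well-defined safe point, so code holding VFS locks deep inside allocvnode() is never asked to recursively clean out unrelated vnodes.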