From e83d5c121c899edad9e4cab9fee4465c6d39d04c Mon Sep 17 00:00:00 2001 From: Matthew Dillon Date: Mon, 19 Mar 2018 18:39:21 -0700 Subject: [PATCH] kernel - Attempt to fix high vnlru cpu use * In certain situations _cache_cleanpos() gets into a livelock of some sort. It is limited by a counter but the livelock creates havoc when it happens. * It appears that _cache_cleanpos() can get into a situation where the head element of all available nchashtbl[] slots is not freeable (e.g. due to having multiple refs). When this occurs, trailing elements in the chained hash table cannot be recycled. nchashtbl[] is typically very large, so this situation does not happen very often. There are usually plenty of entries to pick away at. However, it looks like situations can develop where enough entries get into this masked state that the hysteresis is unable to complete, resulting in a cpubound loop. * Refactor the chained list from LIST to TAILQ and modify the _cache_cleanpos() function to cycle ncp's to the end of the list before trying to zap them. If the zap fails, the next iteration will encounter a different head. Reported-by: ftigeot --- sys/kern/vfs_cache.c | 52 +++++++++++++++++++++++++++++--------------- sys/sys/namecache.h | 2 +- 2 files changed, 35 insertions(+), 19 deletions(-) diff --git a/sys/kern/vfs_cache.c b/sys/kern/vfs_cache.c index 3d70b5081c..c01afe7945 100644 --- a/sys/kern/vfs_cache.c +++ b/sys/kern/vfs_cache.c @@ -124,7 +124,7 @@ MALLOC_DEFINE(M_VFSCACHE, "vfscache", "VFS name cache entries"); -LIST_HEAD(nchash_list, namecache); +TAILQ_HEAD(nchash_list, namecache); /* * Don't cachealign, but at least pad to 32 bytes so entries @@ -909,7 +909,7 @@ _cache_link_parent(struct namecache *ncp, struct namecache *par, /* * Add to hash table and parent, adjust accounting */ - LIST_INSERT_HEAD(&nchpp->list, ncp, nc_hash); + TAILQ_INSERT_HEAD(&nchpp->list, ncp, nc_hash); atomic_add_long(&pn->vfscache_count, 1); if (TAILQ_EMPTY(&ncp->nc_list)) atomic_add_long(&pn->vfscache_leafs, 1); @@ -952,7 +952,7 @@ _cache_unlink_parent(struct namecache *ncp) /* * Remove from hash table and parent, adjust accounting */ - LIST_REMOVE(ncp, nc_hash); + TAILQ_REMOVE(&ncp->nc_head->list, ncp, nc_hash); TAILQ_REMOVE(&par->nc_list, ncp, nc_entry); atomic_add_long(&pn->vfscache_count, -1); if (TAILQ_EMPTY(&ncp->nc_list)) @@ -2817,7 +2817,7 @@ cache_zap(struct namecache *ncp, int nonblock) struct pcpu_ncache *pn = &pcpu_ncache[mycpu->gd_cpuid]; KKASSERT(nchpp == ncp->nc_head); - LIST_REMOVE(ncp, nc_hash); + TAILQ_REMOVE(&ncp->nc_head->list, ncp, nc_hash); TAILQ_REMOVE(&par->nc_list, ncp, nc_entry); atomic_add_long(&pn->vfscache_count, -1); if (TAILQ_EMPTY(&ncp->nc_list)) @@ -3037,7 +3037,7 @@ restart: else spin_lock_shared(&nchpp->spin); - LIST_FOREACH(ncp, &nchpp->list, nc_hash) { + TAILQ_FOREACH(ncp, &nchpp->list, nc_hash) { /* * Break out if we find a matching entry. Note that * UNRESOLVED entries may match, but DESTROYED entries @@ -3181,7 +3181,7 @@ cache_nlookup_maybe_shared(struct nchandle *par_nch, struct nlcomponent *nlc, spin_lock_shared(&nchpp->spin); - LIST_FOREACH(ncp, &nchpp->list, nc_hash) { + TAILQ_FOREACH(ncp, &nchpp->list, nc_hash) { /* * Break out if we find a matching entry. Note that * UNRESOLVED entries may match, but DESTROYED entries @@ -3263,7 +3263,7 @@ cache_nlookup_nonblock(struct nchandle *par_nch, struct nlcomponent *nlc) nchpp = NCHHASH(hash); restart: spin_lock(&nchpp->spin); - LIST_FOREACH(ncp, &nchpp->list, nc_hash) { + TAILQ_FOREACH(ncp, &nchpp->list, nc_hash) { /* * Break out if we find a matching entry. Note that * UNRESOLVED entries may match, but DESTROYED entries @@ -3891,13 +3891,29 @@ _cache_cleanpos(long count) cpu_ccfence(); nchpp = NCHHASH(rover_copy); - spin_lock_shared(&nchpp->spin); - ncp = LIST_FIRST(&nchpp->list); + if (TAILQ_FIRST(&nchpp->list) == NULL) { + --count; + continue; + } + + /* + * Cycle ncp on list, ignore and do not move DESTROYED + * ncps (which might be dummies). + * + * We must cycle the ncp to the end of the list to + * ensure that all ncp's have an equal chance of + * being removed. + */ + spin_lock(&nchpp->spin); + ncp = TAILQ_FIRST(&nchpp->list); while (ncp && (ncp->nc_flag & NCF_DESTROYED)) - ncp = LIST_NEXT(ncp, nc_hash); - if (ncp) + ncp = TAILQ_NEXT(ncp, nc_hash); + if (ncp) { + TAILQ_REMOVE(&nchpp->list, ncp, nc_hash); + TAILQ_INSERT_TAIL(&nchpp->list, ncp, nc_hash); _cache_hold(ncp); - spin_unlock_shared(&nchpp->spin); + } + spin_unlock(&nchpp->spin); if (ncp) { if (_cache_lock_special(ncp) == 0) { @@ -3937,13 +3953,13 @@ _cache_cleandefered(void) nchpp = &nchashtbl[i]; spin_lock(&nchpp->spin); - LIST_INSERT_HEAD(&nchpp->list, &dummy, nc_hash); + TAILQ_INSERT_HEAD(&nchpp->list, &dummy, nc_hash); ncp = &dummy; - while ((ncp = LIST_NEXT(ncp, nc_hash)) != NULL) { + while ((ncp = TAILQ_NEXT(ncp, nc_hash)) != NULL) { if ((ncp->nc_flag & NCF_DEFEREDZAP) == 0) continue; - LIST_REMOVE(&dummy, nc_hash); - LIST_INSERT_AFTER(ncp, &dummy, nc_hash); + TAILQ_REMOVE(&nchpp->list, &dummy, nc_hash); + TAILQ_INSERT_AFTER(&nchpp->list, ncp, &dummy, nc_hash); _cache_hold(ncp); spin_unlock(&nchpp->spin); if (_cache_lock_nonblock(ncp) == 0) { @@ -3954,7 +3970,7 @@ _cache_cleandefered(void) spin_lock(&nchpp->spin); ncp = &dummy; } - LIST_REMOVE(&dummy, nc_hash); + TAILQ_REMOVE(&nchpp->list, &dummy, nc_hash); spin_unlock(&nchpp->spin); } } @@ -3995,7 +4011,7 @@ nchinit(void) sizeof(struct nchash_head), M_VFSCACHE, &nchash); for (i = 0; i <= (int)nchash; ++i) { - LIST_INIT(&nchashtbl[i].list); + TAILQ_INIT(&nchashtbl[i].list); spin_init(&nchashtbl[i].spin, "nchinit_hash"); } for (i = 0; i < NCMOUNT_NUMCACHE; ++i) diff --git a/sys/sys/namecache.h b/sys/sys/namecache.h index 8632e7aaba..80733c9708 100644 --- a/sys/sys/namecache.h +++ b/sys/sys/namecache.h @@ -113,7 +113,7 @@ TAILQ_HEAD(namecache_list, namecache); * operations vector is typically obtained via nc_mount->mnt_vn_use_ops. */ struct namecache { - LIST_ENTRY(namecache) nc_hash; /* hash chain (nc_parent,name) */ + TAILQ_ENTRY(namecache) nc_hash; /* hash chain (nc_parent,name) */ TAILQ_ENTRY(namecache) nc_entry; /* scan via nc_parent->nc_list */ TAILQ_ENTRY(namecache) nc_vnode; /* scan via vnode->v_namecache */ struct namecache_list nc_list; /* list of children */ -- 2.41.0