From 8edfbc5ea8601645770484df0358fa4799ab68ef Mon Sep 17 00:00:00 2001 From: Matthew Dillon Date: Mon, 5 Dec 2016 09:15:44 -0800 Subject: [PATCH] kernel - more kmalloc and nlookup performance optimizations * Give the pcpu counters in struct malloc_type their own cache line per cpu. This removes a large kmalloc/kfree bottleneck on multi-socket systems * Avoid having to ref, lock, and GETATTR intermediate directory components in nlookup() by adding the NCF_WXOK flag. This flag is set in the ncp when the directory permissions are at least 555. This saves significant overhead in all situations, including single-threaded. Discussed-with: Mateusz Guzik (mjg_) --- sys/kern/kern_slaballoc.c | 24 +++++------ sys/kern/vfs_cache.c | 83 +++++++++++++++++++++++++++++++++++++++ sys/kern/vfs_nlookup.c | 74 ++++++++++++++++++++++++++++------ sys/kern/vfs_syscalls.c | 1 + sys/sys/malloc.h | 12 ++++-- sys/sys/namecache.h | 6 ++- sys/sys/nlookup.h | 2 +- usr.bin/vmstat/vmstat.c | 22 ++++++++--- 8 files changed, 190 insertions(+), 34 deletions(-) diff --git a/sys/kern/kern_slaballoc.c b/sys/kern/kern_slaballoc.c index 39c7aaa204..1b04d151c8 100644 --- a/sys/kern/kern_slaballoc.c +++ b/sys/kern/kern_slaballoc.c @@ -373,7 +373,7 @@ malloc_uninit(void *data) * negative or positive (canceling each other out). */ for (i = ttl = 0; i < ncpus; ++i) - ttl += type->ks_memuse[i]; + ttl += type->ks_use[i].memuse; if (ttl) { kprintf("malloc_uninit: %ld bytes of '%s' still allocated on cpu %d\n", ttl, type->ks_shortdesc, i); @@ -658,14 +658,14 @@ kmalloc(unsigned long size, struct malloc_type *type, int flags) * * ks_loosememuse is an up-only limit that is NOT MP-synchronized, used * to determine if a more complete limit check should be done. The - * actual memory use is tracked via ks_memuse[cpu]. + * actual memory use is tracked via ks_use[cpu].memuse. */ while (type->ks_loosememuse >= type->ks_limit) { int i; long ttl; for (i = ttl = 0; i < ncpus; ++i) - ttl += type->ks_memuse[i]; + ttl += type->ks_use[i].memuse; type->ks_loosememuse = ttl; /* not MP synchronized */ if ((ssize_t)ttl < 0) /* deal with occassional race */ ttl = 0; @@ -935,8 +935,8 @@ kmalloc(unsigned long size, struct malloc_type *type, int flags) } done: - ++type->ks_inuse[gd->gd_cpuid]; - type->ks_memuse[gd->gd_cpuid] += size; + ++type->ks_use[gd->gd_cpuid].inuse; + type->ks_use[gd->gd_cpuid].memuse += size; type->ks_loosememuse += size; /* not MP synchronized */ crit_exit(); @@ -1244,8 +1244,8 @@ kfree(void *ptr, struct malloc_type *type) * primarily until we can fix softupdate's assumptions about free(). */ crit_enter(); - --type->ks_inuse[gd->gd_cpuid]; - type->ks_memuse[gd->gd_cpuid] -= size; + --type->ks_use[gd->gd_cpuid].inuse; + type->ks_use[gd->gd_cpuid].memuse -= size; if (mycpu->gd_intr_nesting_level || (gd->gd_curthread->td_flags & TDF_INTTHREAD)) { @@ -1289,13 +1289,13 @@ kfree(void *ptr, struct malloc_type *type) if (z->z_CpuGd != gd) { /* * Making these adjustments now allow us to avoid passing (type) - * to the remote cpu. Note that ks_inuse/ks_memuse is being + * to the remote cpu. Note that inuse/memuse is being * adjusted on OUR cpu, not the zone cpu, but it should all still * sum up properly and cancel out. 
*/ crit_enter(); - --type->ks_inuse[gd->gd_cpuid]; - type->ks_memuse[gd->gd_cpuid] -= z->z_ChunkSize; + --type->ks_use[gd->gd_cpuid].inuse; + type->ks_use[gd->gd_cpuid].memuse -= z->z_ChunkSize; crit_exit(); /* @@ -1402,8 +1402,8 @@ kfree(void *ptr, struct malloc_type *type) TAILQ_INSERT_HEAD(&slgd->ZoneAry[z->z_ZoneIndex], z, z_Entry); } - --type->ks_inuse[z->z_Cpu]; - type->ks_memuse[z->z_Cpu] -= z->z_ChunkSize; + --type->ks_use[z->z_Cpu].inuse; + type->ks_use[z->z_Cpu].memuse -= z->z_ChunkSize; check_zone_free(slgd, z); logmemory_quick(free_end); diff --git a/sys/kern/vfs_cache.c b/sys/kern/vfs_cache.c index 0354a17318..ccdd405f6e 100644 --- a/sys/kern/vfs_cache.c +++ b/sys/kern/vfs_cache.c @@ -278,6 +278,7 @@ struct mntcache { struct mount *mntary[MNTCACHE_COUNT]; struct namecache *ncp1; struct namecache *ncp2; + struct nchandle ncdir; int iter; int unused01; } __cachealign; @@ -354,6 +355,16 @@ cache_clearmntcache(void) if (ncp) _cache_drop(ncp); } + if (cache->ncdir.ncp) { + ncp = atomic_swap_ptr((void *)&cache->ncdir.ncp, NULL); + if (ncp) + _cache_drop(ncp); + } + if (cache->ncdir.mount) { + mp = atomic_swap_ptr((void *)&cache->ncdir.mount, NULL); + if (mp) + atomic_add_int(&mp->mnt_refs, -1); + } } } @@ -1032,6 +1043,40 @@ cache_copy(struct nchandle *nch, struct nchandle *target) } } +/* + * Caller wants to copy the current directory, copy it out from our + * pcpu cache if possible (the entire critical path is just two localized + * cmpset ops). If the pcpu cache has a snapshot at all it will be a + * valid one, so we don't have to lock p->p_fd even though we are loading + * two fields. + * + * This has a limited effect since nlookup must still ref and shlock the + * vnode to check perms. We do avoid the per-proc spin-lock though, which + * can aid threaded programs. + */ +void +cache_copy_ncdir(struct proc *p, struct nchandle *target) +{ + struct mntcache *cache = &pcpu_mntcache[mycpu->gd_cpuid]; + + *target = p->p_fd->fd_ncdir; + if (target->ncp == cache->ncdir.ncp && + target->mount == cache->ncdir.mount) { + if (atomic_cmpset_ptr((void *)&cache->ncdir.ncp, + target->ncp, NULL)) { + if (atomic_cmpset_ptr((void *)&cache->ncdir.mount, + target->mount, NULL)) { + /* CRITICAL PATH */ + return; + } + _cache_drop(target->ncp); + } + } + spin_lock_shared(&p->p_fd->fd_spin); + cache_copy(&p->p_fd->fd_ncdir, target); + spin_unlock_shared(&p->p_fd->fd_spin); +} + void cache_changemount(struct nchandle *nch, struct mount *mp) { @@ -1082,6 +1127,26 @@ done: nch->mount = NULL; } +/* + * We are dropping what the caller believes is the current directory, + * unconditionally store it in our pcpu cache. Anything already in + * the cache will be discarded. + */ +void +cache_drop_ncdir(struct nchandle *nch) +{ + struct mntcache *cache = &pcpu_mntcache[mycpu->gd_cpuid]; + + nch->ncp = atomic_swap_ptr((void *)&cache->ncdir.ncp, nch->ncp); + nch->mount = atomic_swap_ptr((void *)&cache->ncdir.mount, nch->mount); + if (nch->ncp) + _cache_drop(nch->ncp); + if (nch->mount) + _cache_mntrel(nch->mount); + nch->ncp = NULL; + nch->mount = NULL; +} + int cache_lockstatus(struct nchandle *nch) { @@ -1865,6 +1930,24 @@ done: return(TAILQ_FIRST(&vp->v_namecache) != NULL); } +/* + * Clears the universal directory search 'ok' flag. This flag allows + * nlookup() to bypass normal vnode checks. This flag is a cached flag + * so clearing it simply forces revalidation. 
+ */ +void +cache_inval_wxok(struct vnode *vp) +{ + struct namecache *ncp; + + spin_lock(&vp->v_spin); + TAILQ_FOREACH(ncp, &vp->v_namecache, nc_vnode) { + if (ncp->nc_flag & NCF_WXOK) + atomic_clear_short(&ncp->nc_flag, NCF_WXOK); + } + spin_unlock(&vp->v_spin); +} + /* * The source ncp has been renamed to the target ncp. Both fncp and tncp * must be locked. The target ncp is destroyed (as a normal rename-over diff --git a/sys/kern/vfs_nlookup.c b/sys/kern/vfs_nlookup.c index daf07a713d..3230cdf5a3 100644 --- a/sys/kern/vfs_nlookup.c +++ b/sys/kern/vfs_nlookup.c @@ -113,12 +113,12 @@ nlookup_init(struct nlookupdata *nd, if (error == 0) { if (p && p->p_fd) { - cache_copy(&p->p_fd->fd_ncdir, &nd->nl_nch); + cache_copy_ncdir(p, &nd->nl_nch); cache_copy(&p->p_fd->fd_nrdir, &nd->nl_rootnch); if (p->p_fd->fd_njdir.ncp) cache_copy(&p->p_fd->fd_njdir, &nd->nl_jailnch); nd->nl_cred = td->td_ucred; - nd->nl_flags |= NLC_BORROWCRED; + nd->nl_flags |= NLC_BORROWCRED | NLC_NCDIR; } else { cache_copy(&rootnch, &nd->nl_nch); cache_copy(&nd->nl_nch, &nd->nl_rootnch); @@ -172,7 +172,12 @@ nlookup_init_at(struct nlookupdata *nd, struct file **fpp, int fd, error = ENOTDIR; goto done; } - cache_drop(&nd->nl_nch); + if (nd->nl_flags & NLC_NCDIR) { + cache_drop_ncdir(&nd->nl_nch); + nd->nl_flags &= ~NLC_NCDIR; + } else { + cache_drop(&nd->nl_nch); + } cache_copy(&fp->f_nchandle, &nd->nl_nch); *fpp = fp; } @@ -308,7 +313,12 @@ nlookup_done(struct nlookupdata *nd) nd->nl_flags &= ~NLC_NCPISLOCKED; cache_unlock(&nd->nl_nch); } - cache_drop(&nd->nl_nch); /* NULL's out the nch */ + if (nd->nl_flags & NLC_NCDIR) { + cache_drop_ncdir(&nd->nl_nch); + nd->nl_flags &= ~NLC_NCDIR; + } else { + cache_drop(&nd->nl_nch); /* NULL's out the nch */ + } } if (nd->nl_rootnch.ncp) cache_drop_and_cache(&nd->nl_rootnch); @@ -467,7 +477,7 @@ nlookup(struct nlookupdata *nd) int len; int dflags; int hit = 1; - int saveflag = nd->nl_flags; + int saveflag = nd->nl_flags & ~NLC_NCDIR; boolean_t doretry = FALSE; boolean_t inretry = FALSE; @@ -517,7 +527,12 @@ nlookup_start: cache_unlock(&nd->nl_nch); cache_get_maybe_shared(&nd->nl_rootnch, &nch, wantsexcllock(nd, ptr)); - cache_drop(&nd->nl_nch); + if (nd->nl_flags & NLC_NCDIR) { + cache_drop_ncdir(&nd->nl_nch); + nd->nl_flags &= ~NLC_NCDIR; + } else { + cache_drop(&nd->nl_nch); + } nd->nl_nch = nch; /* remains locked */ /* @@ -904,7 +919,12 @@ double_break: * element is a directory. */ if (*ptr && (nch.ncp->nc_flag & NCF_ISDIR)) { - cache_drop(&nd->nl_nch); + if (nd->nl_flags & NLC_NCDIR) { + cache_drop_ncdir(&nd->nl_nch); + nd->nl_flags &= ~NLC_NCDIR; + } else { + cache_drop(&nd->nl_nch); + } cache_unlock(&nch); KKASSERT((nd->nl_flags & NLC_NCPISLOCKED) == 0); nd->nl_nch = nch; @@ -956,7 +976,12 @@ double_break: break; } } - cache_drop(&nd->nl_nch); + if (nd->nl_flags & NLC_NCDIR) { + cache_drop_ncdir(&nd->nl_nch); + nd->nl_flags &= ~NLC_NCDIR; + } else { + cache_drop(&nd->nl_nch); + } nd->nl_nch = nch; nd->nl_flags |= NLC_NCPISLOCKED; error = 0; @@ -978,7 +1003,8 @@ double_break: */ if (doretry && !inretry) { inretry = TRUE; - nd->nl_flags = saveflag; + nd->nl_flags &= NLC_NCDIR; + nd->nl_flags |= saveflag; goto nlookup_start; } @@ -1096,6 +1122,9 @@ fail: * The passed ncp must be referenced and locked. If it is already resolved * it may be locked shared but otherwise should be locked exclusively. 
*/ + +#define S_WXOK_MASK (S_IRUSR|S_IXUSR|S_IRGRP|S_IXGRP|S_IROTH|S_IXOTH) + static int naccess(struct nchandle *nch, int nflags, struct ucred *cred, int *nflagsp) { @@ -1148,6 +1177,16 @@ naccess(struct nchandle *nch, int nflags, struct ucred *cred, int *nflagsp) if (error == 0 && (nflags & NLC_EXCL) && ncp->nc_vp != NULL) error = EEXIST; + /* + * Try to short-cut the vnode operation for intermediate directory + * components. This is a major SMP win because it avoids having + * to execute a lot of code for intermediate directory components, + * including shared refs and locks on intermediate directory vnodes. + */ + if (error == 0 && nflags == NLC_EXEC && (ncp->nc_flag & NCF_WXOK)) { + return 0; + } + /* * Get the vnode attributes so we can do the rest of our checks. * @@ -1226,6 +1265,19 @@ naccess(struct nchandle *nch, int nflags, struct ucred *cred, int *nflagsp) *nflagsp |= NLC_IMMUTABLE; } + /* + * NCF_WXOK can be set for world-searchable directories. + * + * XXX When we implement capabilities this code would also + * need a cap check, or only set the flag if there are no + * capabilities. + */ + cflags = 0; + if (va.va_type == VDIR && + (va.va_mode & S_WXOK_MASK) == S_WXOK_MASK) { + cflags |= NCF_WXOK; + } + /* * Track swapcache management flags in the namecache. * @@ -1234,7 +1286,6 @@ naccess(struct nchandle *nch, int nflags, struct ucred *cred, int *nflagsp) * (the original cache linkage may have occurred without * getattrs and thus have stale flags). */ - cflags = 0; if (va.va_flags & SF_NOCACHE) cflags |= NCF_SF_NOCACHE; if (va.va_flags & UF_CACHE) @@ -1259,7 +1310,8 @@ naccess(struct nchandle *nch, int nflags, struct ucred *cred, int *nflagsp) */ atomic_clear_short(&ncp->nc_flag, (NCF_SF_NOCACHE | NCF_UF_CACHE | - NCF_SF_PNOCACHE | NCF_UF_PCACHE) & ~cflags); + NCF_SF_PNOCACHE | NCF_UF_PCACHE | + NCF_WXOK) & ~cflags); atomic_set_short(&ncp->nc_flag, cflags); /* diff --git a/sys/kern/vfs_syscalls.c b/sys/kern/vfs_syscalls.c index 5dba7909b6..bbde8f42d3 100644 --- a/sys/kern/vfs_syscalls.c +++ b/sys/kern/vfs_syscalls.c @@ -3176,6 +3176,7 @@ setfmode(struct vnode *vp, int mode) VATTR_NULL(&vattr); vattr.va_mode = mode & ALLPERMS; error = VOP_SETATTR(vp, &vattr, td->td_ucred); + cache_inval_wxok(vp); vput(vp); } return error; diff --git a/sys/sys/malloc.h b/sys/sys/malloc.h index d027303a00..a7f3bd4ca9 100644 --- a/sys/sys/malloc.h +++ b/sys/sys/malloc.h @@ -101,13 +101,17 @@ * SMP_MAXCPU is used so modules which use malloc remain compatible * between UP and SMP. 
*/ +struct malloc_use { + size_t memuse; + size_t inuse; +} __cachealign; + struct malloc_type { struct malloc_type *ks_next; /* next in list */ - size_t ks_memuse[SMP_MAXCPU]; /* total memory held in bytes */ size_t ks_loosememuse; /* (inaccurate) aggregate memuse */ size_t ks_limit; /* most that are allowed to exist */ long ks_size; /* sizes of this thing that are allocated */ - size_t ks_inuse[SMP_MAXCPU]; /* # of allocs currently in use */ + struct malloc_use ks_use[SMP_MAXCPU]; __int64_t ks_calls; /* total packets of this type ever allocated */ long ks_maxused; /* maximum number ever used */ __uint32_t ks_magic; /* if it's not magic, don't touch it */ @@ -122,7 +126,7 @@ typedef struct malloc_type *malloc_type_t; #if defined(_KERNEL) || defined(_KERNEL_STRUCTURES) #define MALLOC_DEFINE(type, shortdesc, longdesc) \ struct malloc_type type[1] = { \ - { NULL, { 0 }, 0, 0, 0, { 0 }, 0, 0, M_MAGIC, shortdesc, \ + { NULL, 0, 0, 0, { { 0, 0 } }, 0, 0, M_MAGIC, shortdesc, \ 0, 0, { 0 } } \ }; \ SYSINIT(type##_init, SI_BOOT1_KMALLOC, SI_ORDER_ANY, \ @@ -132,7 +136,7 @@ typedef struct malloc_type *malloc_type_t; #else #define MALLOC_DEFINE(type, shortdesc, longdesc) \ struct malloc_type type[1] = { \ - { NULL, { 0 }, 0, 0, 0, { 0 }, 0, 0, M_MAGIC, shortdesc, 0, 0 } \ + { NULL, 0, 0, 0, { { 0, 0 } }, 0, 0, M_MAGIC, shortdesc, 0, 0 } \ } #endif diff --git a/sys/sys/namecache.h b/sys/sys/namecache.h index 0890370bdc..8a6f95907a 100644 --- a/sys/sys/namecache.h +++ b/sys/sys/namecache.h @@ -143,7 +143,7 @@ struct nchandle { }; /* - * Flags in namecache.nc_flag (u_char) + * Flags in namecache.nc_flag (u_short) */ #define NCF_UNUSED01 0x0001 #define NCF_WHITEOUT 0x0002 /* negative entry corresponds to whiteout */ @@ -157,6 +157,7 @@ struct nchandle { #define NCF_ISDIR 0x0200 /* represents a directory */ #define NCF_DESTROYED 0x0400 /* name association is considered destroyed */ #define NCF_DEFEREDZAP 0x0800 /* zap defered due to lock unavailability */ +#define NCF_WXOK 0x1000 /* world-searchable (nlookup shortcut) */ #define NC_EXLOCK_REQ 0x80000000 /* nc_lockstatus state flag */ #define NC_SHLOCK_REQ 0x40000000 /* nc_lockstatus state flag */ @@ -202,6 +203,7 @@ void cache_unmounting(struct mount *mp); int cache_inval(struct nchandle *nch, int flags); int cache_inval_vp(struct vnode *vp, int flags); int cache_inval_vp_nonblock(struct vnode *vp); +void cache_inval_wxok(struct vnode *vp); void vfs_cache_setroot(struct vnode *vp, struct nchandle *nch); int cache_resolve(struct nchandle *nch, struct ucred *cred); @@ -214,10 +216,12 @@ void cache_get_maybe_shared(struct nchandle *nch, struct nchandle *target, int excl); struct nchandle *cache_hold(struct nchandle *nch); void cache_copy(struct nchandle *nch, struct nchandle *target); +void cache_copy_ncdir(struct proc *p, struct nchandle *target); void cache_changemount(struct nchandle *nch, struct mount *mp); void cache_put(struct nchandle *nch); void cache_drop(struct nchandle *nch); void cache_drop_and_cache(struct nchandle *nch); +void cache_drop_ncdir(struct nchandle *nch); void cache_zero(struct nchandle *nch); void cache_rename(struct nchandle *fnch, struct nchandle *tnch); void cache_unlink(struct nchandle *nch); diff --git a/sys/sys/nlookup.h b/sys/sys/nlookup.h index 673e69855d..b76b0d9177 100644 --- a/sys/sys/nlookup.h +++ b/sys/sys/nlookup.h @@ -123,7 +123,7 @@ struct nlookupdata { #define NLC_REFDVP 0x00040000 /* set ref'd/unlocked nl_dvp */ #define NLC_APPEND 0x00100000 /* open check: append */ -#define NLC_UNUSED00200000 0x00200000 +#define 
NLC_NCDIR 0x00200000 /* nd->nl_nch is ncdir */ #define NLC_READ 0x00400000 /* require read access */ #define NLC_WRITE 0x00800000 /* require write access */ diff --git a/usr.bin/vmstat/vmstat.c b/usr.bin/vmstat/vmstat.c index ef371cea9a..9e45703e98 100644 --- a/usr.bin/vmstat/vmstat.c +++ b/usr.bin/vmstat/vmstat.c @@ -786,14 +786,26 @@ dointr(void) #define MAX_KMSTATS 1024 +enum ksuse { KSINUSE, KSMEMUSE }; + static long -cpuagg(size_t *ary) +cpuagg(struct malloc_type *ks, enum ksuse use) { int i; long ttl; - for (i = ttl = 0; i < SMP_MAXCPU; ++i) - ttl += ary[i]; + ttl = 0; + + switch(use) { + case KSINUSE: + for (i = 0; i < SMP_MAXCPU; ++i) + ttl += ks->ks_use[i].inuse; + break; + case KSMEMUSE: + for (i = 0; i < SMP_MAXCPU; ++i) + ttl += ks->ks_use[i].memuse; + break; + } return(ttl); } @@ -832,7 +844,7 @@ domem(void) continue; printf("%19s%7ld%7ldK%7ldK%11zuK%10jd%5u%6u", ks->ks_shortdesc, - cpuagg(ks->ks_inuse), (cpuagg(ks->ks_memuse) + 1023) / 1024, + cpuagg(ks, KSINUSE), (cpuagg(ks, KSMEMUSE) + 1023) / 1024, (ks->ks_maxused + 1023) / 1024, (ks->ks_limit + 1023) / 1024, (intmax_t)ks->ks_calls, ks->ks_limblocks, ks->ks_mapblocks); @@ -851,7 +863,7 @@ domem(void) first = 0; } printf("\n"); - totuse += cpuagg(ks->ks_memuse); + totuse += cpuagg(ks, KSMEMUSE); totreq += ks->ks_calls; } printf("\nMemory Totals: In Use Free Requests\n"); -- 2.41.0
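
Addendum, for readers unfamiliar with the two techniques above: the following is a
minimal userland sketch, not DragonFly code. CACHE_LINE, NCPU_DEMO and the struct
names are invented for illustration (the kernel uses __cachealign and SMP_MAXCPU),
and the alignment attribute is the GCC/Clang extension. It shows why padding each
cpu's {memuse, inuse} pair to its own cache line removes false sharing between CPUs,
and what the S_WXOK_MASK ("at least 555") test in naccess() reduces to.

	#include <stdio.h>
	#include <stddef.h>
	#include <sys/stat.h>

	#define CACHE_LINE	64	/* assumed line size for this sketch */
	#define NCPU_DEMO	4	/* illustration only */

	/* Packed layout, analogous to the old ks_memuse[]/ks_inuse[] arrays. */
	struct counters_packed {
		long memuse[NCPU_DEMO];
		long inuse[NCPU_DEMO];
	};

	/* Padded layout, analogous to the new struct malloc_use __cachealign. */
	struct use_padded {
		long memuse;
		long inuse;
	} __attribute__((__aligned__(CACHE_LINE)));

	struct counters_padded {
		struct use_padded use[NCPU_DEMO];
	};

	/*
	 * Sketch of the NCF_WXOK test: a directory is world-searchable when
	 * read+execute are granted to user, group and other (mode >= 0555).
	 */
	#define S_WXOK_MASK (S_IRUSR|S_IXUSR|S_IRGRP|S_IXGRP|S_IROTH|S_IXOTH)

	static int
	dir_is_world_searchable(mode_t mode)
	{
		return ((mode & S_WXOK_MASK) == S_WXOK_MASK);
	}

	int
	main(void)
	{
		/*
		 * Packed: cpu0 and cpu1 counters sit 8 bytes apart, on the
		 * same cache line, so every update bounces that line between
		 * CPUs.  Padded: they sit CACHE_LINE bytes apart, so updates
		 * stay local to the owning cpu.
		 */
		printf("packed  cpu0->cpu1 distance: %zu bytes\n",
		    offsetof(struct counters_packed, memuse[1]) -
		    offsetof(struct counters_packed, memuse[0]));
		printf("padded  cpu0->cpu1 distance: %zu bytes\n",
		    offsetof(struct counters_padded, use[1]) -
		    offsetof(struct counters_padded, use[0]));

		printf("mode 0755 world-searchable: %d\n",
		    dir_is_world_searchable(0755));
		printf("mode 0750 world-searchable: %d\n",
		    dir_is_world_searchable(0750));
		return (0);
	}

With 8-byte longs the packed layout puts eight cpus' memuse counters on one 64-byte
line, so concurrent kmalloc()/kfree() on different sockets keep transferring that
line; with one line per cpu the counters are only read remotely when the limit check
or vmstat -m sums them, which is the bottleneck the patch removes.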