From 43e72e79e549059473a43a7f99e1b469564c28d0 Mon Sep 17 00:00:00 2001
From: Matthew Dillon
Date: Wed, 26 Oct 2011 11:18:54 -0700
Subject: [PATCH] kernel - Optimize spinlocks for 48-core contention

* Change the spinlock algorithm to do a read-test before
  atomic_swap_int().  This has no effect on single-chip cpus (tested on a
  Phenom II quad-core), but has a HUGE HUGE HUGE effect on
  multi-chip/many-core systems.  On monster (48-core opteron, 4 x 12-core
  chips) concurrent kernel compile time is reduced from 170 seconds to 75
  seconds with this one change.  That's well over a 100% improvement.

  The reason the change is important is that it unloads the hardware cache
  coherency bus and its communication traffic by creating a closed loop
  with the pre-read, which essentially waits passively for the cache
  update instead of actively issuing a locked bus-cycle memory op.  This
  prevents total armageddon on the memory busses when a substantial number
  of cores are doing real work.

* Increase the number of pool spinlocks from 1024 to 8192.  We need them
  now that vm_pages use pool spinlocks.
---
 sys/kern/kern_spinlock.c | 97 ++++++++++++++++++++++++++++------------
 1 file changed, 68 insertions(+), 29 deletions(-)

diff --git a/sys/kern/kern_spinlock.c b/sys/kern/kern_spinlock.c
index 5e7bc8176c..a736eedc18 100644
--- a/sys/kern/kern_spinlock.c
+++ b/sys/kern/kern_spinlock.c
@@ -72,8 +72,10 @@ struct indefinite_info {
 #define SPIN_ARG_SIZE	(sizeof(void *) + sizeof(int))
 
 KTR_INFO_MASTER(spin);
+#if 0
 KTR_INFO(KTR_SPIN_CONTENTION, spin, beg, 0, SPIN_STRING, SPIN_ARG_SIZE);
 KTR_INFO(KTR_SPIN_CONTENTION, spin, end, 1, SPIN_STRING, SPIN_ARG_SIZE);
+#endif
 
 #define logspin(name, spin, type)	\
 	KTR_LOG(spin_ ## name, spin, type)
@@ -92,13 +94,18 @@ SYSCTL_QUAD(_debug, OID_AUTO, spinlocks_contested2, CTLFLAG_RD,
     &spinlocks_contested2, 0, "Serious spinlock contention count");
 
-static int spinlocks_hardloops = 40;
-SYSCTL_INT(_debug, OID_AUTO, spinlocks_hardloops, CTLFLAG_RW,
-    &spinlocks_hardloops, 0,
-    "Hard loops waiting for spinlock");
+/*
+ * We need a fairly large pool to avoid contention on large SMP systems,
+ * particularly multi-chip systems.
+ */
+/*#define SPINLOCK_NUM_POOL	8101*/
+#define SPINLOCK_NUM_POOL	8192
+#define SPINLOCK_NUM_POOL_MASK	(SPINLOCK_NUM_POOL - 1)
 
-#define SPINLOCK_NUM_POOL	(1024)
-static struct spinlock pool_spinlocks[SPINLOCK_NUM_POOL];
+static __cachealign struct {
+	struct spinlock	spin;
+	char filler[32 - sizeof(struct spinlock)];
+} pool_spinlocks[SPINLOCK_NUM_POOL];
 
 static int spin_indefinite_check(struct spinlock *spin,
 				 struct indefinite_info *info);
@@ -125,42 +132,74 @@ spin_trylock_contested(struct spinlock *spin)
  *
  * atomic_swap_int() is the absolute fastest spinlock instruction, at
  * least on multi-socket systems.  All instructions seem to be about
- * the same on single-socket multi-core systems.
+ * the same on single-socket multi-core systems.  However, atomic_swap_int()
+ * does not result in an even distribution of successful acquisitions.
+ *
+ * Another problem we have is that (at least on the 48-core opteron we test
+ * with) having all 48 cores contesting the same spin lock reduces
+ * performance to around 600,000 ops/sec, versus millions when fewer cores
+ * are going after the same lock.
+ *
+ * Backoff algorithms can create even worse starvation problems, and don't
+ * really improve performance when a lot of cores are contending.
+ *
+ * Our solution is to allow the data cache to lazy-update by reading it
+ * non-atomically and only attempting to acquire the lock if the lazy read
+ * looks good.  This effectively limits cache bus bandwidth.  A cpu_pause()
+ * (for intel/amd anyhow) is not strictly needed as cache bus resource use
+ * is governed by the lazy update.
+ *
+ * WARNING!!!!  Performance matters here, by a huge margin.  There are still
+ *		a few bottlenecks in the kernel (e.g. the PQ_INACTIVE
+ *		vm_page_queue) where things like parallel compiles hit up
+ *		against full all-cores contention right here.
+ *
+ *		48-core test with pre-read / -j 48 no-modules kernel compile
+ *		came in at 75 seconds.  Without pre-read it came in at 170 seconds.
+ *
+ *		4-core test with pre-read / -j 48 no-modules kernel compile
+ *		came in at 83 seconds.  Without pre-read it came in at 83 seconds
+ *		as well (no difference).
  */
 void
 spin_lock_contested(struct spinlock *spin)
 {
+	struct indefinite_info info = { 0, 0 };
 	int i;
 
 	i = 0;
-	while (atomic_swap_int(&spin->counta, 1)) {
-		cpu_pause();
-		if (i == spinlocks_hardloops) {
-			struct indefinite_info info = { 0, 0 };
-
-			logspin(beg, spin, 'w');
-			while (atomic_swap_int(&spin->counta, 1)) {
-				cpu_pause();
-				++spin->countb;
-				if ((++i & 0x7F) == 0x7F) {
-					if (spin_indefinite_check(spin, &info))
-						break;
-				}
-			}
-			logspin(end, spin, 'w');
-			return;
+	++spin->countb;
+
+	/*logspin(beg, spin, 'w');*/
+	for (;;) {
+		/*
+		 * NOTE: Reading spin->counta prior to the swap is extremely
+		 *	 important on multi-chip/many-core boxes.  On 48-core
+		 *	 this one change improves fully concurrent all-cores
+		 *	 compiles by 100% or better.
+		 *
+		 *	 I can't emphasize enough how important the pre-read is in
+		 *	 preventing hw cache bus armageddon on multi-chip systems.
+		 *	 And on single-chip/multi-core systems it just doesn't hurt.
+		 */
+		if (spin->counta == 0 && atomic_swap_int(&spin->counta, 1) == 0)
+			break;
+		if ((++i & 0x7F) == 0x7F) {
+			++spin->countb;
+			if (spin_indefinite_check(spin, &info))
+				break;
 		}
-		++spin->countb;
-		++i;
 	}
+	/*logspin(end, spin, 'w');*/
 }
 
 static __inline int
 _spin_pool_hash(void *ptr)
 {
 	int i;
 
-	i = ((int) (uintptr_t) ptr >> 2) ^ ((int) (uintptr_t) ptr >> 12);
-	i &= (SPINLOCK_NUM_POOL - 1);
+
+	i = ((int)(uintptr_t) ptr >> 5) ^ ((int)(uintptr_t)ptr >> 12);
+	i &= SPINLOCK_NUM_POOL_MASK;
 	return (i);
 }
 
@@ -169,7 +208,7 @@ _spin_pool_lock(void *chan)
 {
 	struct spinlock *sp;
 
-	sp = &pool_spinlocks[_spin_pool_hash(chan)];
+	sp = &pool_spinlocks[_spin_pool_hash(chan)].spin;
 	spin_lock(sp);
 }
 
@@ -178,7 +217,7 @@ _spin_pool_unlock(void *chan)
 {
 	struct spinlock *sp;
 
-	sp = &pool_spinlocks[_spin_pool_hash(chan)];
+	sp = &pool_spinlocks[_spin_pool_hash(chan)].spin;
 	spin_unlock(sp);
 }
-- 
2.41.0
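
For readers who want to experiment with the acquisition loop outside the
kernel, below is a minimal, self-contained userland sketch of the pre-read
("test-and-test-and-set") pattern the patch adopts.  It is not DragonFly's
spinlock code: the toy_spinlock type, the function names, and the use of
C11 <stdatomic.h> are assumptions standing in for struct spinlock,
atomic_swap_int(), and the kernel's countb/indefinite-wait bookkeeping,
which are omitted here.

	#include <stdatomic.h>
	#include <stdio.h>

	struct toy_spinlock {
		atomic_int counta;	/* 0 = free, 1 = held */
	};

	static struct toy_spinlock toy_lock;	/* statically zeroed = free */

	static void
	toy_spin_lock(struct toy_spinlock *spin)
	{
		for (;;) {
			/*
			 * Pre-read: a plain (relaxed) load spins on the local
			 * cached copy of the line, generating no coherency
			 * traffic while the lock is held by someone else.
			 * Only when the lock looks free do we issue the
			 * expensive locked exchange.
			 */
			if (atomic_load_explicit(&spin->counta,
						 memory_order_relaxed) == 0 &&
			    atomic_exchange_explicit(&spin->counta, 1,
						     memory_order_acquire) == 0) {
				break;		/* we won the lock */
			}
		}
	}

	static void
	toy_spin_unlock(struct toy_spinlock *spin)
	{
		atomic_store_explicit(&spin->counta, 0, memory_order_release);
	}

	int
	main(void)
	{
		toy_spin_lock(&toy_lock);
		printf("lock acquired\n");
		toy_spin_unlock(&toy_lock);
		return (0);
	}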
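
The pool-spinlock side of the change can be pictured the same way: a
power-of-two pool so the hash can be reduced with a mask instead of a
modulo, plus per-entry padding/alignment so neighbouring pool locks do not
share a cache line.  The sketch below uses hypothetical userland names
(toy_pool_entry, toy_pool_hash) and an assumed 64-byte cache line; only the
8192/mask arithmetic and the >> 5 / >> 12 hash come from the patch itself.

	#include <stdint.h>
	#include <stdio.h>

	#define TOY_NUM_POOL		8192	/* power of 2, as in the patch */
	#define TOY_NUM_POOL_MASK	(TOY_NUM_POOL - 1)

	/*
	 * One pool entry per lock, aligned so heavily contended entries do
	 * not ping-pong the same cache line between cpus.  The 64-byte
	 * figure is an assumption about the cache-line size.
	 */
	struct toy_pool_entry {
		int	spin;		/* stand-in for struct spinlock */
	} __attribute__((aligned(64)));

	static struct toy_pool_entry toy_pool[TOY_NUM_POOL];

	/*
	 * Hash an arbitrary channel pointer into the pool, mirroring the
	 * patch's _spin_pool_hash(): xor two shifted copies of the address
	 * and mask the result into the power-of-two table.
	 */
	static inline int
	toy_pool_hash(void *ptr)
	{
		int i;

		i = ((int)(uintptr_t)ptr >> 5) ^ ((int)(uintptr_t)ptr >> 12);
		return (i & TOY_NUM_POOL_MASK);
	}

	static inline int *
	toy_pool_lock_for(void *chan)
	{
		return (&toy_pool[toy_pool_hash(chan)].spin);
	}

	int
	main(void)
	{
		int x, y;

		/* Different channel addresses usually map to different slots. */
		printf("slot(&x) = %d, slot(&y) = %d\n",
		       toy_pool_hash(&x), toy_pool_hash(&y));
		return (0);
	}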