From df5c0f32875b9ea77d002c2f672af88fec0b7ff7 Mon Sep 17 00:00:00 2001 From: Matthew Dillon Date: Thu, 13 Feb 2020 21:39:17 -0800 Subject: [PATCH] kernel - Reduce excessive rdrand harvesting * Our rdrand driver harvests 512 bytes on each cpu thread at a rate of 10hz. Ryzen CPUs appear to burn about 0.73uS per word, creating an overhead of about 460uS/sec on EACH cpu thread in the system. When added to the even higher overhead of the add_buffer_randomness() call, the result was a roughly 3% loss of performance across the board. * Reduce the harvest size to 16 bytes, which honestly is still plenty of entropy to inject. * Change some symbolic branch targets to local branch targets in the rdrand and padlock code to avoid generating symbols that can cause weird output in our PC sampler (I was getting 'loop+N' and 'out+N' while testing the above). --- sys/dev/crypto/padlock/rng_harvest_x86_64.S | 8 ++++---- sys/dev/crypto/rdrand/rdrand.c | 14 +++++++++++++- sys/dev/crypto/rdrand/rdrand_harvest_x86_64.S | 8 ++++---- 3 files changed, 21 insertions(+), 9 deletions(-) diff --git a/sys/dev/crypto/padlock/rng_harvest_x86_64.S b/sys/dev/crypto/padlock/rng_harvest_x86_64.S index d355d151ec..3de64e3a86 100644 --- a/sys/dev/crypto/padlock/rng_harvest_x86_64.S +++ b/sys/dev/crypto/padlock/rng_harvest_x86_64.S @@ -37,7 +37,7 @@ ENTRY(padlock_rng) movq $3, %rdx xorq %r11, %r11 -loop: +1: /* * edx: (input) quality factor of rng entropy * rdi: (input) buffer for random data @@ -54,7 +54,7 @@ loop: * available, so we finish up. */ andq $0x1f, %rax - jz out + jz 2f /* * Increment the count of stored random bytes. The buffer pointer @@ -67,8 +67,8 @@ loop: * loop again. */ cmpq %rcx, %r11 - jl loop -out: + jl 1b +2: /* return the number of stored random bytes. 
*/ movq %r11, %rax ret diff --git a/sys/dev/crypto/rdrand/rdrand.c b/sys/dev/crypto/rdrand/rdrand.c index a04697b30b..ccfb92e0cc 100644 --- a/sys/dev/crypto/rdrand/rdrand.c +++ b/sys/dev/crypto/rdrand/rdrand.c @@ -39,8 +39,20 @@ #include +/* + * WARNING! + * + * The RDRAND instruction is a very slow instruction, burning approximately + * 0.79uS per 64-bit word on a modern ryzen cpu. Intel CPUs run this + * instruction far more quickly. The quality of the results is unknown + * either way. The add_buffer_randomness() call is also not cheap. + * + * Our code harvests at a 10hz rate on every single core, and also chains + * some entropy from core to core so honestly it doesn't take much to really + * mix things up. Use a decent size (16 or 32 bytes should be good). + */ #define RDRAND_ALIGN(p) (void *)(roundup2((uintptr_t)(p), 16)) -#define RDRAND_SIZE 512 +#define RDRAND_SIZE 16 static int rdrand_debug; SYSCTL_INT(_debug, OID_AUTO, rdrand, CTLFLAG_RW, &rdrand_debug, 0, diff --git a/sys/dev/crypto/rdrand/rdrand_harvest_x86_64.S b/sys/dev/crypto/rdrand/rdrand_harvest_x86_64.S index a1e67d2560..9ac14c6a6e 100644 --- a/sys/dev/crypto/rdrand/rdrand_harvest_x86_64.S +++ b/sys/dev/crypto/rdrand/rdrand_harvest_x86_64.S @@ -37,7 +37,7 @@ ENTRY(rdrand_rng) shrq $3, %rcx /* Divide by 8 to get 64-bit word count */ xorq %r11, %r11 -loop: +1: /* * rdx: (output) entropy */ @@ -49,7 +49,7 @@ loop: * * CF = 0: Random value not available at time of execution. */ - jae out + jae 2f /* * The operation was successful, so store the random data @@ -69,8 +69,8 @@ loop: * loop again. */ cmpq %rcx, %r11 - jl loop -out: + jl 1b +2: /* return the number of stored random bytes (random words * 8) */ shlq $3, %r11 movq %r11, %rax -- 2.41.0