From df5c0f32875b9ea77d002c2f672af88fec0b7ff7 Mon Sep 17 00:00:00 2001
From: Matthew Dillon <dillon@apollo.backplane.com>
Date: Thu, 13 Feb 2020 21:39:17 -0800
Subject: [PATCH] kernel - Reduce excessive rdrand harvesting

* Our rdrand driver harvests 512 bytes (64 64-bit words) on each cpu
  thread at a rate of 10Hz.  Ryzen CPUs appear to burn about 0.73uS per
  64-bit word, creating an overhead of about 460uS/sec (640 words/sec)
  on EACH cpu thread in the system.

  When added to the even higher overhead of the add_buffer_randomness()
  call, the result was a roughly 3% loss of performance across the board.

* Reduce the harvest size to 16 bytes, which honestly is still plenty
  of entropy to inject.

* Change the named branch targets ('loop', 'out') to numeric local
  labels in the rdrand and padlock code to avoid generating symbols
  that can cause weird output in our PC sampler (I was getting 'loop+N'
  and 'out+N' while testing the above).
---
 sys/dev/crypto/padlock/rng_harvest_x86_64.S   |  8 ++++----
 sys/dev/crypto/rdrand/rdrand.c                | 14 +++++++++++++-
 sys/dev/crypto/rdrand/rdrand_harvest_x86_64.S |  8 ++++----
 3 files changed, 21 insertions(+), 9 deletions(-)

diff --git a/sys/dev/crypto/padlock/rng_harvest_x86_64.S b/sys/dev/crypto/padlock/rng_harvest_x86_64.S
index d355d151ec..3de64e3a86 100644
--- a/sys/dev/crypto/padlock/rng_harvest_x86_64.S
+++ b/sys/dev/crypto/padlock/rng_harvest_x86_64.S
@@ -37,7 +37,7 @@ ENTRY(padlock_rng)
 
 	movq	$3,	%rdx
 	xorq	%r11, 	%r11
-loop:
+1:
 	/*
 	 * edx: (input)		quality factor of rng entropy
 	 * rdi: (input)		buffer for random data
@@ -54,7 +54,7 @@ loop:
 	 * available, so we finish up.
 	 */
 	andq	$0x1f, 	%rax
-	jz	out
+	jz	2f
 
 	/*
 	 * Increment the count of stored random bytes. The buffer pointer
@@ -67,8 +67,8 @@ loop:
 	 * loop again.
 	 */
 	cmpq	%rcx, 	%r11
-	jl	loop
-out:
+	jl	1b
+2:
 	/* return the number of stored random bytes. */
 	movq	%r11, 	%rax
 	ret
diff --git a/sys/dev/crypto/rdrand/rdrand.c b/sys/dev/crypto/rdrand/rdrand.c
index a04697b30b..ccfb92e0cc 100644
--- a/sys/dev/crypto/rdrand/rdrand.c
+++ b/sys/dev/crypto/rdrand/rdrand.c
@@ -39,8 +39,20 @@
 
 #include <machine/specialreg.h>
 
+/*
+ * WARNING!
+ *
+ * The RDRAND instruction is a very slow instruction, burning approximately
+ * 0.79uS per 64-bit word on a modern Ryzen cpu.  Intel cpus run this
+ * instruction far more quickly.  The quality of the results is unknown
+ * either way.  The add_buffer_randomness() call is also not cheap.
+ *
+ * Our code harvests at a 10Hz rate on every single core, and also chains
+ * some entropy from core to core, so honestly it doesn't take much to really
+ * mix things up.  Keep RDRAND_SIZE small (16 or 32 bytes should be good).
+ */
 #define	RDRAND_ALIGN(p)	(void *)(roundup2((uintptr_t)(p), 16))
-#define RDRAND_SIZE	512
+#define RDRAND_SIZE	16
 
 static int rdrand_debug;
 SYSCTL_INT(_debug, OID_AUTO, rdrand, CTLFLAG_RW, &rdrand_debug, 0,
diff --git a/sys/dev/crypto/rdrand/rdrand_harvest_x86_64.S b/sys/dev/crypto/rdrand/rdrand_harvest_x86_64.S
index a1e67d2560..9ac14c6a6e 100644
--- a/sys/dev/crypto/rdrand/rdrand_harvest_x86_64.S
+++ b/sys/dev/crypto/rdrand/rdrand_harvest_x86_64.S
@@ -37,7 +37,7 @@ ENTRY(rdrand_rng)
 	shrq	$3,	%rcx	/* Divide by 8 to get 64-bit word count */
 
 	xorq	%r11, 	%r11
-loop:
+1:
 	/*
 	 * rdx: (output)	entropy
 	 */
@@ -49,7 +49,7 @@ loop:
 	 *
 	 * CF = 0: Random value not available at time of execution.
 	 */
-	jae	out
+	jae	2f
 
 	/*
 	 * The operation was successful, so store the random data
@@ -69,8 +69,8 @@ loop:
 	 * loop again.
 	 */
 	cmpq	%rcx, 	%r11
-	jl	loop
-out:
+	jl	1b
+2:
 	/* return the number of stored random bytes (random words * 8) */
 	shlq	$3,	%r11
 	movq	%r11, 	%rax
-- 
2.41.0