From 1eb5a42bfcae00196f0a11051408d9e9dca9968d Mon Sep 17 00:00:00 2001
From: Matthew Dillon
Date: Fri, 17 Aug 2018 16:52:45 -0700
Subject: [PATCH] kernel - Refactor the TSC MP synchronization test

* Refactor the TSC MP synchronization test.  Do not use cpusync.
  Using cpusync results in O(N x N) worth of overhead instead of
  O(N) worth of overhead.  Instead, have the per-cpu threads run
  the test simultaneously using each other's data.

* We synchronize to the last TSC element that was saved on each cpu.
  This probably needs a bit of work to ensure determinism, but at the
  moment it's good in that it synchronizes all cores off of a single
  cache mastership change, instead of having them all compete for
  cache mastership.

* Probably needs some fine tuning; at the moment I allow a slop of
  10uS, which is almost certainly too much.  Note, however, that SMP
  interactions can create ~1uS latencies on particular memory accesses.

* Solves serious issues with the old test on 64 cpu threads.  These
  issues may also have been related to the ipiq fifo size being too
  small.
---
 sys/platform/pc64/isa/clock.c | 150 +++++++++++++++++++++++-----------
 1 file changed, 102 insertions(+), 48 deletions(-)
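Note (commentary, not part of the diff): the measurement pattern described
above can be sketched in user space roughly as follows, assuming x86-64 and
POSIX threads.  The kernel code below uses LWKT threads pinned one per cpu
and rdtsc_ordered(); the names here (ap_thread, saved[], NTHREADS,
TEST_COUNT) are made up for the illustration, and plain pthreads are not
cpu-pinned, so this only shows the data flow, not a trustworthy sync test:

#include <pthread.h>
#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>
#include <x86intrin.h>			/* __rdtsc() */

#define NTHREADS	4		/* stand-in for ncpus */
#define TEST_COUNT	50000		/* mirrors TSC_TEST_COUNT */

/* One cache line per slot, mirroring the padding in tsc_mpsync_info */
static struct { _Alignas(64) volatile uint64_t v; } saved[NTHREADS];
static atomic_int ready_cnt, command, done_cnt;

static void *
ap_thread(void *arg)
{
	int cpu = (int)(intptr_t)arg;
	int i;

	/* Tell the main loop we are ready, then wait for initiation */
	atomic_fetch_add(&ready_cnt, 1);
	while (atomic_load(&command) == 0)
		;	/* the kernel version yields via lwkt_force_switch() */

	/* Hammer our own slot until any thread finishes its loop */
	for (i = 0; i < TEST_COUNT && atomic_load(&done_cnt) == 0; ++i)
		saved[cpu].v = __rdtsc();
	atomic_fetch_add(&done_cnt, 1);
	return NULL;
}

int
main(void)
{
	pthread_t td[NTHREADS];
	uint64_t last;
	int64_t xdelta;
	int64_t delta = 0;
	int cpu;

	for (cpu = 0; cpu < NTHREADS; ++cpu)
		pthread_create(&td[cpu], NULL, ap_thread,
			       (void *)(intptr_t)cpu);
	while (atomic_load(&ready_cnt) != NTHREADS)
		;
	atomic_store(&command, 1);
	for (cpu = 0; cpu < NTHREADS; ++cpu)
		pthread_join(td[cpu], NULL);

	/* Sum |adjacent deltas| of the last samples, as the patch does */
	last = saved[0].v;
	for (cpu = 0; cpu < NTHREADS; ++cpu) {
		xdelta = (int64_t)(saved[cpu].v - last);
		last = saved[cpu].v;
		delta += (xdelta < 0) ? -xdelta : xdelta;
	}
	printf("mean |delta| = %jd cycles\n", (intmax_t)(delta / NTHREADS));
	return 0;
}

(Build with cc -O2 -pthread on an x86-64 host.)  The 64-byte slot alignment
mirrors the cache-line padding of tsc_saved[] so that each thread writes its
own line, and the final reads cost one cache mastership change per line.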
diff --git a/sys/platform/pc64/isa/clock.c b/sys/platform/pc64/isa/clock.c
index 698c1d8172..741685033c 100644
--- a/sys/platform/pc64/isa/clock.c
+++ b/sys/platform/pc64/isa/clock.c
@@ -208,6 +208,20 @@ static struct cputimer_intr i8254_cputimer_intr = {
 	.priv = NULL
 };
 
+/*
+ * Use this to lwkt_switch() when the scheduler clock is not
+ * yet running, otherwise lwkt_switch() won't do anything.
+ * XXX needs cleaning up in lwkt_thread.c
+ */
+static void
+lwkt_force_switch(void)
+{
+	crit_enter();
+	lwkt_schedulerclock(curthread);
+	crit_exit();
+	lwkt_switch();
+}
+
 /*
  * timer0 clock interrupt.  Timer0 is in one-shot mode and has stopped
  * counting as of this interrupt.  We use timer1 in free-running mode (not
@@ -1393,29 +1407,20 @@ hw_i8254_timestamp(SYSCTL_HANDLER_ARGS)
 	return(SYSCTL_OUT(req, buf, strlen(buf) + 1));
 }
 
-struct tsc_mpsync_arg {
-	volatile uint64_t	tsc_target;
-	volatile int		tsc_mpsync;
-};
-
-struct tsc_mpsync_thr {
+struct tsc_mpsync_info {
+	volatile int		tsc_ready_cnt;
 	volatile int		tsc_done_cnt;
-	volatile int		tsc_mpsync_cnt;
-};
-
-static void
-tsc_mpsync_test_remote(void *xarg)
-{
-	struct tsc_mpsync_arg *arg = xarg;
-	uint64_t tsc;
-
-	tsc = rdtsc_ordered();
-	if (tsc < arg->tsc_target)
-		arg->tsc_mpsync = 0;
-}
+	volatile int		tsc_command;
+	volatile int		unused01[5];
+	struct {
+		uint64_t	v;
+		uint64_t	unused02;
+	} tsc_saved[MAXCPU];
+} __cachealign;
 
+#if 0
 static void
-tsc_mpsync_test_loop(struct tsc_mpsync_arg *arg)
+tsc_mpsync_test_loop(struct tsc_mpsync_thr *info)
 {
 	struct globaldata *gd = mycpu;
 	tsc_uclock_t test_end, test_begin;
@@ -1476,18 +1481,35 @@ tsc_mpsync_test_loop(struct tsc_mpsync_arg *arg)
 	}
 }
+#endif
+
+#define TSC_TEST_COUNT		50000
+
 static void
-tsc_mpsync_ap_thread(void *xthr)
+tsc_mpsync_ap_thread(void *xinfo)
 {
-	struct tsc_mpsync_thr *thr = xthr;
-	struct tsc_mpsync_arg arg;
+	struct tsc_mpsync_info *info = xinfo;
+	int cpu = mycpuid;
+	int i;
 
-	tsc_mpsync_test_loop(&arg);
-	if (arg.tsc_mpsync) {
-		atomic_add_int(&thr->tsc_mpsync_cnt, 1);
-		cpu_sfence();
+	/*
+	 * Tell main loop that we are ready and wait for initiation
+	 */
+	atomic_add_int(&info->tsc_ready_cnt, 1);
+	while (info->tsc_command == 0) {
+		lwkt_force_switch();
 	}
-	atomic_add_int(&thr->tsc_done_cnt, 1);
+
+	/*
+	 * Run the test for TSC_TEST_COUNT loops or until tsc_done_cnt
+	 * becomes non-zero (another cpu has finished its test), then
+	 * increment done.
+	 */
+	crit_enter();
+	for (i = 0; i < TSC_TEST_COUNT && info->tsc_done_cnt == 0; ++i) {
+		info->tsc_saved[cpu].v = rdtsc_ordered();
+	}
+	crit_exit();
+	atomic_add_int(&info->tsc_done_cnt, 1);
 
 	lwkt_exit();
 }
@@ -1495,7 +1517,8 @@ tsc_mpsync_ap_thread(void *xthr)
 static void
 tsc_mpsync_test(void)
 {
-	struct tsc_mpsync_arg arg;
+	int cpu;
+	int try;
 
 	if (!tsc_invariant) {
 		/* Not even invariant TSC */
@@ -1553,37 +1576,68 @@ tsc_mpsync_test(void)
 	}
 
 	/*
-	 * Test even if forced above.  If forced, we will use the TSC
-	 * even if the test fails.
+	 * Test even if forced to 1 above.  If forced, we will use the TSC
+	 * even if the test fails.  (set forced to -1 to disable entirely).
 	 */
 	kprintf("TSC testing MP synchronization ...\n");
 
-	tsc_mpsync_test_loop(&arg);
-	if (arg.tsc_mpsync) {
-		struct tsc_mpsync_thr thr;
-		int cpu;
+	/*
+	 * Test TSC MP synchronization on APs.  Try up to 4 times.
+	 */
+	for (try = 0; try < 4; ++try) {
+		struct tsc_mpsync_info info;
+		uint64_t last;
+		int64_t xdelta;
+		int64_t delta;
+
+		bzero(&info, sizeof(info));
+
+		for (cpu = 0; cpu < ncpus; ++cpu) {
+			thread_t td;
+			lwkt_create(tsc_mpsync_ap_thread, &info, &td,
+				    NULL, TDF_NOSTART, cpu,
+				    "tsc mpsync %d", cpu);
+			lwkt_setpri_initial(td, curthread->td_pri);
+			lwkt_schedule(td);
+		}
+		while (info.tsc_ready_cnt != ncpus)
+			lwkt_force_switch();
 
 		/*
-		 * Test TSC MP synchronization on APs.
+		 * All threads are ready, start the test and wait for
+		 * completion.
		 */
+		info.tsc_command = 1;
+		while (info.tsc_done_cnt != ncpus)
+			lwkt_force_switch();
 
-		thr.tsc_done_cnt = 1;
-		thr.tsc_mpsync_cnt = 1;
-
+		/*
+		 * Process results
+		 */
+		last = info.tsc_saved[0].v;
+		delta = 0;
 		for (cpu = 0; cpu < ncpus; ++cpu) {
-			if (cpu == mycpuid)
-				continue;
+			xdelta = (int64_t)(info.tsc_saved[cpu].v - last);
+			last = info.tsc_saved[cpu].v;
+			if (xdelta < 0)
+				xdelta = -xdelta;
+			delta += xdelta;
 
-			lwkt_create(tsc_mpsync_ap_thread, &thr, NULL,
-			    NULL, 0, cpu, "tsc mpsync %d", cpu);
 		}
-		while (thr.tsc_done_cnt != ncpus) {
-			cpu_pause();
-			cpu_lfence();
-		}
-		if (thr.tsc_mpsync_cnt == ncpus)
+
+		/*
+		 * Result from attempt.  If it's too wild just stop now.
+		 * Also break out if we succeed, no need to try further.
+		 */
+		kprintf("TSC MPSYNC TEST %jd %d -> %jd (10uS=%jd)\n",
+			delta, ncpus, delta / ncpus,
+			tsc_frequency / 100000);
+		if (delta / ncpus > tsc_frequency / 100)
+			break;
+		if (delta / ncpus < tsc_frequency / 100000) {
 			tsc_mpsync = 1;
+			break;
+		}
 	}
 
 	if (tsc_mpsync)
-- 
2.41.0
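Note (commentary, not part of the diff): the pass/abort thresholds in the
last hunk convert the slop into TSC cycles.  delta / ncpus must come in
under tsc_frequency / 100000 (10uS worth of cycles) to pass, and anything
over tsc_frequency / 100 (10mS worth) aborts the retry loop.  A quick
sketch of the arithmetic, using a hypothetical 2.4 GHz tsc_frequency:

#include <stdint.h>
#include <stdio.h>

int
main(void)
{
	uint64_t tsc_frequency = 2400000000ULL;	/* hypothetical 2.4 GHz */

	/* pass:  mean |delta| below 10uS worth of cycles -> 24,000 */
	printf("pass below  %ju cycles\n",
	       (uintmax_t)(tsc_frequency / 100000));
	/* abort: mean |delta| above 10mS worth of cycles -> 24,000,000 */
	printf("abort above %ju cycles\n",
	       (uintmax_t)(tsc_frequency / 100));
	return 0;
}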