From 1eb5a42bfcae00196f0a11051408d9e9dca9968d Mon Sep 17 00:00:00 2001
From: Matthew Dillon
Date: Fri, 17 Aug 2018 16:52:45 -0700
Subject: [PATCH] kernel - Refactor the TSC MP synchronization test

* Refactor the TSC MP synchronization test.  Do not use cpusync.
  Using cpusync results in O(N x N) worth of overhead instead of
  O(N) worth of overhead.  Instead, have the per-cpu threads run
  the test simultaneously using each other's data.

* We synchronize to the last TSC element that was saved on each cpu.
  This probably needs a bit of work to ensure determinism, but at the
  moment it's good in that it synchronizes all cores off of a single
  cache mastership change, instead of having them all compete for
  cache mastership.

* Probably needs some fine tuning; at the moment I allow a slop of
  10uS, which is almost certainly too much.  Note, however, that SMP
  interactions can create ~1uS latencies on particular memory accesses.

* Solves serious issues with the old test on 64 cpu threads.  These
  issues may also have been related to the ipiq fifo size being too
  small.
---
 sys/platform/pc64/isa/clock.c | 150 +++++++++++++++++++++++-----------
 1 file changed, 102 insertions(+), 48 deletions(-)
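Note (commentary, not part of the diff): the measurement pattern described
above can be sketched in user space roughly as follows, assuming x86-64 and
POSIX threads.  The kernel code below uses LWKT threads pinned one per cpu
and rdtsc_ordered(); the names here (ap_thread, saved[], NTHREADS,
TEST_COUNT) are made up for the illustration, and plain pthreads are not
cpu-pinned, so this only shows the data flow, not a trustworthy sync test:

#include <pthread.h>
#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>
#include <x86intrin.h>			/* __rdtsc() */

#define NTHREADS	4		/* stand-in for ncpus */
#define TEST_COUNT	50000		/* mirrors TSC_TEST_COUNT */

/* One cache line per slot, mirroring the padding in tsc_mpsync_info */
static struct { _Alignas(64) volatile uint64_t v; } saved[NTHREADS];
static atomic_int ready_cnt, command, done_cnt;

static void *
ap_thread(void *arg)
{
	int cpu = (int)(intptr_t)arg;
	int i;

	/* Tell the main loop we are ready, then wait for initiation */
	atomic_fetch_add(&ready_cnt, 1);
	while (atomic_load(&command) == 0)
		;	/* the kernel version yields via lwkt_force_switch() */

	/* Hammer our own slot until any thread finishes its loop */
	for (i = 0; i < TEST_COUNT && atomic_load(&done_cnt) == 0; ++i)
		saved[cpu].v = __rdtsc();
	atomic_fetch_add(&done_cnt, 1);
	return NULL;
}

int
main(void)
{
	pthread_t td[NTHREADS];
	uint64_t last;
	int64_t xdelta;
	int64_t delta = 0;
	int cpu;

	for (cpu = 0; cpu < NTHREADS; ++cpu)
		pthread_create(&td[cpu], NULL, ap_thread,
			       (void *)(intptr_t)cpu);
	while (atomic_load(&ready_cnt) != NTHREADS)
		;
	atomic_store(&command, 1);
	for (cpu = 0; cpu < NTHREADS; ++cpu)
		pthread_join(td[cpu], NULL);

	/* Sum |adjacent deltas| of the last samples, as the patch does */
	last = saved[0].v;
	for (cpu = 0; cpu < NTHREADS; ++cpu) {
		xdelta = (int64_t)(saved[cpu].v - last);
		last = saved[cpu].v;
		delta += (xdelta < 0) ? -xdelta : xdelta;
	}
	printf("mean |delta| = %jd cycles\n", (intmax_t)(delta / NTHREADS));
	return 0;
}

(Build with cc -O2 -pthread on an x86-64 host.)  The 64-byte slot alignment
mirrors the cache-line padding of tsc_saved[] so that each thread writes its
own line, and the final reads cost one cache mastership change per line.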
diff --git a/sys/platform/pc64/isa/clock.c b/sys/platform/pc64/isa/clock.c
index 698c1d8172..741685033c 100644
--- a/sys/platform/pc64/isa/clock.c
+++ b/sys/platform/pc64/isa/clock.c
@@ -208,6 +208,20 @@ static struct cputimer_intr i8254_cputimer_intr = {
 	.priv = NULL
 };
 
+/*
+ * Use this to lwkt_switch() when the scheduler clock is not
+ * yet running, otherwise lwkt_switch() won't do anything.
+ * XXX needs cleaning up in lwkt_thread.c
+ */
+static void
+lwkt_force_switch(void)
+{
+	crit_enter();
+	lwkt_schedulerclock(curthread);
+	crit_exit();
+	lwkt_switch();
+}
+
 /*
  * timer0 clock interrupt.  Timer0 is in one-shot mode and has stopped
  * counting as of this interrupt.  We use timer1 in free-running mode (not
@@ -1393,29 +1407,20 @@ hw_i8254_timestamp(SYSCTL_HANDLER_ARGS)
 	return(SYSCTL_OUT(req, buf, strlen(buf) + 1));
 }
 
-struct tsc_mpsync_arg {
-	volatile uint64_t	tsc_target;
-	volatile int		tsc_mpsync;
-};
-
-struct tsc_mpsync_thr {
+struct tsc_mpsync_info {
+	volatile int		tsc_ready_cnt;
 	volatile int		tsc_done_cnt;
-	volatile int		tsc_mpsync_cnt;
-};
-
-static void
-tsc_mpsync_test_remote(void *xarg)
-{
-	struct tsc_mpsync_arg *arg = xarg;
-	uint64_t tsc;
-
-	tsc = rdtsc_ordered();
-	if (tsc < arg->tsc_target)
-		arg->tsc_mpsync = 0;
-}
+	volatile int		tsc_command;
+	volatile int		unused01[5];
+	struct {
+		uint64_t	v;
+		uint64_t	unused02;
+	} tsc_saved[MAXCPU];
+} __cachealign;
 
+#if 0
 static void
-tsc_mpsync_test_loop(struct tsc_mpsync_arg *arg)
+tsc_mpsync_test_loop(struct tsc_mpsync_thr *info)
 {
 	struct globaldata *gd = mycpu;
 	tsc_uclock_t test_end, test_begin;
@@ -1476,18 +1481,35 @@ tsc_mpsync_test_loop(struct tsc_mpsync_arg *arg)
 	}
 }
+#endif
+
+#define TSC_TEST_COUNT		50000
+
 static void
-tsc_mpsync_ap_thread(void *xthr)
+tsc_mpsync_ap_thread(void *xinfo)
 {
-	struct tsc_mpsync_thr *thr = xthr;
-	struct tsc_mpsync_arg arg;
+	struct tsc_mpsync_info *info = xinfo;
+	int cpu = mycpuid;
+	int i;
 
-	tsc_mpsync_test_loop(&arg);
-	if (arg.tsc_mpsync) {
-		atomic_add_int(&thr->tsc_mpsync_cnt, 1);
-		cpu_sfence();
+	/*
+	 * Tell main loop that we are ready and wait for initiation
+	 */
+	atomic_add_int(&info->tsc_ready_cnt, 1);
+	while (info->tsc_command == 0) {
+		lwkt_force_switch();
 	}
-	atomic_add_int(&thr->tsc_done_cnt, 1);
+
+	/*
+	 * Run the test for TSC_TEST_COUNT loops or until tsc_done_cnt
+	 * becomes non-zero (another cpu has finished its test), then
+	 * increment done.
+	 */
+	crit_enter();
+	for (i = 0; i < TSC_TEST_COUNT && info->tsc_done_cnt == 0; ++i) {
+		info->tsc_saved[cpu].v = rdtsc_ordered();
+	}
+	crit_exit();
+	atomic_add_int(&info->tsc_done_cnt, 1);
 
 	lwkt_exit();
 }
@@ -1495,7 +1517,8 @@ tsc_mpsync_ap_thread(void *xthr)
 static void
 tsc_mpsync_test(void)
 {
-	struct tsc_mpsync_arg arg;
+	int cpu;
+	int try;
 
 	if (!tsc_invariant) {
 		/* Not even invariant TSC */
@@ -1553,37 +1576,68 @@ tsc_mpsync_test(void)
 	}
 
 	/*
-	 * Test even if forced above.  If forced, we will use the TSC
-	 * even if the test fails.
+	 * Test even if forced to 1 above.  If forced, we will use the TSC
+	 * even if the test fails.  (set forced to -1 to disable entirely).
 	 */
 	kprintf("TSC testing MP synchronization ...\n");
 
-	tsc_mpsync_test_loop(&arg);
-	if (arg.tsc_mpsync) {
-		struct tsc_mpsync_thr thr;
-		int cpu;
+	/*
+	 * Test TSC MP synchronization on APs.  Try up to 4 times.
+	 */
+	for (try = 0; try < 4; ++try) {
+		struct tsc_mpsync_info info;
+		uint64_t last;
+		int64_t xdelta;
+		int64_t delta;
+
+		bzero(&info, sizeof(info));
+
+		for (cpu = 0; cpu < ncpus; ++cpu) {
+			thread_t td;
+			lwkt_create(tsc_mpsync_ap_thread, &info, &td,
+				    NULL, TDF_NOSTART, cpu,
+				    "tsc mpsync %d", cpu);
+			lwkt_setpri_initial(td, curthread->td_pri);
+			lwkt_schedule(td);
+		}
+		while (info.tsc_ready_cnt != ncpus)
+			lwkt_force_switch();
 
 		/*
-		 * Test TSC MP synchronization on APs.
+		 * All threads are ready, start the test and wait for
+		 * completion.
		 */
+		info.tsc_command = 1;
+		while (info.tsc_done_cnt != ncpus)
+			lwkt_force_switch();
 
-		thr.tsc_done_cnt = 1;
-		thr.tsc_mpsync_cnt = 1;
-
+		/*
+		 * Process results
+		 */
+		last = info.tsc_saved[0].v;
+		delta = 0;
 		for (cpu = 0; cpu < ncpus; ++cpu) {
-			if (cpu == mycpuid)
-				continue;
+			xdelta = (int64_t)(info.tsc_saved[cpu].v - last);
+			last = info.tsc_saved[cpu].v;
+			if (xdelta < 0)
+				xdelta = -xdelta;
+			delta += xdelta;
 
-			lwkt_create(tsc_mpsync_ap_thread, &thr, NULL,
-			    NULL, 0, cpu, "tsc mpsync %d", cpu);
 		}
-		while (thr.tsc_done_cnt != ncpus) {
-			cpu_pause();
-			cpu_lfence();
-		}
-		if (thr.tsc_mpsync_cnt == ncpus)
+
+		/*
+		 * Result from attempt.  If it's too wild just stop now.
+		 * Also break out if we succeed, no need to try further.
+		 */
+		kprintf("TSC MPSYNC TEST %jd %d -> %jd (10uS=%jd)\n",
+			delta, ncpus, delta / ncpus,
+			tsc_frequency / 100000);
+		if (delta / ncpus > tsc_frequency / 100)
+			break;
+		if (delta / ncpus < tsc_frequency / 100000) {
 			tsc_mpsync = 1;
+			break;
+		}
 	}
 
 	if (tsc_mpsync)
-- 
2.41.0
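Note (commentary, not part of the diff): the pass/abort thresholds in the
last hunk convert the slop into TSC cycles.  delta / ncpus must come in
under tsc_frequency / 100000 (10uS worth of cycles) to pass, and anything
over tsc_frequency / 100 (10mS worth) aborts the retry loop.  A quick
sketch of the arithmetic, using a hypothetical 2.4 GHz tsc_frequency:

#include <stdint.h>
#include <stdio.h>

int
main(void)
{
	uint64_t tsc_frequency = 2400000000ULL;	/* hypothetical 2.4 GHz */

	/* pass:  mean |delta| below 10uS worth of cycles -> 24,000 */
	printf("pass below  %ju cycles\n",
	       (uintmax_t)(tsc_frequency / 100000));
	/* abort: mean |delta| above 10mS worth of cycles -> 24,000,000 */
	printf("abort above %ju cycles\n",
	       (uintmax_t)(tsc_frequency / 100));
	return 0;
}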