From 875258e21a443510345821f514a5348b1cdb326c Mon Sep 17 00:00:00 2001 From: Mes Date: Wed, 15 Jan 2025 07:51:50 +0800 Subject: [PATCH] Eliminate loop overhead --- utils.c | 64 +++++++++++++++++++++++++++++++++++---------------------- utils.h | 2 +- 2 files changed, 40 insertions(+), 26 deletions(-) diff --git a/utils.c b/utils.c index 79e0439..60c074c 100644 --- a/utils.c +++ b/utils.c @@ -19,14 +19,6 @@ #endif #endif -#ifndef SEMU_SMP -#define SEMU_SMP 1 -#endif - -#ifndef SEMU_BOOT_TARGET_TIME -#define SEMU_BOOT_TARGET_TIME 10 -#endif - bool boot_complete = false; static double scale_factor; @@ -52,7 +44,7 @@ static inline uint64_t mult_frac(uint64_t x, double n, uint64_t d) */ static inline uint64_t get_ticks(struct timespec *ts, double freq) { - return ts->tv_sec * freq + mult_frac(ts->tv_nsec, freq, 1000000000ULL); + return ts->tv_sec * freq + mult_frac(ts->tv_nsec, freq, 1e9); } /* On POSIX => use clock_gettime(). @@ -67,7 +59,7 @@ static inline uint64_t host_time_ns() #if defined(HAVE_POSIX_TIMER) struct timespec ts; clock_gettime(CLOCKID, &ts); - return (uint64_t) ts.tv_sec * 1000000000ULL + (uint64_t) ts.tv_nsec; + return (uint64_t) ts.tv_sec * 1e9 + (uint64_t) ts.tv_nsec; #elif defined(HAVE_MACH_TIMER) static mach_timebase_info_data_t ts = {0}; @@ -75,40 +67,63 @@ static inline uint64_t host_time_ns() (void) mach_timebase_info(&ts); uint64_t now = mach_absolute_time(); - // convert to nanoseconds: (now * t.numer / t.denom) + /* convert to nanoseconds: (now * t.numer / t.denom) */ return mult_frac(now, (double) ts.numer, (uint64_t) ts.denom); #else /* Minimal fallback: time(0) in seconds => convert to ns. */ time_t now_sec = time(0); - return (uint64_t) now_sec * 1000000000ULL; + return (uint64_t) now_sec * 1e9; #endif } /* Measure the overhead of a high-resolution timer call, typically * 'clock_gettime()' on POSIX or 'mach_absolute_time()' on macOS. * - * 1) Times how long it takes to call 'host_time_ns()' repeatedly (target_loop). 
+ * 1) Times how long it takes to call 'host_time_ns()' repeatedly (iterations). * 2) Derives an average overhead per call => ns_per_call. * 3) Because semu_timer_clocksource is ~10% of boot overhead, and called ~2e8 * times * SMP, we get predict_sec = ns_per_call * SMP * 2. Then set * 'scale_factor' so the entire boot completes in SEMU_BOOT_TARGET_TIME * seconds. */ -static void measure_bogomips_ns(uint64_t target_loop) +static void measure_bogomips_ns(uint64_t iterations) { - /* Mark start time in ns */ - uint64_t start_ns = host_time_ns(); + /* Call the host HRT in two timed loops of 'iterations' iterations each. + * + * + * Assuming the cost of loop overhead is 'e' and the cost of 'host_time_ns' + * is 't', we perform a two-stage measurement to eliminate the loop + * overhead. In the first loop, 'host_time_ns' is called only once per + * iteration, while in the second loop, it is called twice per iteration. + * + * In this way, each iteration of the first loop costs 'e + t', and each + * iteration of the second loop costs 'e + 2t'. Subtracting the two elapsed + * times leaves 'iterations * t', with the loop overhead cancelled out. + * + * Reference: + * https://ates.dev/posts/2025-01-12-accurate-benchmarking/ + */ + const uint64_t start_ns_1 = host_time_ns(); + for (uint64_t loops = 0; loops < iterations; loops++) + (void) host_time_ns(); - /* Perform 'target_loop' times calling the host HRT. 
*/ - for (uint64_t loops = 0; loops < target_loop; loops++) + const uint64_t end_ns_1 = host_time_ns(); + const uint64_t elapsed_ns_1 = end_ns_1 - start_ns_1; + + /* Second measurement */ + const uint64_t start_ns_2 = host_time_ns(); + for (uint64_t loops = 0; loops < iterations; loops++) { + (void) host_time_ns(); (void) host_time_ns(); + } - /* Mark end time in ns */ - uint64_t end_ns = host_time_ns(); + const uint64_t end_ns_2 = host_time_ns(); + const uint64_t elapsed_ns_2 = end_ns_2 - start_ns_2; /* Calculate average overhead per call */ - double ns_per_call = (double) (end_ns - start_ns) / (double) target_loop; + const double ns_per_call = + (double) (elapsed_ns_2 - elapsed_ns_1) / (double) iterations; /* 'semu_timer_clocksource' is called ~2e8 times per SMP. Each call's * overhead ~ ns_per_call. The total overhead is ~ ns_per_call * SMP * 2e8. @@ -116,7 +131,7 @@ static void measure_bogomips_ns(uint64_t target_loop) * predict_sec = ns_per_call * SMP * 2 * Then scale_factor = (desired_time) / (predict_sec). */ - double predict_sec = ns_per_call * SEMU_SMP * 2.0; + const double predict_sec = ns_per_call * SEMU_SMP * 2.0; scale_factor = SEMU_BOOT_TARGET_TIME / predict_sec; } @@ -141,12 +156,11 @@ static uint64_t semu_timer_clocksource(semu_timer_t *timer) uint64_t now_ns = host_time_ns(); /* real_ticks => (now_ns * freq) / 1e9 */ - uint64_t real_ticks = - mult_frac(now_ns, (double) timer->freq, 1000000000ULL); + uint64_t real_ticks = mult_frac(now_ns, (double) timer->freq, 1e9); /* scaled_ticks => (now_ns * (freq*scale_factor)) / 1e9 */ uint64_t scaled_ticks = - mult_frac(now_ns, (double) (timer->freq * scale_factor), 1000000000ULL); + mult_frac(now_ns, (double) (timer->freq * scale_factor), 1e9); if (!boot_complete) return scaled_ticks; /* Return scaled ticks in the boot phase. 
*/ diff --git a/utils.h b/utils.h index f893b7e..1b05082 100644 --- a/utils.h +++ b/utils.h @@ -9,7 +9,7 @@ typedef struct { uint64_t freq; } semu_timer_t; -extern bool boot_complete; /* complete boot process and get in initrd */ +extern bool boot_complete; /* True once boot reaches the first user process. */ void semu_timer_init(semu_timer_t *timer, uint64_t freq); uint64_t semu_timer_get(semu_timer_t *timer);