Skip to content

Commit

Permalink
Eliminate loop overhead
Browse files Browse the repository at this point in the history
  • Loading branch information
Mes0903 committed Jan 14, 2025
1 parent 03b0560 commit 875258e
Show file tree
Hide file tree
Showing 2 changed files with 40 additions and 26 deletions.
64 changes: 39 additions & 25 deletions utils.c
Original file line number Diff line number Diff line change
Expand Up @@ -19,14 +19,6 @@
#endif
#endif

#ifndef SEMU_SMP
#define SEMU_SMP 1
#endif

#ifndef SEMU_BOOT_TARGET_TIME
#define SEMU_BOOT_TARGET_TIME 10
#endif

bool boot_complete = false;
static double scale_factor;

Expand All @@ -52,7 +44,7 @@ static inline uint64_t mult_frac(uint64_t x, double n, uint64_t d)
*/
static inline uint64_t get_ticks(struct timespec *ts, double freq)
{
    /* Convert a timespec into ticks at 'freq' ticks per second.
     *
     * The whole-second part scales by 'freq' directly. The nanosecond
     * remainder is routed through mult_frac() (presumably x * n / d, to be
     * confirmed against its definition) with a divisor of one billion.
     *
     * The divisor is written as an integer literal because mult_frac()'s
     * third parameter is a uint64_t; a floating literal such as 1e9 would
     * rely on an implicit double-to-integer conversion at every call.
     */
    return ts->tv_sec * freq + mult_frac(ts->tv_nsec, freq, 1000000000ULL);
}

/* On POSIX => use clock_gettime().
Expand All @@ -67,56 +59,79 @@ static inline uint64_t host_time_ns()
#if defined(HAVE_POSIX_TIMER)
struct timespec ts;
clock_gettime(CLOCKID, &ts);
return (uint64_t) ts.tv_sec * 1000000000ULL + (uint64_t) ts.tv_nsec;
return (uint64_t) ts.tv_sec * 1e9 + (uint64_t) ts.tv_nsec;

#elif defined(HAVE_MACH_TIMER)
static mach_timebase_info_data_t ts = {0};
if (ts.denom == 0)
(void) mach_timebase_info(&ts);

uint64_t now = mach_absolute_time();
// convert to nanoseconds: (now * t.numer / t.denom)
/* convert to nanoseconds: (now * t.numer / t.denom) */
return mult_frac(now, (double) ts.numer, (uint64_t) ts.denom);

#else
/* Minimal fallback: time(0) in seconds => convert to ns. */
time_t now_sec = time(0);
return (uint64_t) now_sec * 1000000000ULL;
return (uint64_t) now_sec * 1e9;
#endif
}

/* Measure the overhead of a high-resolution timer call, typically
* 'clock_gettime()' on POSIX or 'mach_absolute_time()' on macOS.
*
* 1) Times how long it takes to call 'host_time_ns()' repeatedly (target_loop).
* 1) Times how long it takes to call 'host_time_ns()' repeatedly (iterations).
* 2) Derives an average overhead per call => ns_per_call.
* 3) Because semu_timer_clocksource is ~10% of boot overhead, and called ~2e8
* times * SMP, we get predict_sec = ns_per_call * SMP * 2. Then set
* 'scale_factor' so the entire boot completes in SEMU_BOOT_TARGET_TIME
* seconds.
*/
static void measure_bogomips_ns(uint64_t target_loop)
static void measure_bogomips_ns(uint64_t iterations)
{
/* Mark start time in ns */
uint64_t start_ns = host_time_ns();
/* Perform 'iterations' times calling the host HRT.
*
*
* Assuming the cost of loop overhead is 'e' and the cost of 'host_time_ns'
* is 't', we perform a two-stage measurement to eliminate the loop
* overhead. In the first loop, 'host_time_ns' is called only once per
* iteration, while in the second loop, it is called twice per iteration.
*
* In this way, the cost of the first loop is 'e + t', and the cost of the
* second loop is 'e + 2t'. By subtracting the two, we can effectively
* eliminate the loop overhead.
*
* Reference:
* https://ates.dev/posts/2025-01-12-accurate-benchmarking/
*/
const uint64_t start_ns_1 = host_time_ns();
for (uint64_t loops = 0; loops < iterations; loops++)
(void) host_time_ns();

/* Perform 'target_loop' times calling the host HRT. */
for (uint64_t loops = 0; loops < target_loop; loops++)
const uint64_t end_ns_1 = host_time_ns();
const uint64_t elapsed_ns_1 = end_ns_1 - start_ns_1;

/* Second measurement */
const uint64_t start_ns_2 = host_time_ns();
for (uint64_t loops = 0; loops < iterations; loops++) {
(void) host_time_ns();
(void) host_time_ns();
}

/* Mark end time in ns */
uint64_t end_ns = host_time_ns();
const uint64_t end_ns_2 = host_time_ns();
const uint64_t elapsed_ns_2 = end_ns_2 - start_ns_2;

/* Calculate average overhead per call */
double ns_per_call = (double) (end_ns - start_ns) / (double) target_loop;
const double ns_per_call =
(double) (elapsed_ns_2 - elapsed_ns_1) / (double) iterations;

/* 'semu_timer_clocksource' is called ~2e8 times per SMP. Each call's
* overhead ~ ns_per_call. The total overhead is ~ ns_per_call * SMP * 2e8.
* That overhead is about 10% of the entire boot, so effectively:
* predict_sec = ns_per_call * SMP * 2
* Then scale_factor = (desired_time) / (predict_sec).
*/
double predict_sec = ns_per_call * SEMU_SMP * 2.0;
const double predict_sec = ns_per_call * SEMU_SMP * 2.0;
scale_factor = SEMU_BOOT_TARGET_TIME / predict_sec;
}

Expand All @@ -141,12 +156,11 @@ static uint64_t semu_timer_clocksource(semu_timer_t *timer)
uint64_t now_ns = host_time_ns();

/* real_ticks => (now_ns * freq) / 1e9 */
uint64_t real_ticks =
mult_frac(now_ns, (double) timer->freq, 1000000000ULL);
uint64_t real_ticks = mult_frac(now_ns, (double) timer->freq, 1e9);

/* scaled_ticks => (now_ns * (freq*scale_factor)) / 1e9 */
uint64_t scaled_ticks =
mult_frac(now_ns, (double) (timer->freq * scale_factor), 1000000000ULL);
mult_frac(now_ns, (double) (timer->freq * scale_factor), 1e9);

if (!boot_complete)
return scaled_ticks; /* Return scaled ticks in the boot phase. */
Expand Down
2 changes: 1 addition & 1 deletion utils.h
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ typedef struct {
uint64_t freq;
} semu_timer_t;

extern bool boot_complete; /* complete boot process and get in initrd */
extern bool boot_complete; /* Time to reach the first user process. */

void semu_timer_init(semu_timer_t *timer, uint64_t freq);
uint64_t semu_timer_get(semu_timer_t *timer);
Expand Down

0 comments on commit 875258e

Please sign in to comment.