Eliminate loop overhead

sysprog21 · Jan 14, 2025 · 875258e · 875258e
1 parent 03b0560
commit 875258e
Show file tree

Hide file tree

Showing 2 changed files with 40 additions and 26 deletions.
diff --git a/utils.c b/utils.c
@@ -19,14 +19,6 @@
 #endif
 #endif
 
-#ifndef SEMU_SMP
-#define SEMU_SMP 1
-#endif
-
-#ifndef SEMU_BOOT_TARGET_TIME
-#define SEMU_BOOT_TARGET_TIME 10
-#endif
-
 bool boot_complete = false;
 static double scale_factor;
 
@@ -52,7 +44,7 @@ static inline uint64_t mult_frac(uint64_t x, double n, uint64_t d)
  */
 static inline uint64_t get_ticks(struct timespec *ts, double freq)
 {
-    return ts->tv_sec * freq + mult_frac(ts->tv_nsec, freq, 1000000000ULL);
+    return ts->tv_sec * freq + mult_frac(ts->tv_nsec, freq, 1e9);
 }
 
 /* On POSIX => use clock_gettime().
@@ -67,56 +59,79 @@ static inline uint64_t host_time_ns()
 #if defined(HAVE_POSIX_TIMER)
     struct timespec ts;
     clock_gettime(CLOCKID, &ts);
-    return (uint64_t) ts.tv_sec * 1000000000ULL + (uint64_t) ts.tv_nsec;
+    return (uint64_t) ts.tv_sec * 1e9 + (uint64_t) ts.tv_nsec;
 
 #elif defined(HAVE_MACH_TIMER)
     static mach_timebase_info_data_t ts = {0};
     if (ts.denom == 0)
         (void) mach_timebase_info(&ts);
 
     uint64_t now = mach_absolute_time();
-    // convert to nanoseconds: (now * t.numer / t.denom)
+    /* convert to nanoseconds: (now * ts.numer / ts.denom) */
     return mult_frac(now, (double) ts.numer, (uint64_t) ts.denom);
 
 #else
     /* Minimal fallback: time(0) in seconds => convert to ns. */
     time_t now_sec = time(0);
-    return (uint64_t) now_sec * 1000000000ULL;
+    return (uint64_t) now_sec * 1e9;
 #endif
 }
 
 /* Measure the overhead of a high-resolution timer call, typically
  * 'clock_gettime()' on POSIX or 'mach_absolute_time()' on macOS.
  *
- * 1) Times how long it takes to call 'host_time_ns()' repeatedly (target_loop).
+ * 1) Times how long it takes to call 'host_time_ns()' repeatedly (iterations).
  * 2) Derives an average overhead per call => ns_per_call.
  * 3) Because semu_timer_clocksource is ~10% of boot overhead, and called ~2e8
  *    times * SMP, we get predict_sec = ns_per_call * SMP * 2. Then set
  *    'scale_factor' so the entire boot completes in SEMU_BOOT_TARGET_TIME
  *    seconds.
  */
-static void measure_bogomips_ns(uint64_t target_loop)
+static void measure_bogomips_ns(uint64_t iterations)
 {
-    /* Mark start time in ns */
-    uint64_t start_ns = host_time_ns();
+    /* Time two loops of 'iterations' passes each, both calling the host
+     * high-resolution timer (HRT).
+     *
+     * Assuming the cost of loop overhead is 'e' and the cost of 'host_time_ns'
+     * is 't', we perform a two-stage measurement to eliminate the loop
+     * overhead. In the first loop, 'host_time_ns' is called only once per
+     * iteration, while in the second loop, it is called twice per iteration.
+     *
+     * In this way, each iteration of the first loop costs 'e + t' and each
+     * iteration of the second costs 'e + 2t'. Subtracting the two totals
+     * leaves 'iterations * t', so the loop overhead cancels out.
+     *
+     * Reference:
+     * https://ates.dev/posts/2025-01-12-accurate-benchmarking/
+     */
+    const uint64_t start_ns_1 = host_time_ns();
+    for (uint64_t loops = 0; loops < iterations; loops++)
+        (void) host_time_ns();
 
-    /* Perform 'target_loop' times calling the host HRT. */
-    for (uint64_t loops = 0; loops < target_loop; loops++)
+    const uint64_t end_ns_1 = host_time_ns();
+    const uint64_t elapsed_ns_1 = end_ns_1 - start_ns_1;
+
+    /* Second measurement */
+    const uint64_t start_ns_2 = host_time_ns();
+    for (uint64_t loops = 0; loops < iterations; loops++) {
+        (void) host_time_ns();
         (void) host_time_ns();
+    }
 
-    /* Mark end time in ns */
-    uint64_t end_ns = host_time_ns();
+    const uint64_t end_ns_2 = host_time_ns();
+    const uint64_t elapsed_ns_2 = end_ns_2 - start_ns_2;
 
     /* Calculate average overhead per call */
-    double ns_per_call = (double) (end_ns - start_ns) / (double) target_loop;
+    const double ns_per_call =
+        (double) (elapsed_ns_2 - elapsed_ns_1) / (double) iterations;
 
     /* 'semu_timer_clocksource' is called ~2e8 times per SMP. Each call's
      * overhead ~ ns_per_call. The total overhead is ~ ns_per_call * SMP * 2e8.
      * That overhead is about 10% of the entire boot, so effectively:
      *   predict_sec = ns_per_call * SMP * 2
      * Then scale_factor = (desired_time) / (predict_sec).
      */
-    double predict_sec = ns_per_call * SEMU_SMP * 2.0;
+    const double predict_sec = ns_per_call * SEMU_SMP * 2.0;
     scale_factor = SEMU_BOOT_TARGET_TIME / predict_sec;
 }
 
@@ -141,12 +156,11 @@ static uint64_t semu_timer_clocksource(semu_timer_t *timer)
     uint64_t now_ns = host_time_ns();
 
     /* real_ticks => (now_ns * freq) / 1e9 */
-    uint64_t real_ticks =
-        mult_frac(now_ns, (double) timer->freq, 1000000000ULL);
+    uint64_t real_ticks = mult_frac(now_ns, (double) timer->freq, 1e9);
 
     /* scaled_ticks => (now_ns * (freq*scale_factor)) / 1e9 */
     uint64_t scaled_ticks =
-        mult_frac(now_ns, (double) (timer->freq * scale_factor), 1000000000ULL);
+        mult_frac(now_ns, (double) (timer->freq * scale_factor), 1e9);
 
     if (!boot_complete)
         return scaled_ticks; /* Return scaled ticks in the boot phase. */

diff --git a/utils.h b/utils.h
@@ -9,7 +9,7 @@ typedef struct {
     uint64_t freq;
 } semu_timer_t;
 
-extern bool boot_complete; /* complete boot process and get in initrd */
+extern bool boot_complete; /* True once boot reaches the first user process. */
 
 void semu_timer_init(semu_timer_t *timer, uint64_t freq);
 uint64_t semu_timer_get(semu_timer_t *timer);