sysprog21 · Mes0903 · Jan 15, 2025 · Jan 17, 2025 · ranvd · Jan 12, 2025
diff --git a/Makefile b/Makefile
@@ -56,6 +56,8 @@ OBJS := \
 	aclint.o \
 	$(OBJS_EXTRA)
 
+LDFLAGS := -pg
+
 deps := $(OBJS:%.o=.%.o.d)
 
 $(BIN): $(OBJS)
@@ -78,6 +80,8 @@ E :=
 S := $E $E
 
 SMP ?= 1
+CFLAGS += -D SEMU_SMP=$(SMP)
+CFLAGS += -D SEMU_BOOT_TARGET_TIME=10
 .PHONY: riscv-harts.dtsi
 riscv-harts.dtsi:
 	$(Q)python3 scripts/gen-hart-dts.py $@ $(SMP) $(CLOCK_FREQ)

diff --git a/auto_test.sh b/auto_test.sh
@@ -0,0 +1,30 @@
+#!/usr/bin/env bash
+# Boot semu for SMP=1..32, saving emulator and gprof logs under logs/.
+mkdir -p logs
+for N in $(seq 1 32); do
+    echo "============================================="
+    echo "Starting experiment with SMP=$N"
+    echo "============================================="
+
+    # 1) Clean; also drop any old profile in case 'make clean' leaves it.
+    make clean
+    rm -f gmon.out
+
+    # 2) Build and run with SMP=N; 'tee' mirrors the output into a log file.
+    echo "Building and running 'make check SMP=$N'..."
+    make check SMP="$N" 2>&1 | tee "logs/emulator_SMP_${N}.log"
+
+    # 3) gprof must read the gmon.out written by THIS run; skip if missing.
+    if [ ! -f gmon.out ]; then
+        echo "No gmon.out produced for SMP=$N; skipping gprof." >&2
+        continue
+    fi
+    echo "Running gprof for SMP=$N..."
+    gprof ./semu > "logs/gprof_SMP_${N}.log" 2>&1
+
+    echo "Done with SMP=$N. Logs saved:"
+    echo "  - logs/emulator_SMP_${N}.log"
+    echo "  - logs/gprof_SMP_${N}.log"
+    echo
+done
+echo "All experiments complete!"
diff --git a/riscv.c b/riscv.c
@@ -382,6 +382,14 @@ static void op_sret(hart_t *vm)
     vm->s_mode = vm->sstatus_spp;
     vm->sstatus_sie = vm->sstatus_spie;
 
+    /* After the booting process is complete, initrd will be loaded. At this
+     * point, the system will switch to U mode for the first time. Therefore,
+     * by checking whether the switch to U mode has already occurred, we can
+     * determine if the boot process has been completed.
+     */
+    if (!boot_complete && !vm->s_mode)
+        boot_complete = true;
+
     /* Reset stack */
     vm->sstatus_spp = false;
     vm->sstatus_spie = true;

diff --git a/utils.c b/utils.c
@@ -1,3 +1,5 @@
+#include <stdio.h>
+#include <stdlib.h>
 #include <time.h>
 
 #include "utils.h"
@@ -19,6 +21,14 @@
 #endif
 #endif
 
+bool boot_complete = false; /* set by op_sret() on first return to U mode */
+static double scale_factor; /* pre-boot tick scaling factor */
+
+/* for testing: instrumentation printed by semu_timer_clocksource() */
+uint64_t count = 0; /* number of semu_timer_clocksource() calls */
+struct timespec boot_begin, boot_end; /* host timestamps around the boot */
+double TEST_ns_per_call, TEST_predict_sec; /* copies of calibration values */
+
 /* Calculate "x * n / d" without unnecessary overflow or loss of precision.
  *
  * Reference:
@@ -32,35 +42,199 @@ static inline uint64_t mult_frac(uint64_t x, uint64_t n, uint64_t d)
     return q * n + r * n / d;
 }
 
-void semu_timer_init(semu_timer_t *timer, uint64_t freq)
+/* Return the host's current time in nanoseconds.
+ *
+ * POSIX => clock_gettime(CLOCKID). macOS => mach_absolute_time() scaled by
+ * the mach timebase. Otherwise => time(0), i.e. one-second resolution.
+ * The math stays in 64-bit integers: multiplying by the double 1e9 would
+ * round away the low bits once the result exceeds 2^53 ns.
+ */
+static inline uint64_t host_time_ns(void)
 {
-    timer->freq = freq;
-    semu_timer_rebase(timer, 0);
+#if defined(HAVE_POSIX_TIMER)
+    struct timespec ts;
+    clock_gettime(CLOCKID, &ts);
+    return (uint64_t) ts.tv_sec * 1000000000ULL + (uint64_t) ts.tv_nsec;
+
+#elif defined(HAVE_MACH_TIMER)
+    static mach_timebase_info_data_t ts = {0};
+    if (ts.denom == 0)
+        (void) mach_timebase_info(&ts);
+
+    uint64_t now = mach_absolute_time();
+    /* convert to nanoseconds: (now * ts.numer / ts.denom) */
+    return mult_frac(now, ts.numer, (uint64_t) ts.denom);
+
+#else
+    /* Minimal fallback: time(0) in seconds => convert to ns. */
+    time_t now_sec = time(0);
+    return (uint64_t) now_sec * 1000000000ULL;
+#endif
 }
 
-static uint64_t semu_timer_clocksource(uint64_t freq)
+/* Measure the overhead of a high-resolution timer call, typically
+ * 'clock_gettime()' on POSIX or 'mach_absolute_time()' on macOS.
+ *
+ * 1) Times how long it takes to call 'host_time_ns()' repeatedly (iterations).
+ * 2) Derives an average overhead per call => ns_per_call.
+ * 3) Because semu_timer_clocksource is ~10% of boot overhead, and called ~2e8
+ *    times * SMP, we get predict_sec = ns_per_call * SMP * 2. Then set
+ *    'scale_factor' so the entire boot completes in SEMU_BOOT_TARGET_TIME
+ *    seconds. If the measurement is unusable, 'scale_factor' is set to 1.0.
+ */
+static void measure_bogomips_ns(uint64_t iterations)
 {
-#if defined(HAVE_POSIX_TIMER)
-    struct timespec t;
-    clock_gettime(CLOCKID, &t);
-    return t.tv_sec * freq + mult_frac(t.tv_nsec, freq, 1e9);
+    /* Call the host HRT 'iterations' times in two passes so that the loop
+     * overhead cancels out: with loop cost 'e' and 'host_time_ns' cost 't',
+     * the first pass (one call per iteration) costs 'e + t' and the second
+     * (two calls per iteration) costs 'e + 2t'. Their difference is 't'.
+     *
+     * Reference:
+     * https://ates.dev/posts/2025-01-12-accurate-benchmarking/
+     */
+    const uint64_t start_ns_1 = host_time_ns();
+    for (uint64_t loops = 0; loops < iterations; loops++)
+        (void) host_time_ns();
+    const uint64_t end_ns_1 = host_time_ns();
+    const uint64_t elapsed_ns_1 = end_ns_1 - start_ns_1;
+
+    /* Second measurement */
+    const uint64_t start_ns_2 = host_time_ns();
+    for (uint64_t loops = 0; loops < iterations; loops++) {
+        (void) host_time_ns();
+        (void) host_time_ns();
+    }
+    const uint64_t end_ns_2 = host_time_ns();
+    const uint64_t elapsed_ns_2 = end_ns_2 - start_ns_2;
+
+    /* A coarse or noisy clock can make the second pass look no slower than
+     * the first, and 'iterations' may be 0. The unsigned subtraction below
+     * would then wrap (or the division yield inf/NaN), so keep real time.
+     */
+    if (iterations == 0 || elapsed_ns_2 <= elapsed_ns_1) {
+        scale_factor = 1.0;
+        return;
+    }
+
+    /* Calculate average overhead per call */
+    const double ns_per_call =
+        (double) (elapsed_ns_2 - elapsed_ns_1) / (double) iterations;
+
+    /* 'semu_timer_clocksource' is called ~2e8 times per SMP. Each call's
+     * overhead ~ ns_per_call. The total overhead is ~ ns_per_call * SMP *
+     * 2e8. That overhead is about 10% of the entire boot, so effectively:
+     *   predict_sec = ns_per_call * SMP * 2e8 * (100%/10%) / 1e9
+     *               = ns_per_call * SMP * 2.0
+     * Then scale_factor = (desired_time) / (predict_sec).
+     */
+    const double predict_sec = ns_per_call * SEMU_SMP * 2.0;
+    scale_factor = SEMU_BOOT_TARGET_TIME / predict_sec;
+
+    /* for testing */
+    TEST_ns_per_call = ns_per_call;
+    TEST_predict_sec = predict_sec;
+}
+
+/* The main function that returns the "emulated time" in ticks.
+ *
+ * Before the boot completes, we scale time by 'scale_factor' for a "fake
+ * increments" approach. After boot completes, we switch to real time
+ * with an offset bridging so that there's no big jump.
+ */
+static uint64_t semu_timer_clocksource(semu_timer_t *timer)
+{
+    count++;
+
+    /* After the boot process completes, the timer switches to real time, so
+     * there is an offset between the real time and the emulator time.
+     *
+     * After switching to real time, the correct way to update time is to
+     * calculate the increment of time. Then add it to the emulator time.
+     */
+    static int64_t offset = 0;
+    static bool first_switch = true;
+
+#if defined(HAVE_POSIX_TIMER) || defined(HAVE_MACH_TIMER)
+    uint64_t now_ns = host_time_ns();
+
+    /* real_ticks = (now_ns * freq) / 1e9 */
+    uint64_t real_ticks = mult_frac(now_ns, timer->freq, 1e9);
+
+    /* scaled_ticks = (now_ns * (freq*scale_factor)) / 1e9
+     *              = ((now_ns * freq) / 1e9) * scale_factor
+     */
+    uint64_t scaled_ticks = real_ticks * scale_factor;
+
+    if (!boot_complete)
+        return scaled_ticks; /* Return scaled ticks in the boot phase. */
+
+    /* The boot is done => switch to real freq with an offset bridging. */
+    if (first_switch) {
+        clock_gettime(CLOCKID, &boot_end);
+        double boot_time = (boot_end.tv_sec - boot_begin.tv_sec) +
+                           (boot_end.tv_nsec - boot_begin.tv_nsec) / 1e9;
+
+        first_switch = false;
+        offset = (int64_t) (real_ticks - scaled_ticks);
+        printf(
+            "\033[1;31m[SEMU LOG]: Boot time: %.5f seconds, called %llu "
+            "times semu_timer_total_ticks\033[0m\n",
+            boot_time, (unsigned long long) count);
+
+        printf(
+            "\033[1;31m[SEMU LOG]: ns_per_call = %.5f, predict_sec = %.5f, "
+            "scale_factor = %.5f\033[0m\n",
+            TEST_ns_per_call, TEST_predict_sec, scale_factor);
+        /* Test instrumentation: stop the emulator once boot is measured. */
+        exit(0);
+    }
+    return (uint64_t) ((int64_t) real_ticks - offset);
+
-#elif defined(HAVE_MACH_TIMER)
-    static mach_timebase_info_data_t t;
-    if (t.denom == 0)
-        (void) mach_timebase_info(&t);
-    return mult_frac(mult_frac(mach_absolute_time(), t.numer, t.denom), freq,
-                     1e9);
-#else
-    return time(0) * freq;
+#else
+    /* Because we don't rely on sub-second calls to 'host_time_ns()' here,
+     * we directly use time(0). This means the time resolution is coarse (1
+     * second), but the logic is the same: we do a scaled approach pre-boot,
+     * then real freq with an offset post-boot.
+     */
+    time_t now_sec = time(0);
+
+    /* Before boot done, scale time. */
+    if (!boot_complete)
+        return (uint64_t) now_sec * (uint64_t) (timer->freq * scale_factor);
+
+    if (first_switch) {
+        first_switch = false;
+        uint64_t real_val = (uint64_t) now_sec * (uint64_t) timer->freq;
+        uint64_t scaled_val =
+            (uint64_t) now_sec * (uint64_t) (timer->freq * scale_factor);
+        offset = (int64_t) (real_val - scaled_val);
+    }
+
+    /* Return real freq minus offset. */
+    uint64_t real_freq_val = (uint64_t) now_sec * (uint64_t) timer->freq;
+    return real_freq_val - offset;
 #endif
 }
 
+void semu_timer_init(semu_timer_t *timer, uint64_t freq)
+{
+    /* Calibrate 'scale_factor' by timing 'host_time_ns()', using 'freq' as
+     * the loop count. This runs before 'boot_begin' is sampled, so the
+     * calibration loops are not counted in the reported boot time.
+     */
+    measure_bogomips_ns(freq);
+
+    clock_gettime(CLOCKID, &boot_begin);
+    timer->freq = freq; /* read by the rebase below */
+    semu_timer_rebase(timer, 0);
+}
+
 uint64_t semu_timer_get(semu_timer_t *timer)
 {
-    return semu_timer_clocksource(timer->freq) - timer->begin;
+    return semu_timer_clocksource(timer) - timer->begin; /* elapsed ticks */
 }
 
 void semu_timer_rebase(semu_timer_t *timer, uint64_t time)
 {
-    timer->begin = semu_timer_clocksource(timer->freq) - time;
+    timer->begin = semu_timer_clocksource(timer) - time; /* get() == time now */
 }
diff --git a/utils.h b/utils.h
@@ -1,7 +1,10 @@
 #pragma once
 
+#include <stdbool.h>
 #include <stdint.h>
 
+extern bool boot_complete; /* True once the guest first enters U mode. */
+
 /* TIMER */
 typedef struct {
     uint64_t begin;