From 7d592d80e783156e76d222ca7996667582c2e9c5 Mon Sep 17 00:00:00 2001
From: taozha2 <tao2.zhang@intel.com>
Date: Mon, 29 Apr 2024 23:28:26 -0700
Subject: [PATCH] update build.sh and result printing.

---
 build.sh                                     | 28 ++++++++++++++------
 examples/cute/tutorial/pvc_sycl/pvc_sycl.cpp |  6 ++++-
 2 files changed, 25 insertions(+), 9 deletions(-)

diff --git a/build.sh b/build.sh
index 234fcfd6fe..258ce10a23 100644
--- a/build.sh
+++ b/build.sh
@@ -1,16 +1,28 @@
-sycl_compiler_path=/opt/cutlass_compiler/
-target=./examples/cute/tutorial/pvc_sycl
+sycl_compiler_path=/opt/cutlass/compiler/0327/
+gpu_driver_path=/opt/cutlass/gpu_driver/hotfix_agama-ci-devel-803.25/extract/
 cuda_path=/usr/local/cuda-12.3/
 mkl_path=/opt/intel/oneapi/mkl/2024.1
-rm -rf $target
+
+# AOT (ahead-of-time) compile: value passed to -DDPCPP_SYCL_TARGET below
+output=intel_gpu_pvc
+
+# JIT (just-in-time) compile: uncomment the next line to use this target instead
+#output=spir64
+
+
 export ZE_AFFINITY_MASK=0
 export CPATH=$sycl_compiler_path:$sycl_compiler_path/include/:$sycl_compiler_path/include/sycl/:$mkl_path/include/
-export LIBRARY_PATH=/opt/intel/oneapi/mkl/2024.1/lib/
-export LD_LIBRARY_PATH=$mkl_path/lib/:${sycl_compiler_path}/lib/
+export LIBRARY_PATH=$gpu_driver_path/usr/lib/x86_64-linux-gnu/:$mkl_path/lib/:$sycl_compiler_path/lib/
+export LD_LIBRARY_PATH=$LIBRARY_PATH
 export IGC_EnableVISANoSchedule=1
 export IGC_ShaderDumpEnable=1
-export IGC_DumpToCustomDir=./mm_dumps_prefetch_coop
+export IGC_DumpToCustomDir=./mm_dumps
 export IGC_VATemp=1
+export ONEAPI_DEVICE_SELECTOR=level_zero:gpu
+
+target=./examples/cute/tutorial/pvc_sycl
+rm -rf $target
+
 cmake .. -G Ninja -DCMAKE_CUDA_HOST_COMPILER=${sycl_compiler_path}/bin/clang++ -DCMAKE_CUDA_COMPILER=$cuda_path/bin/nvcc \
--DCUTLASS_ENABLE_SYCL=ON -DDPCPP_SYCL_TARGET=intel_gpu_pvc -DCMAKE_CXX_COMPILER=${sycl_compiler_path}/bin/clang++ \
--DCMAKE_CXX_FLAGS=" -lmkl_intel_lp64 -lmkl_sequential -lmkl_core -DPREFETCH_DEFAULT" && ninja -v $target && ONEAPI_DEVICE_SELECTOR=level_zero:gpu $target
+-DCUTLASS_ENABLE_SYCL=ON -DDPCPP_SYCL_TARGET=$output -DCMAKE_CXX_COMPILER=${sycl_compiler_path}/bin/clang++ \
+-DCMAKE_CXX_FLAGS=" -lmkl_intel_lp64 -lmkl_sequential -lmkl_core -DPREFETCH_DEFAULT" && ninja -v $target && $target
diff --git a/examples/cute/tutorial/pvc_sycl/pvc_sycl.cpp b/examples/cute/tutorial/pvc_sycl/pvc_sycl.cpp
index 189c8d4746..64f238c52b 100644
--- a/examples/cute/tutorial/pvc_sycl/pvc_sycl.cpp
+++ b/examples/cute/tutorial/pvc_sycl/pvc_sycl.cpp
@@ -269,16 +269,19 @@ void cute_gemm(size_t M, size_t K, size_t N) {
   double average_event_time = 0.f;
   auto best = 999.f;
   for (uint32_t i = WARMUP_ITERATIONS; i < total_iterations; i++) {
+#if 0
     printf("GPU time is %f ms, Tflops is: %f, HBM (GBs) is %f\n",
            event_times[i] / 1e3, 2.0 * M * N * K / 1e12 / event_times[i],
            (M * K * sizeof(dtype_a) + K * N * sizeof(dtype_b) +
             M * N * sizeof(dtype_c)) /
                event_times[i] / 1e9);
+#endif
     average_event_time += event_times[i];
     best = min(best, event_times[i]);
   }
   average_event_time /= testIterations;
-  printf("Best is %f Tflops, %f HBM (GBs)\n", 2.0 * M * N * K / 1e12 / best,
+  printf("MKN (%zu, %zu, %zu), Best is %f ms, %f Tflops, %f HBM (GBs)\n", // M, K, N are size_t -> %zu
+         M, K, N, best * 1e3, 2.0 * M * N * K / 1e12 / best,
          (M * K * sizeof(dtype_a) + K * N * sizeof(dtype_b) +
           M * N * sizeof(dtype_c)) /
              best / 1e9);
@@ -305,6 +308,7 @@ void cute_gemm(size_t M, size_t K, size_t N) {
 }
 
 int main(int argc, char **argv) {
+  // Runtime arguments to cute_gemm are passed in (M, K, N) order.
   cute_gemm<256, 256, 32, 64, 32, 32>(2048, 2048, 2048);
   cute_gemm<256, 256, 32, 64, 32, 32>(4096, 4096, 4096);
   cute_gemm<256, 256, 32, 64, 32, 32>(8192, 8192, 8192);