From 7d592d80e783156e76d222ca7996667582c2e9c5 Mon Sep 17 00:00:00 2001
From: taozha2
Date: Mon, 29 Apr 2024 23:28:26 -0700
Subject: [PATCH] update build.sh and result printing.

build.sh:
  - point sycl_compiler_path at /opt/cutlass/compiler/0327/ and add
    gpu_driver_path
  - add an `output` variable (AOT target intel_gpu_pvc is active, the JIT
    target spir64 is left commented out) and pass it as DPCPP_SYCL_TARGET
  - put the GPU driver, MKL and compiler lib directories on LIBRARY_PATH
    and reuse that value for LD_LIBRARY_PATH
  - dump IGC shaders to ./mm_dumps
  - export ONEAPI_DEVICE_SELECTOR once instead of prefixing the run command
  - define `target` and remove the old binary right before the cmake call

pvc_sycl.cpp:
  - compile out the per-iteration timing printf with #if 0
  - print the problem size (M, K, N) and the best time in ms in the
    summary line

---
Review notes (ignored by `git am`):
  - M, K and N are size_t (see the cute_gemm signature in the hunk
    header), so the summary printf uses %zu. The previous %d was a
    format/argument mismatch.
  - This patch was recovered from a copy whose line breaks and indentation
    had been collapsed. Line breaks were re-derived from the hunk counts
    and the diffstat. The leading whitespace of the C++ lines is ASSUMED
    to be 2-space indentation -- TODO confirm against the target tree, or
    apply with `git apply --ignore-whitespace`.

 build.sh                                     | 28 ++++++++++++++------
 examples/cute/tutorial/pvc_sycl/pvc_sycl.cpp |  6 ++++-
 2 files changed, 25 insertions(+), 9 deletions(-)

diff --git a/build.sh b/build.sh
index 234fcfd6fe..258ce10a23 100644
--- a/build.sh
+++ b/build.sh
@@ -1,16 +1,28 @@
-sycl_compiler_path=/opt/cutlass_compiler/
-target=./examples/cute/tutorial/pvc_sycl
+sycl_compiler_path=/opt/cutlass/compiler/0327/
+gpu_driver_path=/opt/cutlass/gpu_driver/hotfix_agama-ci-devel-803.25/extract/
 cuda_path=/usr/local/cuda-12.3/
 mkl_path=/opt/intel/oneapi/mkl/2024.1
-rm -rf $target
+
+# AOT compile
+output=intel_gpu_pvc
+
+# jit compile
+#output=spir64
+
+
 export ZE_AFFINITY_MASK=0
 export CPATH=$sycl_compiler_path:$sycl_compiler_path/include/:$sycl_compiler_path/include/sycl/:$mkl_path/include/
-export LIBRARY_PATH=/opt/intel/oneapi/mkl/2024.1/lib/
-export LD_LIBRARY_PATH=$mkl_path/lib/:${sycl_compiler_path}/lib/
+export LIBRARY_PATH=$gpu_driver_path/usr/lib/x86_64-linux-gnu/:$mkl_path/lib/:$sycl_compiler_path/lib/
+export LD_LIBRARY_PATH=$LIBRARY_PATH
 export IGC_EnableVISANoSchedule=1
 export IGC_ShaderDumpEnable=1
-export IGC_DumpToCustomDir=./mm_dumps_prefetch_coop
+export IGC_DumpToCustomDir=./mm_dumps
 export IGC_VATemp=1
+export ONEAPI_DEVICE_SELECTOR=level_zero:gpu
+
+target=./examples/cute/tutorial/pvc_sycl
+rm -rf $target
+
 cmake .. -G Ninja -DCMAKE_CUDA_HOST_COMPILER=${sycl_compiler_path}/bin/clang++ -DCMAKE_CUDA_COMPILER=$cuda_path/bin/nvcc \
--DCUTLASS_ENABLE_SYCL=ON -DDPCPP_SYCL_TARGET=intel_gpu_pvc -DCMAKE_CXX_COMPILER=${sycl_compiler_path}/bin/clang++ \
--DCMAKE_CXX_FLAGS=" -lmkl_intel_lp64 -lmkl_sequential -lmkl_core -DPREFETCH_DEFAULT" && ninja -v $target && ONEAPI_DEVICE_SELECTOR=level_zero:gpu $target
+-DCUTLASS_ENABLE_SYCL=ON -DDPCPP_SYCL_TARGET=$output -DCMAKE_CXX_COMPILER=${sycl_compiler_path}/bin/clang++ \
+-DCMAKE_CXX_FLAGS=" -lmkl_intel_lp64 -lmkl_sequential -lmkl_core -DPREFETCH_DEFAULT" && ninja -v $target && $target
diff --git a/examples/cute/tutorial/pvc_sycl/pvc_sycl.cpp b/examples/cute/tutorial/pvc_sycl/pvc_sycl.cpp
index 189c8d4746..64f238c52b 100644
--- a/examples/cute/tutorial/pvc_sycl/pvc_sycl.cpp
+++ b/examples/cute/tutorial/pvc_sycl/pvc_sycl.cpp
@@ -269,16 +269,19 @@ void cute_gemm(size_t M, size_t K, size_t N) {
   double average_event_time = 0.f;
   auto best = 999.f;
   for (uint32_t i = WARMUP_ITERATIONS; i < total_iterations; i++) {
+#if 0
     printf("GPU time is %f ms, Tflops is: %f, HBM (GBs) is %f\n",
            event_times[i] / 1e3, 2.0 * M * N * K / 1e12 / event_times[i],
            (M * K * sizeof(dtype_a) + K * N * sizeof(dtype_b) +
             M * N * sizeof(dtype_c)) /
                event_times[i] / 1e9);
+#endif
     average_event_time += event_times[i];
     best = min(best, event_times[i]);
   }
   average_event_time /= testIterations;
-  printf("Best is %f Tflops, %f HBM (GBs)\n", 2.0 * M * N * K / 1e12 / best,
+  printf("MKN (%zu, %zu, %zu), Best is %f ms, %f Tflops, %f HBM (GBs)\n", M, K, N,
+         best * 1e3, 2.0 * M * N * K / 1e12 / best,
          (M * K * sizeof(dtype_a) + K * N * sizeof(dtype_b) +
           M * N * sizeof(dtype_c)) /
              best / 1e9);
@@ -305,6 +308,7 @@ void cute_gemm(size_t M, size_t K, size_t N) {
 }
 
 int main(int argc, char **argv) {
+  // M, K, N
   cute_gemm<256, 256, 32, 64, 32, 32>(2048, 2048, 2048);
   cute_gemm<256, 256, 32, 64, 32, 32>(4096, 4096, 4096);
   cute_gemm<256, 256, 32, 64, 32, 32>(8192, 8192, 8192);