From 3d84a8e77b851fb5367d4755e739596fe993f9a9 Mon Sep 17 00:00:00 2001 From: Alejandro Acosta Date: Tue, 21 May 2024 09:20:43 +0100 Subject: [PATCH] Add sycl equivalent to cuda events for profiling --- .../ampere_tf32_tensorop_gemm_cute.cpp | 7 + examples/cute/tutorial/sgemm_1_sycl.cpp | 8 ++ examples/cute/tutorial/sgemm_2_sycl.cpp | 8 ++ examples/cute/tutorial/sgemm_sm70_sycl.cpp | 8 ++ examples/cute/tutorial/sgemm_sm80_sycl.cpp | 8 ++ examples/sycl/common/example_runner.hpp | 10 ++ .../sycl/pvc/pvc_bfloat_dpas_gemm_cute.cpp | 13 -- .../gemm/device/gemm_universal_adapter.h | 9 +- tools/util/include/cutlass/util/GPU_Clock.hpp | 33 +++-- .../include/cutlass/util/event_manager.hpp | 133 ++++++++++++++++++ 10 files changed, 207 insertions(+), 30 deletions(-) create mode 100644 tools/util/include/cutlass/util/event_manager.hpp diff --git a/examples/14_ampere_tf32_tensorop_gemm/ampere_tf32_tensorop_gemm_cute.cpp b/examples/14_ampere_tf32_tensorop_gemm/ampere_tf32_tensorop_gemm_cute.cpp index 77bae13ca1..4bea284d8f 100644 --- a/examples/14_ampere_tf32_tensorop_gemm/ampere_tf32_tensorop_gemm_cute.cpp +++ b/examples/14_ampere_tf32_tensorop_gemm/ampere_tf32_tensorop_gemm_cute.cpp @@ -121,6 +121,13 @@ run(Gemm_Op gemm_op) void test_gemm(int m, int n, int k) { + sycl::property_list prop = { + sycl::property::queue::in_order(), + sycl::property::queue::enable_profiling() + }; + + auto q = sycl::queue(syclcompat::get_default_context(), syclcompat::get_current_device(), prop); + syclcompat::set_default_queue(q); std::cout << "M = " << m << std::endl; std::cout << "N = " << n << std::endl; diff --git a/examples/cute/tutorial/sgemm_1_sycl.cpp b/examples/cute/tutorial/sgemm_1_sycl.cpp index 764b1f105e..ab4aa38538 100644 --- a/examples/cute/tutorial/sgemm_1_sycl.cpp +++ b/examples/cute/tutorial/sgemm_1_sycl.cpp @@ -376,6 +376,14 @@ int main(int argc, char** argv) { char transB = 'T'; if (argc >= 6) sscanf(argv[5], "%c", &transB); + sycl::property_list prop = { + sycl::property::queue::in_order(), + sycl::property::queue::enable_profiling() + }; + + auto q = sycl::queue(syclcompat::get_default_context(), syclcompat::get_current_device(), prop); + syclcompat::set_default_queue(q); + using TA = float; using TB = float; using TC = float; diff --git a/examples/cute/tutorial/sgemm_2_sycl.cpp b/examples/cute/tutorial/sgemm_2_sycl.cpp index bb6d1fe214..3662011bbb 100644 --- a/examples/cute/tutorial/sgemm_2_sycl.cpp +++ b/examples/cute/tutorial/sgemm_2_sycl.cpp @@ -397,6 +397,14 @@ int main(int argc, char** argv) { char transB = 'T'; if (argc >= 6) sscanf(argv[5], "%c", &transB); + sycl::property_list prop = { + sycl::property::queue::in_order(), + sycl::property::queue::enable_profiling() + }; + + auto q = sycl::queue(syclcompat::get_default_context(), syclcompat::get_current_device(), prop); + syclcompat::set_default_queue(q); + using TA = float; using TB = float; using TC = float; diff --git a/examples/cute/tutorial/sgemm_sm70_sycl.cpp b/examples/cute/tutorial/sgemm_sm70_sycl.cpp index 3ad03e3c9c..18e1cbf2e2 100644 --- a/examples/cute/tutorial/sgemm_sm70_sycl.cpp +++ b/examples/cute/tutorial/sgemm_sm70_sycl.cpp @@ -385,6 +385,14 @@ int main(int argc, char** argv) { char transB = 'T'; if (argc >= 6) sscanf(argv[5], "%c", &transB); + sycl::property_list prop = { + sycl::property::queue::in_order(), + sycl::property::queue::enable_profiling() + }; + + auto q = sycl::queue(syclcompat::get_default_context(), syclcompat::get_current_device(), prop); + syclcompat::set_default_queue(q); + using TA = float; using TB = float; using TC = float; diff --git a/examples/cute/tutorial/sgemm_sm80_sycl.cpp b/examples/cute/tutorial/sgemm_sm80_sycl.cpp index 1bd288f1c8..f01871d9a5 100644 --- a/examples/cute/tutorial/sgemm_sm80_sycl.cpp +++ b/examples/cute/tutorial/sgemm_sm80_sycl.cpp @@ -468,6 +468,14 @@ int main(int argc, char** argv) { char transB = 'T'; if (argc >= 6) sscanf(argv[5], "%c", &transB); + sycl::property_list prop = { + sycl::property::queue::in_order(), + sycl::property::queue::enable_profiling() + }; + + auto q = sycl::queue(syclcompat::get_default_context(), syclcompat::get_current_device(), prop); + syclcompat::set_default_queue(q); + using TA = float; using TB = float; using TC = float; diff --git a/examples/sycl/common/example_runner.hpp b/examples/sycl/common/example_runner.hpp index f300012caa..767e074616 100644 --- a/examples/sycl/common/example_runner.hpp +++ b/examples/sycl/common/example_runner.hpp @@ -208,6 +208,16 @@ struct ExampleRunner { auto problem_shape_MNKL = cute::append<4>(problem_size, 1); auto [M, N, K, L] = problem_shape_MNKL; +#if defined(CUTLASS_ENABLE_SYCL) + sycl::property_list prop = { + sycl::property::queue::in_order(), + sycl::property::queue::enable_profiling() + }; + + auto q = sycl::queue(syclcompat::get_default_context(), syclcompat::get_current_device(), prop); + syclcompat::set_default_queue(q); +#endif + stride_A = cutlass::make_cute_packed_stride(StrideA{}, cute::make_shape(M, K, L)); stride_B = cutlass::make_cute_packed_stride(StrideB{}, cute::make_shape(N, K, L)); stride_C = cutlass::make_cute_packed_stride(StrideC{}, cute::make_shape(M, N, L)); diff --git a/examples/sycl/pvc/pvc_bfloat_dpas_gemm_cute.cpp b/examples/sycl/pvc/pvc_bfloat_dpas_gemm_cute.cpp index 0879034f16..23d796a7e5 100644 --- a/examples/sycl/pvc/pvc_bfloat_dpas_gemm_cute.cpp +++ b/examples/sycl/pvc/pvc_bfloat_dpas_gemm_cute.cpp @@ -29,19 +29,6 @@ * **************************************************************************************************/ -#include "cutlass/gemm/device/gemm.h" -#include "cutlass/epilogue/collective/default_epilogue.hpp" -#include "cutlass/gemm/device/gemm_universal.h" -#include "cutlass/gemm/device/gemm_universal_adapter.h" -#include "cutlass/gemm/collective/collective_mma.hpp" -#include "cutlass/util/GPU_Clock.hpp" - -#include - -#include "cutlass/util/device_memory.h" -#include "cutlass/util/packed_stride.hpp" -#include "cutlass/util/reference/device/gemm_complex.h" - #include "../common/example_runner.hpp" using namespace cute; diff --git a/include/cutlass/gemm/device/gemm_universal_adapter.h b/include/cutlass/gemm/device/gemm_universal_adapter.h index 680dd5d4ce..979252fbcb 100644 --- a/include/cutlass/gemm/device/gemm_universal_adapter.h +++ b/include/cutlass/gemm/device/gemm_universal_adapter.h @@ -58,6 +58,10 @@ // 3.x #include "cutlass/gemm/kernel/gemm_universal.hpp" +#if defined(CUTLASS_ENABLE_SYCL) +#include "cutlass/util/event_manager.hpp" +#endif + //////////////////////////////////////////////////////////////////////////////// namespace cutlass::gemm::device { @@ -403,10 +407,11 @@ class GemmUniversalAdapter< const auto sycl_grid = syclcompat::dim3(grid.x, grid.y, grid.z); #if defined (SYCL_INTEL_TARGET) - syclcompat::experimental::launch, DispatchPolicy::SubgroupSize>(sycl_grid, sycl_block, smem_size, params); + auto event = syclcompat::experimental::launch, DispatchPolicy::SubgroupSize>(sycl_grid, sycl_block, smem_size, params); #else - syclcompat::launch>(sycl_grid, sycl_block, smem_size, params); + auto event = syclcompat::launch>(sycl_grid, sycl_block, smem_size, params); #endif + EventManager::getInstance().addEvent(event); #else device_kernel<<>>(params); #endif diff --git a/tools/util/include/cutlass/util/GPU_Clock.hpp b/tools/util/include/cutlass/util/GPU_Clock.hpp index bd017a777a..f760ead1e7 100644 --- a/tools/util/include/cutlass/util/GPU_Clock.hpp +++ b/tools/util/include/cutlass/util/GPU_Clock.hpp @@ -32,31 +32,37 @@ #pragma once #if defined(CUTLASS_ENABLE_SYCL) -#include -#include +#include "cutlass/util/event_manager.hpp" #else #include #endif struct GPU_Clock { -#if !defined(CUTLASS_ENABLE_SYCL) GPU_Clock() { +#if defined(CUTLASS_ENABLE_SYCL) + start_ = SyclEvent{}; + stop_ = SyclEvent{}; +#else cudaEventCreate(&start_); cudaEventCreate(&stop_); cudaEventRecord(start_); +#endif } ~GPU_Clock() { +#if defined(CUTLASS_ENABLE_SYCL) + syclEventDestroy(start_); + syclEventDestroy(stop_); +#else cudaEventDestroy(start_); cudaEventDestroy(stop_); - } #endif + } void start() { #if defined(CUTLASS_ENABLE_SYCL) - syclcompat::get_default_queue().wait(); - start_ = std::chrono::high_resolution_clock::now(); + syclEventRecord(start_); #else cudaEventRecord(start_); #endif @@ -64,10 +70,11 @@ struct GPU_Clock float milliseconds() { #if defined(CUTLASS_ENABLE_SYCL) - syclcompat::get_default_queue().wait(); - auto stop = std::chrono::high_resolution_clock::now(); - std::chrono::duration time = stop - start_; - return time.count(); + syclEventRecord(stop_); + syclEventSynchronize(start_, stop_); + float time; + syclEventElapsedTime(&time, start_, stop_); + return time; #else cudaEventRecord(stop_); cudaEventSynchronize(stop_); @@ -83,11 +90,7 @@ struct GPU_Clock private: #if defined(CUTLASS_ENABLE_SYCL) - typedef std::chrono::nanoseconds duration; - typedef std::chrono::high_resolution_clock high_resolution_clock; - typedef std::chrono::time_point time_point; - - time_point start_ = std::chrono::high_resolution_clock::now(); + SyclEvent start_, stop_; #else cudaEvent_t start_, stop_; #endif diff --git a/tools/util/include/cutlass/util/event_manager.hpp b/tools/util/include/cutlass/util/event_manager.hpp new file mode 100644 index 0000000000..99006c693f --- /dev/null +++ b/tools/util/include/cutlass/util/event_manager.hpp @@ -0,0 +1,133 @@ +/*************************************************************************************************** + * Copyright (c) 2024 - 2024 Codeplay Software Ltd. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +#pragma once + +#include +#include + +class SyclEvent { +private: + int index; + +public: + SyclEvent() : index(-1) { + }; + + int getIndex() const { + return index; + } + + SyclEvent& operator=(int const& value) { + index = value; + return *this; + }; +}; + +class EventManager { +public: + static EventManager& getInstance() + { + static EventManager instance; + return instance; + } +private: + EventManager() {} + std::vector events{}; + int recorders = 0; + +public: + EventManager(EventManager const&) = delete; + void operator=(EventManager const&) = delete; + + void startRecording(SyclEvent &event) { + if (event.getIndex() != -1) { + throw std::runtime_error("Event is already being recorded."); + } + recorders++; + event = static_cast(events.size()); + } + + void addEvent(const sycl::event &event) { + events.push_back(event); + } + + void eventDestroy() { + recorders--; + if (!recorders) { + events.clear(); + } + } + + float getEventElapsedTimeMs(SyclEvent const& begin, SyclEvent const& end) { + if (begin.getIndex() < 0 || begin.getIndex() > end.getIndex() || end.getIndex() > events.size()) { + throw std::runtime_error("Index out of bounds"); + } + + auto time_event = 0.0; + for (int i = begin.getIndex(); i < end.getIndex(); ++i) { + auto start_time = events[i].template get_profiling_info< + sycl::info::event_profiling::command_start>(); + + auto end_time = events[i].template get_profiling_info< + sycl::info::event_profiling::command_end>(); + + time_event += static_cast(end_time - start_time); + } + return time_event * 1e-6; + } + + void wait(SyclEvent const& begin, SyclEvent const& end) { + if (begin.getIndex() < 0 || begin.getIndex() > end.getIndex() || end.getIndex() > events.size()) { + throw std::runtime_error("Index out of bounds"); + } + + for (int i = begin.getIndex(); i < end.getIndex(); ++i) { + events[i].wait(); + } + } + +}; + +void syclEventDestroy(SyclEvent const& event) { + EventManager::getInstance().eventDestroy(); +} + +void syclEventRecord(SyclEvent &event) { + EventManager::getInstance().startRecording(event); +} + +void syclEventSynchronize(SyclEvent const& begin, SyclEvent const& end) { + EventManager::getInstance().wait(begin, end); +} + +void syclEventElapsedTime(float* time, SyclEvent const& begin, SyclEvent const& end) { + *time = EventManager::getInstance().getEventElapsedTimeMs(begin, end); +} \ No newline at end of file