From 0c72fed993677ba5be6a4f03cf3fbb0ee87a0f86 Mon Sep 17 00:00:00 2001 From: Daisuke Matsuda Date: Fri, 28 Jun 2024 17:57:46 +0900 Subject: [PATCH 1/3] Fix performance degradation of HIP dot The results did not match between cuda-stream and hip-stream on the same NVIDIA GPU card (NVIDIA A100 40GB PCIe) when a large arraysize is specified. cuda-stream uses the number of SMs to decide dot_num_blocks, which looks more sensible than using arraysize to determine the parameter. dot_num_blocks is used both as the kernel grid size and as the iteration count for the reduction in the host code. Link: https://github.com/UoB-HPC/BabelStream/commit/9954b7d38cd85d20927428425a9840a72a56c3e4 Signed-off-by: Daisuke Matsuda --- src/hip/HIPStream.cpp | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/src/hip/HIPStream.cpp b/src/hip/HIPStream.cpp index ec02425a..13b4195f 100644 --- a/src/hip/HIPStream.cpp +++ b/src/hip/HIPStream.cpp @@ -47,9 +47,12 @@ HIPStream::HIPStream(const intptr_t ARRAY_SIZE, const int device_index) std::cout << "Memory: DEFAULT" << std::endl; #endif + hipDeviceProp_t props; + hipGetDeviceProperties(&props, device_index); + check_error(); + array_size = ARRAY_SIZE; - // Round dot_num_blocks up to next multiple of (TBSIZE * dot_elements_per_lane) - dot_num_blocks = (array_size + (TBSIZE * dot_elements_per_lane - 1)) / (TBSIZE * dot_elements_per_lane); + dot_num_blocks = props.multiProcessorCount * 4; size_t array_bytes = sizeof(T); array_bytes *= ARRAY_SIZE; @@ -63,8 +66,6 @@ HIPStream::HIPStream(const intptr_t ARRAY_SIZE, const int device_index) check_error(); // Check buffers fit on the device - hipDeviceProp_t props; - hipGetDeviceProperties(&props, 0); if (props.totalGlobalMem < std::size_t{3}*ARRAY_SIZE*sizeof(T)) throw std::runtime_error("Device does not have enough memory for all 3 buffers"); From 580090af04e8d13a4182022e4eb3d2e7dcf0dfc8 Mon Sep 17 00:00:00 2001 From: Daisuke Matsuda Date: Fri, 28 Jun 2024 18:22:24 +0900 Subject: [PATCH 2/3] Delete unused parameter 
from hip-stream Signed-off-by: Daisuke Matsuda --- src/hip/HIPStream.cpp | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/hip/HIPStream.cpp b/src/hip/HIPStream.cpp index 13b4195f..145a7ea3 100644 --- a/src/hip/HIPStream.cpp +++ b/src/hip/HIPStream.cpp @@ -56,7 +56,6 @@ HIPStream::HIPStream(const intptr_t ARRAY_SIZE, const int device_index) size_t array_bytes = sizeof(T); array_bytes *= ARRAY_SIZE; - size_t total_bytes = array_bytes * 3; // Allocate the host array for partial sums for dot kernels using hipHostMalloc. // This creates an array on the host which is visible to the device. However, it requires @@ -66,7 +65,7 @@ HIPStream::HIPStream(const intptr_t ARRAY_SIZE, const int device_index) check_error(); // Check buffers fit on the device - if (props.totalGlobalMem < std::size_t{3}*ARRAY_SIZE*sizeof(T)) + if (props.totalGlobalMem < std::size_t{3}*array_bytes) throw std::runtime_error("Device does not have enough memory for all 3 buffers"); // Create device buffers From be8f70f2a6cfd03377c0e1ca92caa29a67bad372 Mon Sep 17 00:00:00 2001 From: Daisuke Matsuda Date: Fri, 28 Jun 2024 18:30:00 +0900 Subject: [PATCH 3/3] Delete obsolete definitions from hip-stream header Signed-off-by: Daisuke Matsuda --- src/hip/HIPStream.h | 17 ----------------- 1 file changed, 17 deletions(-) diff --git a/src/hip/HIPStream.h b/src/hip/HIPStream.h index 76ef7df4..b437d2b7 100644 --- a/src/hip/HIPStream.h +++ b/src/hip/HIPStream.h @@ -14,27 +14,10 @@ #include "Stream.h" #define IMPLEMENTATION_STRING "HIP" -#define DOT_READ_DWORDS_PER_LANE 4 - template class HIPStream : public Stream { - // Make sure that either: - // DOT_READ_DWORDS_PER_LANE is less than sizeof(T), in which case we default to 1 element - // or - // DOT_READ_DWORDS_PER_LANE is divisible by sizeof(T) - static_assert((DOT_READ_DWORDS_PER_LANE * sizeof(unsigned int) < sizeof(T)) || - (DOT_READ_DWORDS_PER_LANE * sizeof(unsigned int) % sizeof(T) == 0), - "DOT_READ_DWORDS_PER_LANE not divisible by 
sizeof(element_type)"); - - // Take into account the datatype size - // That is, for 4 DOT_READ_DWORDS_PER_LANE, this is 2 FP64 elements - // and 4 FP32 elements - static constexpr unsigned int dot_elements_per_lane{ - (DOT_READ_DWORDS_PER_LANE * sizeof(unsigned int)) < sizeof(T) ? 1 : ( - DOT_READ_DWORDS_PER_LANE * sizeof(unsigned int) / sizeof(T))}; - protected: // Size of arrays intptr_t array_size;