Skip to content

Commit

Permalink
Merge branch 'main' into example_heat_equation_additional_comments
Browse files Browse the repository at this point in the history
  • Loading branch information
alex-m-h committed Dec 1, 2023
2 parents 25d9f09 + 6b31005 commit 9d7de3d
Show file tree
Hide file tree
Showing 70 changed files with 6,398 additions and 198 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/tests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -153,7 +153,7 @@ jobs:
-DKokkos_ENABLE_DEPRECATED_CODE_4=OFF \
-DKokkos_ENABLE_DEPRECATION_WARNINGS=OFF \
-DCMAKE_CXX_FLAGS="\
-Wall -Wextra -Wpedantic \
-Wall -Wextra -Wpedantic -Wno-sign-compare \
-Werror=vla \
-Werror=implicit-fallthrough \
${CMAKE_CXX_FLAGS}" \
Expand Down
38 changes: 9 additions & 29 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -5,15 +5,13 @@ project(DDC VERSION 0.0.0 LANGUAGES CXX)

# List of options

option(BUILD_FFT_KERNEL "Build DDC kernel for FFT" ON)
option(BUILD_SPLINES_KERNEL "Build DDC kernel for splines" ON)
option(BUILD_BENCHMARKS "Build DDC benchmarks." OFF)
option(BUILD_DOCUMENTATION "Build DDC documentation/website" OFF)
option(BUILD_EXAMPLES "Build DDC examples" ON)
option(DDC_BUILD_PDI_WRAPPER "Build DDC PDI wrapper" ON)
option(DDC_ENABLE_DOUBLE "Build DDC with double precision support, float is used otherwise" ON)
option(HIP_FOR_NVIDIA "Use the HIP wrapper for CUDA on NVIDIA plateforms, for development purpose" OFF)
if(NOT(Kokkos_ENABLE_CUDA) AND HIP_FOR_NVIDIA)
message(FATAL_ERROR "Kokkos_ENABLE_CUDA has to be ON to use HIP wrapper on NVIDIA plateforms")
endif()

# Dependencies

Expand Down Expand Up @@ -94,12 +92,12 @@ endif()

# FFTW
list( APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake" ) # Maybe not specific to FFTW
if(NOT FFTW_FOUND)
if("${BUILD_FFT_KERNEL}" AND NOT FFTW_FOUND)
find_package( FFTW MODULE REQUIRED )
endif()

## CUDA + CUDAToolkit
if("${Kokkos_ENABLE_CUDA}" AND NOT("${HIP_FOR_NVIDIA}"))
if("${BUILD_FFT_KERNEL}" AND "${Kokkos_ENABLE_CUDA}")
find_package( CUDAToolkit MODULE REQUIRED )
if( NOT(CUDAToolkit_FOUND) )
message(FATAL_ERROR "CUDAToolkit not found." )
Expand Down Expand Up @@ -237,31 +235,13 @@ if("${Kokkos_ENABLE_HIP}")
target_compile_definitions(DDC INTERFACE hipfft_AVAIL)
endif()

if("${HIP_FOR_NVIDIA}")
find_package( HIP REQUIRED )

target_include_directories(DDC
SYSTEM INTERFACE
"$<BUILD_INTERFACE:${HIP_ROOT_DIR}/include>"
)
target_compile_definitions(DDC INTERFACE hip_AVAIL)
target_compile_definitions(DDC INTERFACE HIP_FOR_NVIDIA)

if( DEFINED HIP_TOOLKIT_PATH ) # Usually called ROCM_PATH in the HIP documentation. By default : /opt/rocm
list( APPEND CMAKE_PREFIX_PATH ${HIP_TOOLKIT_PATH}/hipfft/lib )
find_library(HIPFFT_LIB hipfft REQUIRED)
target_link_libraries( DDC INTERFACE ${HIPFFT_LIB} )
target_include_directories(DDC
SYSTEM INTERFACE
"$<BUILD_INTERFACE:${HIP_TOOLKIT_PATH}/include>"
)
target_compile_definitions(DDC INTERFACE hipfft_AVAIL)
else()
message( "HIP_TOOLKIT_PATH is not defined. Kernels functions may be unaccessible. To get them, add -DHIP_TOOLKIT_PATH=\"path_to_hip_toolkit\" in your cmake line" )
endif()
if("${BUILD_SPLINES_KERNEL}")
# Ginkgo
find_package(Ginkgo 1.6.0 EXACT REQUIRED)
target_link_libraries(DDC INTERFACE Ginkgo::ginkgo)
target_compile_definitions(DDC INTERFACE ginkgo_AVAIL)
endif()


## The PDI wrapper

if("${DDC_BUILD_PDI_WRAPPER}")
Expand Down
13 changes: 11 additions & 2 deletions benchmarks/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -1,8 +1,17 @@
# SPDX-License-Identifier: MIT

add_executable(ddc_benchmarks deepcopy.cpp)
target_link_libraries(ddc_benchmarks
add_executable(ddc_benchmark_deepcopy deepcopy.cpp)
target_link_libraries(ddc_benchmark_deepcopy
PUBLIC
benchmark::benchmark
DDC::DDC
)

if("${BUILD_SPLINES_KERNEL}")
add_executable(ddc_benchmark_splines splines.cpp)
target_link_libraries(ddc_benchmark_splines
PUBLIC
benchmark::benchmark
DDC::DDC
)
endif()
245 changes: 245 additions & 0 deletions benchmarks/splines.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,245 @@
// SPDX-License-Identifier: MIT
#include <algorithm>
#include <chrono>
#include <cstddef>
#include <cstdint>
#include <cstring>
#include <functional>
#include <iosfwd>
#include <mutex>
#include <string>
#include <thread>
#include <vector>

#include <ddc/ddc.hpp>
#include <ddc/kernels/splines.hpp>

#include <benchmark/benchmark.h>

namespace {

// Degree of the B-splines built along the X dimension.
constexpr std::size_t s_degree_x = 3;

// Tag type for the X dimension; it declares itself periodic.
struct X
{
    static constexpr bool PERIODIC = true;
};

// Tag type for the Y dimension. Only a declaration is needed in this file.
struct Y;

// Uniform B-splines of degree s_degree_x over X.
using BSplinesX = ddc::UniformBSplines<X, s_degree_x>;

// Greville interpolation points associated with BSplinesX, periodic on both boundaries.
using GrevillePoints = ddc::GrevilleInterpolationPoints<
        BSplinesX,
        ddc::BoundCond::PERIODIC,
        ddc::BoundCond::PERIODIC>;

// Discrete dimension along X: the interpolation mesh exposed by GrevillePoints.
using DDimX = GrevillePoints::interpolation_mesh_type;

// Discrete dimension along Y: a uniform point sampling.
using DDimY = ddc::UniformPointSampling<Y>;

} // namespace

// Background sampler that records the peak GPU memory usage while a benchmark runs.
//
// Meant to be the entry point of a dedicated std::thread. As long as `monitorFlag` is true,
// it sleeps for one millisecond and then, when compiled with __CUDACC__ defined, queries
// cudaMemGetInfo and raises `maxUsedMem` to the largest `totalMem - freeMem` observed so far.
// Without __CUDACC__ no query is made, both values stay 0 and `maxUsedMem` is left unchanged.
//
// `mutex` protects both `monitorFlag` and `maxUsedMem`: the flag is only read, and the
// maximum only updated, while the mutex is held. The thread that requests the stop must
// therefore clear `monitorFlag` while holding the same mutex; reading or writing the plain
// bool concurrently without it would be a data race.
void monitorMemoryAsync(std::mutex& mutex, bool& monitorFlag, size_t& maxUsedMem)
{
    size_t freeMem = 0;
    size_t totalMem = 0;

    // Synchronized read of the stop flag shared with the controlling thread.
    auto const keep_monitoring = [&mutex, &monitorFlag]() -> bool {
        std::lock_guard<std::mutex> const lock(mutex);
        return monitorFlag;
    };

    while (keep_monitoring()) {
        // The lock is not held while sleeping, so a stop request is never delayed by it.
        std::this_thread::sleep_for(std::chrono::milliseconds(1)); // Adjust the interval as needed

        // Hold the lock while sampling and while updating the shared maximum
        std::lock_guard<std::mutex> const lock(mutex);

#if defined(__CUDACC__)
        cudaMemGetInfo(&freeMem, &totalMem);
#endif
        maxUsedMem = std::max(maxUsedMem, totalMem - freeMem);
    }
}

// Benchmark body: builds and evaluates batched splines along X on a 2D (X, Y) mesh.
//
// Benchmark arguments, as read below:
//   state.range(0)  third argument of init_discrete_space<BSplinesX> (X discretization size)
//   state.range(1)  number of points of the uniform Y sampling
//   state.range(2), state.range(3), state.range(4)
//                   forwarded as-is to the SplineBuilderBatched constructor; the benchmark
//                   registration in this file fills them from cols_per_par_chunk_ref,
//                   par_chunks_per_seq_chunk_ref and preconditionner_max_block_size_ref.
//
// Each timed iteration computes the feet coordinates (the mesh shifted by a constant along X),
// builds the spline coefficients of `density`, and evaluates the spline at the feet back into
// `density`. A helper thread samples GPU memory for the whole run and the peak, relative to
// the usage measured on entry, is reported in the "gpu_mem_occupancy" counter.
static void characteristics_advection(benchmark::State& state)
{
    size_t freeMem = 0;
    size_t totalMem = 0;
#if defined(__CUDACC__)
    cudaMemGetInfo(&freeMem, &totalMem);
#endif
    // cudaMemGetInfo gives GPU total memory occupation, we consider that other users of the
    // GPU have constant occupancy and subtract it.
    size_t initUsedMem = totalMem - freeMem;
    size_t maxUsedMem = initUsedMem;

    bool monitorFlag = true;
    std::mutex mutex;
    // Create a thread to monitor GPU memory asynchronously
    std::thread monitorThread(
            monitorMemoryAsync,
            std::ref(mutex),
            std::ref(monitorFlag),
            std::ref(maxUsedMem));

    ddc::init_discrete_space<
            BSplinesX>(ddc::Coordinate<X>(-1.), ddc::Coordinate<X>(1.), state.range(0));
    ddc::init_discrete_space<DDimX>(ddc::GrevilleInterpolationPoints<
                                    BSplinesX,
                                    ddc::BoundCond::PERIODIC,
                                    ddc::BoundCond::PERIODIC>::get_sampling());
    ddc::DiscreteDomain<DDimY> y_domain
            = ddc::init_discrete_space(DDimY::
                                               init(ddc::Coordinate<Y>(-1.),
                                                    ddc::Coordinate<Y>(1.),
                                                    ddc::DiscreteVector<DDimY>(state.range(1))));

    auto const x_domain = ddc::GrevilleInterpolationPoints<
            BSplinesX,
            ddc::BoundCond::PERIODIC,
            ddc::BoundCond::PERIODIC>::get_domain();
    ddc::Chunk density_alloc(
            ddc::DiscreteDomain<DDimX, DDimY>(x_domain, y_domain),
            ddc::DeviceAllocator<double>());
    ddc::ChunkSpan const density = density_alloc.span_view();
    // Initialize the density on the main domain
    ddc::DiscreteDomain<DDimX, DDimY> x_mesh
            = ddc::DiscreteDomain<DDimX, DDimY>(x_domain, y_domain);
    ddc::for_each(
            ddc::policies::parallel_device,
            x_mesh,
            DDC_LAMBDA(ddc::DiscreteElement<DDimX, DDimY> const ixy) {
                double const x = ddc::coordinate(ddc::select<DDimX>(ixy));
                double const y = ddc::coordinate(ddc::select<DDimY>(ixy));
                density(ixy) = 9.999 * Kokkos::exp(-(x * x + y * y) / 0.1 / 2);
                // initial_density(ixy) = 9.999 * ((x * x + y * y) < 0.25);
            });
    ddc::SplineBuilderBatched<
            ddc::SplineBuilder<
                    Kokkos::DefaultExecutionSpace,
                    Kokkos::DefaultExecutionSpace::memory_space,
                    BSplinesX,
                    DDimX,
                    ddc::BoundCond::PERIODIC,
                    ddc::BoundCond::PERIODIC>,
            DDimX,
            DDimY>
            spline_builder(x_mesh, state.range(2), state.range(3), state.range(4));
    ddc::SplineEvaluatorBatched<
            ddc::SplineEvaluator<
                    Kokkos::DefaultExecutionSpace,
                    Kokkos::DefaultExecutionSpace::memory_space,
                    BSplinesX,
                    DDimX>,
            DDimX,
            DDimY>
            spline_evaluator(
                    spline_builder.spline_domain(),
                    ddc::g_null_boundary<BSplinesX>,
                    ddc::g_null_boundary<BSplinesX>);
    ddc::Chunk coef_alloc(
            spline_builder.spline_domain(),
            ddc::KokkosAllocator<double, Kokkos::DefaultExecutionSpace::memory_space>());
    ddc::ChunkSpan coef = coef_alloc.span_view();
    ddc::Chunk feet_coords_alloc(
            spline_builder.vals_domain(),
            ddc::KokkosAllocator<
                    ddc::Coordinate<X, Y>,
                    Kokkos::DefaultExecutionSpace::memory_space>());
    ddc::ChunkSpan feet_coords = feet_coords_alloc.span_view();

    for (auto _ : state) {
        Kokkos::Profiling::pushRegion("FeetCharacteristics");
        ddc::for_each(
                ddc::policies::parallel_device,
                feet_coords.domain(),
                DDC_LAMBDA(ddc::DiscreteElement<DDimX, DDimY> const e) {
                    feet_coords(e) = ddc::Coordinate<X, Y>(
                            ddc::coordinate(ddc::select<DDimX>(e))
                                    - ddc::Coordinate<X>(0.0176429863),
                            ddc::coordinate(ddc::select<DDimY>(e)));
                });
        Kokkos::Profiling::popRegion();
        Kokkos::Profiling::pushRegion("SplineBuilder");
        spline_builder(coef, density);
        Kokkos::Profiling::popRegion();
        Kokkos::Profiling::pushRegion("SplineEvaluator");
        spline_evaluator(density, feet_coords.span_cview(), coef.span_cview());
        Kokkos::Profiling::popRegion();
    }

    // Request the monitor to stop. The flag is cleared under the same mutex the monitor
    // holds when reading it, so the two threads never access the plain bool concurrently.
    {
        std::lock_guard<std::mutex> const lock(mutex);
        monitorFlag = false;
    }
    // After join() the monitor's last write to maxUsedMem is visible to this thread.
    monitorThread.join();
    state.SetBytesProcessed(
            int64_t(state.iterations())
            * int64_t(state.range(0) * state.range(1) * sizeof(double)));
    state.counters["gpu_mem_occupancy"] = maxUsedMem - initUsedMem;
    ////////////////////////////////////////////////////
    /// --------------- HUGE WARNING --------------- ///
    /// The following lines are forbidden in a prod- ///
    /// uction code. It is a necessary workaround    ///
    /// which must be used ONLY for Google Benchmark.///
    /// The reason is it acts on underlying global   ///
    /// variables, which is always a bad idea.       ///
    ////////////////////////////////////////////////////
    ddc::detail::g_discrete_space_dual<BSplinesX>.reset();
    ddc::detail::g_discrete_space_dual<BSplinesX::mesh_type>.reset();
    ddc::detail::g_discrete_space_dual<DDimX>.reset();
    ddc::detail::g_discrete_space_dual<DDimY>.reset();
    ////////////////////////////////////////////////////
}

// Reference tuning parameters, selected at compile time from the enabled Kokkos backend
// (CUDA first, then OpenMP, then Serial). They are used as fixed ranges in the benchmark
// registration below and are recorded in the report context by main().
// NOTE(review): the previous note here read "Tuning : 512 cols and 8 precond on CPU,
// 16384 cols and 1 precond on GPU", which does not match the values below - confirm which
// set is the intended one.
// NOTE(review): there is no #else branch, so a build where none of the three macros is
// defined leaves these four variables undeclared.

#ifdef KOKKOS_ENABLE_CUDA
std::string chip = "gpu";
int cols_per_par_chunk_ref = 65535;
int par_chunks_per_seq_chunk_ref = 1;
unsigned int preconditionner_max_block_size_ref = 1u;
#elif defined(KOKKOS_ENABLE_OPENMP)
std::string chip = "cpu";
int cols_per_par_chunk_ref = 256;
// NOTE(review): this initializer runs during static initialization, i.e. before main()
// creates its ddc::ScopeGuard - verify that querying Kokkos::OpenMP() this early returns
// the intended thread count.
int par_chunks_per_seq_chunk_ref = Kokkos::OpenMP().concurrency();
unsigned int preconditionner_max_block_size_ref = 32u;
#elif defined(KOKKOS_ENABLE_SERIAL)
std::string chip = "cpu";
int cols_per_par_chunk_ref = 256;
int par_chunks_per_seq_chunk_ref = 1;
unsigned int preconditionner_max_block_size_ref = 32u;
#endif

// Active registration of characteristics_advection. The five ranges below are read by the
// benchmark as state.range(0) .. state.range(4), in order. Only the first two are swept
// (with a multiplier of 2); the last three are pinned to the reference tuning values by
// giving identical lower and upper bounds.
BENCHMARK(characteristics_advection)
->RangeMultiplier(2)
->Ranges(
{{64, 1024}, // state.range(0): passed to init_discrete_space<BSplinesX>
{100, 500000}, // state.range(1): number of points along Y
{cols_per_par_chunk_ref, cols_per_par_chunk_ref}, // state.range(2)
{par_chunks_per_seq_chunk_ref, par_chunks_per_seq_chunk_ref}, // state.range(3)
{preconditionner_max_block_size_ref, preconditionner_max_block_size_ref}}) // state.range(4)
->MinTime(3)->UseRealTime();
// Disabled variant: Y size fixed at 100000, sweeps state.range(2) from 64 to 65535.
/*
BENCHMARK(characteristics_advection)
->RangeMultiplier(2)
->Ranges({{64, 1024}, {100000, 100000}, {64,65535}, {par_chunks_per_seq_chunk_ref, par_chunks_per_seq_chunk_ref}, {preconditionner_max_block_size_ref, preconditionner_max_block_size_ref}})
->MinTime(3)->UseRealTime();
*/
// Disabled variant: Y size fixed at 100000, sweeps state.range(3) from 1 to 10000.
/*
BENCHMARK(characteristics_advection)
->RangeMultiplier(2)
->Ranges({{64, 1024}, {100000, 100000}, {cols_per_par_chunk_ref, cols_per_par_chunk_ref}, {1, 10000}, {preconditionner_max_block_size_ref, preconditionner_max_block_size_ref}})
->MinTime(3)->UseRealTime();
*/
// Disabled variant: Y size fixed at 100000, sweeps state.range(4) from 1 to 32.
/*
BENCHMARK(characteristics_advection)
->RangeMultiplier(2)
->Ranges({{64, 1024}, {100000, 100000}, {cols_per_par_chunk_ref, cols_per_par_chunk_ref}, {par_chunks_per_seq_chunk_ref, par_chunks_per_seq_chunk_ref}, {1, 32}})
->MinTime(3)->UseRealTime();
*/

int main(int argc, char** argv)
{
::benchmark::Initialize(&argc, argv);
::benchmark::AddCustomContext("chip", chip);
::benchmark::AddCustomContext("cols_per_par_chunk_ref", std::to_string(cols_per_par_chunk_ref));
::benchmark::AddCustomContext(
"par_chunks_per_seq_chunk_ref",
std::to_string(par_chunks_per_seq_chunk_ref));
::benchmark::AddCustomContext(
"preconditionner_max_block_size_ref",
std::to_string(preconditionner_max_block_size_ref));
if (::benchmark::ReportUnrecognizedArguments(argc, argv)) {
return 1;
}
{
ddc::ScopeGuard const guard;
::benchmark::RunSpecifiedBenchmarks();
}
::benchmark::Shutdown();
return 0;
}
Loading

0 comments on commit 9d7de3d

Please sign in to comment.