diff --git a/Cxx11/Makefile b/Cxx11/Makefile
index 14a90b345..fcef58c4e 100644
--- a/Cxx11/Makefile
+++ b/Cxx11/Makefile
@@ -61,7 +61,7 @@ endif
 OCCAFLAGS = -I${OCCADIR}/include -Wl,-rpath -Wl,${OCCADIR}/lib -L${OCCADIR}/lib -locca
 
 .PHONY: all clean vector valarray openmp target opencl taskloop stl \
-	ranges kokkos raja cuda cublas sycl dpcpp \
+	ranges kokkos raja cuda cudastf cublas sycl dpcpp \
 	boost-compute thrust executor oneapi onemkl
 
 EXTRA=
@@ -122,6 +122,9 @@ raja: p2p-vector-raja stencil-vector-raja nstream-vector-raja \
 
 cuda: stencil-cuda transpose-cuda nstream-cuda nstream-managed-cuda
 
+stf: cudastf
+cudastf: dgemm-cudastf dgemm-cublas-cudastf stencil-cudastf p2p-cudastf transpose-cudastf
+
 hip: nstream-hip transpose-hip stencil-hip nstream-managed-hip
 
 hipstl: nstream-hipstl
@@ -306,6 +309,14 @@ endif
 %.s: %.cc prk_util.h
 	$(CXX) $(CXXFLAGS) $(ASMFLAGS) -S $< -o $@
 
+%-cudastf: %-cudastf.cu prk_util.h prk_cuda.h
+	$(NVCC) $(CPPFLAGS) $(CUDAFLAGS) $(CUDASTF_CFLAGS) $(CUDASTF_LDFLAGS) $< -o $@
+
+%-cublas-cudastf: %-cublas-cudastf.cu prk_util.h prk_cuda.h
+	$(NVCC) $(CUDAFLAGS) $(CPPFLAGS) $(CUDASTF_CFLAGS) $(CUDASTF_LDFLAGS) -DPRK_USE_CUBLAS $< -lcublas -lcublasLt -o $@
+
+
+
 clean:
 	-rm -f *.o
 	-rm -f *.s
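All of the new *-cudastf.cu programs below follow the same CUDASTF pattern: build a context, describe arrays as logical_data, submit work with parallel_for or task (declaring .read()/.write()/.rw() accesses so the runtime can infer ordering), then synchronize on ctx.task_fence(). A minimal sketch for orientation, assuming the cccl/cudax headers are on the include path as arranged in common/make.defs.cuda (illustrative only, not part of the patch):

    #include <cuda/experimental/stf.cuh>

    using namespace cuda::experimental::stf;

    int main(void)
    {
      context ctx;

      // Describe a 1D array of 1024 doubles; CUDASTF owns allocation and movement.
      auto v = ctx.logical_data(shape_of<slice<double>>(1024));

      // Generate a kernel over the shape; v.write() declares the access mode.
      ctx.parallel_for(v.shape(), v.write())->*[] __device__ (size_t i, auto dv) {
        dv(i) = 2.0 * i;
      };

      // task_fence() returns a stream on which all submitted work is ordered.
      cudaStreamSynchronize(ctx.task_fence());

      ctx.finalize();
      return 0;
    }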
diff --git a/Cxx11/dgemm-cublas-cudastf.cu b/Cxx11/dgemm-cublas-cudastf.cu
new file mode 100644
index 000000000..8078b7961
--- /dev/null
+++ b/Cxx11/dgemm-cublas-cudastf.cu
@@ -0,0 +1,309 @@
+///
+/// Copyright (c) 2018, Intel Corporation
+/// Copyright (c) 2024, NVIDIA
+///
+/// Redistribution and use in source and binary forms, with or without
+/// modification, are permitted provided that the following conditions
+/// are met:
+///
+/// * Redistributions of source code must retain the above copyright
+///       notice, this list of conditions and the following disclaimer.
+/// * Redistributions in binary form must reproduce the above
+///       copyright notice, this list of conditions and the following
+///       disclaimer in the documentation and/or other materials provided
+///       with the distribution.
+/// * Neither the name of Intel Corporation nor the names of its
+///       contributors may be used to endorse or promote products
+///       derived from this software without specific prior written
+///       permission.
+///
+/// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+/// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+/// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+/// FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+/// COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+/// INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+/// BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+/// LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+/// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+/// LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+/// ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+/// POSSIBILITY OF SUCH DAMAGE.
+
+//////////////////////////////////////////////////////////////////////
+///
+/// NAME:    dgemm
+///
+/// PURPOSE: This program tests the efficiency with which a dense matrix-
+///          matrix multiplication is carried out
+///
+/// USAGE:   The program takes as input the matrix order,
+///          the number of times the matrix-matrix multiplication
+///          is carried out, and, optionally, a batch count and
+///          whether to copy the input every iteration
+///
+///          <progname> <# iterations> <matrix order> [<batches>] [<input copy: 0/1>]
+///
+///          The output consists of diagnostics to make sure the
+///          algorithm worked, and of timing statistics.
+///
+/// FUNCTIONS CALLED:
+///
+///          Other than OpenMP or standard C functions, the following
+///          functions are used in this program:
+///
+///          cublasDgemm()
+///          cublasDgemmStridedBatched()
+///
+/// HISTORY: Written by Rob Van der Wijngaart, February 2009.
+///          Converted to C++11 by Jeff Hammond, December, 2017.
+///          CUDA STF by Cedric Augonnet, October 2024.
+///
+//////////////////////////////////////////////////////////////////////
+
+#include "prk_util.h"
+#include "prk_cuda.h"
+
+#include <cuda/experimental/stf.cuh>
+
+using namespace cuda::experimental::stf;
+
+int main(int argc, char * argv[])
+{
+  std::cout << "Parallel Research Kernels version " << PRKVERSION << std::endl;
+  std::cout << "C++11/CUBLAS STF Dense matrix-matrix multiplication: C += A x B" << std::endl;
+
+  prk::CUDA::info info;
+  //info.print();
+
+  //////////////////////////////////////////////////////////////////////
+  /// Read and test input parameters
+  //////////////////////////////////////////////////////////////////////
+
+  int iterations;
+  int order;
+  int batches = 0;
+  int input_copy = 0;
+  try {
+      if (argc < 3) {
+        throw "Usage: <# iterations> <matrix order> [<batches>] [<input copy: 0/1>]";
+      }
+
+      iterations = std::atoi(argv[1]);
+      if (iterations < 1) {
+        throw "ERROR: iterations must be >= 1";
+      }
+
+      order = std::atoi(argv[2]);
+      if (order <= 0) {
+        throw "ERROR: Matrix Order must be greater than 0";
+      } else if (order > prk::get_max_matrix_size()) {
+        throw "ERROR: matrix dimension too large - overflow risk";
+      }
+
+      if (argc > 3) {
+        batches = std::atoi(argv[3]);
+      }
+
+      if (argc > 4) {
+        input_copy = std::atoi(argv[4]);
+        if (input_copy != 0 && input_copy != 1) {
+          throw "ERROR: input_copy was not 0 or 1";
+        }
+      }
+  }
+  catch (const char * e) {
+    std::cout << e << std::endl;
+    return 1;
+  }
+
+  std::cout << "Number of iterations = " << iterations << std::endl;
+  std::cout << "Matrix order         = " << order << std::endl;
+  if (batches == 0) {
+      std::cout << "No batching" << std::endl;
+  } else if (batches < 0) {
+      std::cout << "Batch size           = " << -batches << " (loop over legacy BLAS)" << std::endl;
+  } else if (batches > 0) {
+      std::cout << "Batch size           = " << batches << " (batched BLAS)" << std::endl;
+  }
+  std::cout << "Input copy           = " << (input_copy ? "yes" : "no") << std::endl;
+
+  cublasHandle_t h;
+  prk::CUDA::check( cublasCreate(&h) );
+
+  const int tile_size = 32;
+  dim3 dimGrid(prk::divceil(order,tile_size),prk::divceil(order,tile_size),1);
+  dim3 dimBlock(tile_size, tile_size, 1);
+
+  info.checkDims(dimBlock, dimGrid);
+
+  //////////////////////////////////////////////////////////////////////
+  // Allocate space for matrices
+  //////////////////////////////////////////////////////////////////////
+
+  double gemm_time(0);
+
+  const int matrices = (batches==0 ? 1 : abs(batches));
+  const size_t nelems = (size_t)order * (size_t)order;
+
+  const auto epsilon = 1.0e-8;
+  const auto forder = static_cast<double>(order);
+  const auto reference = 0.25 * prk::pow(forder,3) * prk::pow(forder-1.0,2) * (iterations+1);
+  double residuum(0);
+
+  context ctx;
+
+  if (batches > 0) {
+      /*
+       * BATCHED implementation
+       */
+      auto a = ctx.logical_data(shape_of<slice<double, 3>>(order, order, matrices));
+      auto b = ctx.logical_data(shape_of<slice<double, 3>>(order, order, matrices));
+      auto c = ctx.logical_data(shape_of<slice<double, 3>>(order, order, matrices));
+
+      // Initialize all matrices
+      ctx.parallel_for(a.shape(), a.write(), b.write(), c.write())->*[] __device__ (size_t i, size_t j, size_t k, auto da, auto db, auto dc)
+      {
+          da(i, j, k) = (double)i;
+          db(i, j, k) = (double)i;
+          dc(i, j, k) = 0.0;
+      };
+
+      for (int iter = 0; iter<=iterations; iter++) {
+          if (iter==1) {
+              cudaStreamSynchronize(ctx.task_fence());
+              gemm_time = prk::wtime();
+          }
+
+          const double alpha = 1.0;
+          const double beta  = 1.0;
+          ctx.task(a.read(), b.read(), c.rw())->*[&](cudaStream_t stream, auto da, auto db, auto dc) {
+              cublasSetStream(h, stream);
+              prk::CUDA::check( cublasDgemmStridedBatched(h,
+                                                          CUBLAS_OP_N, CUBLAS_OP_N,
+                                                          order, order, order,
+                                                          &alpha,
+                                                          (const double *)da.data_handle(), order, order*order,
+                                                          (const double *)db.data_handle(), order, order*order,
+                                                          &beta,
+                                                          dc.data_handle(), order, order*order,
+                                                          batches) );
+          };
+      }
+
+      cudaStreamSynchronize(ctx.task_fence());
+      gemm_time = prk::wtime() - gemm_time;
+
+      ctx.host_launch(c.read())->*[&](auto hc)
+      {
+          for (size_t k = 0; k < hc.extent(2); k++)
+          {
+              double checksum = 0.0;
+
+              for (size_t j = 0; j < hc.extent(1); j++)
+                for (size_t i = 0; i < hc.extent(0); i++)
+                {
+                    checksum += hc(i, j, k);
+                }
+
+              residuum += std::abs(checksum-reference)/reference;
+          }
+          residuum /= matrices;
+      };
+  }
+  else {
+      ::std::vector<logical_data<slice<double, 2>>> vector_a;
+      ::std::vector<logical_data<slice<double, 2>>> vector_b;
+      ::std::vector<logical_data<slice<double, 2>>> vector_c;
+
+      // Initialize independent matrices
+      for (size_t k = 0; k < matrices; k++) {
+          auto ak = ctx.logical_data(shape_of<slice<double, 2>>(order, order));
+          auto bk = ctx.logical_data(shape_of<slice<double, 2>>(order, order));
+          auto ck = ctx.logical_data(shape_of<slice<double, 2>>(order, order));
+
+          vector_a.push_back(ak);
+          vector_b.push_back(bk);
+          vector_c.push_back(ck);
+
+          ctx.parallel_for(ak.shape(), ak.write(), bk.write(), ck.write())->*[] __device__ (size_t i, size_t j, auto dak, auto dbk, auto dck)
+          {
+              dak(i, j) = (double)i;
+              dbk(i, j) = (double)i;
+              dck(i, j) = 0.0;
+          };
+      }
+
+      for (int iter = 0; iter<=iterations; iter++) {
+          if (iter==1) {
+              cudaStreamSynchronize(ctx.task_fence());
+              gemm_time = prk::wtime();
+          }
+
+          const double alpha = 1.0;
+          const double beta  = 1.0;
+
+          for (size_t k = 0; k < matrices; k++)
+          {
+              ctx.task(vector_a[k].read(), vector_b[k].read(), vector_c[k].rw())->*[&](cudaStream_t stream, auto dA, auto dB, auto dC) {
+                  cublasSetStream(h, stream);
+                  prk::CUDA::check( cublasDgemm(h,
+                                                CUBLAS_OP_N, CUBLAS_OP_N, // opA, opB
+                                                order, order, order,      // m, n, k
+                                                &alpha,                   // alpha
+                                                dA.data_handle(), order,  // A, lda
+                                                dB.data_handle(), order,  // B, ldb
+                                                &beta,                    // beta
+                                                dC.data_handle(), order) ); // C, ldc
+              };
+          }
+      }
+
+      cudaStreamSynchronize(ctx.task_fence());
+      gemm_time = prk::wtime() - gemm_time;
+
+      for (size_t k = 0; k < matrices; k++)
+      {
+          double checksum = 0.0;
+          ctx.host_launch(vector_c[k].read())->*[&](auto hck)
+          {
+              for (size_t j = 0; j < hck.extent(1); j++)
+                for (size_t i = 0; i < hck.extent(0); i++)
+                {
+                    checksum += hck(i, j);
+                }
+          };
+
+          cudaStreamSynchronize(ctx.task_fence());
+          residuum += std::abs(checksum-reference)/reference;
+      }
+      residuum /= matrices;
+  }
+
+  //////////////////////////////////////////////////////////////////////
+  /// Analyze and output results
+  //////////////////////////////////////////////////////////////////////
+
+  if (residuum < epsilon) {
+#if VERBOSE
+    std::cout << "Reference checksum = " << reference << "\n"
+              << "Residuum           = " << residuum << std::endl;
+#endif
+    std::cout << "Solution validates" << std::endl;
+    auto avgtime = gemm_time/iterations/matrices;
+    auto nflops = 2.0 * prk::pow(forder,3);
+    prk::print_flop_rate_time("FP64", nflops/avgtime, avgtime);
+  } else {
+    std::cout << "Reference checksum = " << reference << "\n"
+              << "Residuum           = " << residuum << std::endl;
+    return 1;
+  }
+
+  ctx.finalize();
+
+  return 0;
+}
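A note on the batched path above: cublasDgemmStridedBatched performs `matrices` independent column-major GEMMs in one call, with matrix k of each operand starting at element offset k*order*order. Up to scheduling, it is equivalent to this loop of plain DGEMMs (illustrative sketch; h, order, batches, and raw device pointers A, B, C are names assumed from context):

    // One strided-batched call ~= a loop of legacy DGEMMs over batch index k.
    const double alpha = 1.0, beta = 1.0;
    for (int k = 0; k < batches; k++) {
      const size_t off = (size_t)k * order * order;   // stride between matrices
      cublasDgemm(h, CUBLAS_OP_N, CUBLAS_OP_N,
                  order, order, order,
                  &alpha,
                  A + off, order,
                  B + off, order,
                  &beta,
                  C + off, order);
    }

The batched form saves kernel-launch and task-submission overhead, which is why the program reports it separately from the loop-over-legacy-BLAS mode.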
diff --git a/Cxx11/dgemm-cudastf.cu b/Cxx11/dgemm-cudastf.cu
new file mode 100644
index 000000000..89f24d38a
--- /dev/null
+++ b/Cxx11/dgemm-cudastf.cu
@@ -0,0 +1,199 @@
+///
+/// Copyright (c) 2017, Intel Corporation
+/// Copyright (c) 2024, NVIDIA
+///
+/// Redistribution and use in source and binary forms, with or without
+/// modification, are permitted provided that the following conditions
+/// are met:
+///
+/// * Redistributions of source code must retain the above copyright
+///       notice, this list of conditions and the following disclaimer.
+/// * Redistributions in binary form must reproduce the above
+///       copyright notice, this list of conditions and the following
+///       disclaimer in the documentation and/or other materials provided
+///       with the distribution.
+/// * Neither the name of Intel Corporation nor the names of its
+///       contributors may be used to endorse or promote products
+///       derived from this software without specific prior written
+///       permission.
+///
+/// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+/// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+/// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+/// FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+/// COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+/// INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+/// BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+/// LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+/// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+/// LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+/// ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+/// POSSIBILITY OF SUCH DAMAGE.
+
+//////////////////////////////////////////////////////////////////////
+///
+/// NAME:    dgemm
+///
+/// PURPOSE: This program tests the efficiency with which a dense matrix-
+///          matrix multiplication is carried out
+///
+/// USAGE:   The program takes as input the matrix order,
+///          the number of times the matrix-matrix multiplication
+///          is carried out, and, optionally, a tile size for matrix
+///          blocking
+///
+///          <progname> <# iterations> <matrix order> [<tile size>]
+///
+///          The output consists of diagnostics to make sure the
+///          algorithm worked, and of timing statistics.
+///
+/// FUNCTIONS CALLED:
+///
+///          Other than OpenMP or standard C functions, the following
+///          functions are used in this program:
+///
+///          wtime()
+///
+/// HISTORY: Written by Rob Van der Wijngaart, February 2009.
+///          Converted to C++11 by Jeff Hammond, December, 2017.
+///          CUDA STF by Cedric Augonnet, October 2024.
+///
+//////////////////////////////////////////////////////////////////////
+
+#include "prk_util.h"
+#include "prk_cuda.h"
+#include <cuda/experimental/stf.cuh>
+
+using namespace cuda::experimental::stf;
+
+int main(int argc, char * argv[])
+{
+  //////////////////////////////////////////////////////////////////////
+  /// Read and test input parameters
+  //////////////////////////////////////////////////////////////////////
+
+  std::cout << "Parallel Research Kernels version " << PRKVERSION << std::endl;
+  std::cout << "C++11 CUDA STF Dense matrix-matrix multiplication: C += A x B" << std::endl;
+
+  int iterations;
+  int order;
+  try {
+      if (argc < 3) {
+        throw "Usage: <# iterations> <matrix order> [tile size]";
+      }
+
+      iterations = std::atoi(argv[1]);
+      if (iterations < 1) {
+        throw "ERROR: iterations must be >= 1";
+      }
+
+      order = std::atoi(argv[2]);
+      if (order <= 0) {
+        throw "ERROR: Matrix Order must be greater than 0";
+      } else if (order > prk::get_max_matrix_size()) {
+        throw "ERROR: matrix dimension too large - overflow risk";
+      }
+  }
+  catch (const char * e) {
+    std::cout << e << std::endl;
+    return 1;
+  }
+
+  std::cout << "Number of iterations = " << iterations << std::endl;
+  std::cout << "Matrix order         = " << order << std::endl;
+
+  //////////////////////////////////////////////////////////////////////
+  /// Allocate space for matrices
+  //////////////////////////////////////////////////////////////////////
+
+  context ctx;
+
+  double dgemm_time{0};
+
+  auto A = ctx.logical_data(shape_of<slice<double, 2>>(order, order));
+  auto B = ctx.logical_data(shape_of<slice<double, 2>>(order, order));
+  auto C = ctx.logical_data(shape_of<slice<double, 2>>(order, order));
+
+  ctx.parallel_for(A.shape(), A.write(), B.write(), C.write())->*[]__device__(size_t i, size_t j, auto dA, auto dB, auto dC)
+  {
+      dA(i, j) = (double)i;
+      dB(i, j) = (double)i;
+      dC(i, j) = 0.0;
+  };
+
+  {
+    for (int iter = 0; iter<=iterations; iter++) {
+      if (iter==1) {
+          cudaStreamSynchronize(ctx.task_fence());
+          dgemm_time = prk::wtime();
+      }
+
+      ctx.parallel_for(C.shape(), A.read(), B.read(), C.rw())->*[]__device__ (size_t i, size_t j, auto dA, auto dB, auto dC)
+      {
+          double Ctemp(0);
+          for (size_t k = 0; k < dC.extent(0); k++) {
+              Ctemp += dA(i, k)*dB(k, j);
+          }
+          dC(i,j) += Ctemp;
+      };
+    }
+
+    cudaStreamSynchronize(ctx.task_fence());
+    dgemm_time = prk::wtime() - dgemm_time;
+  }
+
+  //////////////////////////////////////////////////////////////////////
+  /// Analyze and output results
+  //////////////////////////////////////////////////////////////////////
+
+  const auto forder = static_cast<double>(order);
+  const auto reference = 0.25 * prk::pow(forder,3) * prk::pow(forder-1.0,2) * (iterations+1);
+
+  double checksum(0);
+  ctx.host_launch(C.read())->*[&](auto hC)
+  {
+      for (size_t j = 0; j < hC.extent(1); j++)
+        for (size_t i = 0; i < hC.extent(0); i++)
+        {
+            checksum += hC(i, j);
+        }
+  };
+
+  cudaStreamSynchronize(ctx.task_fence());
+
+  const auto epsilon = 1.0e-8;
+  const auto residuum = prk::abs(checksum-reference)/reference;
+  if (residuum < epsilon) {
+#if VERBOSE
+    std::cout << "Reference checksum = " << reference << "\n"
+              << "Actual checksum    = " << checksum << std::endl;
+#endif
+    std::cout << "Solution validates" << std::endl;
+    auto avgtime = dgemm_time/iterations;
+    auto nflops = 2.0 * prk::pow(forder,3);
+    std::cout << "Rate (MF/s): " << 1.0e-6 * nflops/avgtime
+              << " Avg time (s): " << avgtime << std::endl;
+  } else {
+    std::cout << "Reference checksum = " << reference << "\n"
+              << "Actual checksum    = " << checksum << std::endl;
+#if VERBOSE
+    ctx.host_launch(C.read())->*[&](auto hC)
+    {
+        for (size_t i = 0; i < hC.extent(0); i++)
+          for (size_t j = 0; j < hC.extent(1); j++)
+            std::cout << "C(" << i << "," << j << ") = " << hC(i, j) << "\n";
+    };
+    cudaStreamSynchronize(ctx.task_fence());
+#endif
+    return 1;
+  }
+
+  ctx.finalize();
+
+  return 0;
+}
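Every main in this patch times the same way: iteration 0 is a warm-up that absorbs allocation, data movement, and JIT costs; the clock starts only after a fence, and stops after a final fence. Pulled out as a sketch (assuming a hypothetical helper submit_one_iteration() that only enqueues one iteration's tasks on ctx):

    double t0(0);
    for (int iter = 0; iter <= iterations; iter++) {
      if (iter == 1) {
        cudaStreamSynchronize(ctx.task_fence());  // drain the warm-up iteration
        t0 = prk::wtime();                        // start the clock at steady state
      }
      submit_one_iteration(ctx);                  // assumed helper: enqueue tasks only
    }
    cudaStreamSynchronize(ctx.task_fence());      // drain all timed work
    const double elapsed = prk::wtime() - t0;

Because task submission is asynchronous, omitting either fence would time queueing rather than execution.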
diff --git a/Cxx11/p2p-cudastf.cu b/Cxx11/p2p-cudastf.cu
new file mode 100644
--- /dev/null
+++ b/Cxx11/p2p-cudastf.cu
+///
+/// Copyright (c) 2013, Intel Corporation
+/// Copyright (c) 2024, NVIDIA
+///
+/// Redistribution and use in source and binary forms, with or without
+/// modification, are permitted provided that the following conditions
+/// are met:
+///
+/// * Redistributions of source code must retain the above copyright
+///       notice, this list of conditions and the following disclaimer.
+/// * Redistributions in binary form must reproduce the above
+///       copyright notice, this list of conditions and the following
+///       disclaimer in the documentation and/or other materials provided
+///       with the distribution.
+/// * Neither the name of Intel Corporation nor the names of its
+///       contributors may be used to endorse or promote products
+///       derived from this software without specific prior written
+///       permission.
+///
+/// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+/// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+/// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+/// FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+/// COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+/// INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+/// BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+/// LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+/// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+/// LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+/// ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+/// POSSIBILITY OF SUCH DAMAGE.
+
+//////////////////////////////////////////////////////////////////////
+///
+/// NAME:    Pipeline
+///
+/// PURPOSE: This program tests the efficiency with which point-to-point
+///          synchronization can be carried out. It does so by executing
+///          a pipelined algorithm on an n^2 grid.
+///
+/// USAGE:   The program takes as input the
+///          dimensions of the grid, and the number of iterations on the grid
+///
+///                <progname> <# iterations> <array dimension>
+///
+///          The output consists of diagnostics to make sure the
+///          algorithm worked, and of timing statistics.
+///
+/// HISTORY: - Written by Rob Van der Wijngaart, February 2009.
+///            C99-ification by Jeff Hammond, February 2016.
+///            C++11-ification by Jeff Hammond, May 2017.
+///            CUDASTF version by Cedric Augonnet, November 2024.
+///
+//////////////////////////////////////////////////////////////////////
+
+#include "prk_util.h"
+#include "prk_cuda.h"
+
+#include <cooperative_groups.h>
+
+#include <cuda/experimental/stf.cuh>
+
+using namespace cuda::experimental::stf;
+
+
+#define BLOCK_SIZE 32
+
+#if 0
+
+#define HALO_SIZE 1
+
+__global__ void p2p(int N, double * M)
+{
+    __shared__ float sm_buffer[BLOCK_SIZE + HALO_SIZE][BLOCK_SIZE + HALO_SIZE];
+
+    int bx = blockIdx.x;
+    int tx = threadIdx.x;
+    int dx = blockDim.x;
+
+    cooperative_groups::grid_group cuda_grid = cooperative_groups::this_grid();
+
+    for(int i = 0; i < 2*N/BLOCK_SIZE; ++i) {
+
+        //Compute matrix coordinates for block corner
+        int g_x = bx * BLOCK_SIZE + HALO_SIZE;
+        int g_y = (i - bx) * BLOCK_SIZE + HALO_SIZE;
+
+        //Check block is in bounds
+        if(g_y >= 0 && g_y < (N + HALO_SIZE)) {
+
+            // load halo to SM
+            sm_buffer[0][tx + 1] = M[(g_y - 1) * (N + HALO_SIZE) + g_x + tx];
+            sm_buffer[tx + 1][0] = M[(g_y + tx)* (N + HALO_SIZE) + g_x - 1];
+            if(tx == 0) //Load corner
+                sm_buffer[0][0] = M[(g_y - 1) * (N + HALO_SIZE) + g_x - 1];
+
+            // inner loop
+            for(int j = 0; j <= 2*BLOCK_SIZE; j++) {
+                int l_x = tx + HALO_SIZE;
+                int l_y = j - tx + HALO_SIZE;
+                if (l_y >= 1 && l_y <= BLOCK_SIZE)
+                    sm_buffer[l_y][l_x] = sm_buffer[l_y - 1][l_x] + sm_buffer[l_y][l_x - 1] - sm_buffer[l_y - 1][l_x - 1];
+            }
+
+            // flush block to memory
+            for(int j = 0; j <= BLOCK_SIZE; j++)
+                M[(g_y + j) * (N + HALO_SIZE) + g_x + tx] = sm_buffer[j + HALO_SIZE][tx + HALO_SIZE];
+
+            // sync threads
+            cuda_grid.sync();
+        }
+    }
+
+#if 0
+    // one thread copies the bottom right corner to the top left corner...
+    if ((bx * dx + tx) == 0) {
+        M[0] = -M[(N-1)*(N+HALO_SIZE)+(N-1)];
+    }
+    cuda_grid.sync(); // required?
+#endif
+}
+#else
+__global__ void p2p(double * grid, const int n)
+{
+    const int bx = blockIdx.x;
+    const int tx = threadIdx.x;
+    const int dx = blockDim.x;
+
+    const int j = bx * dx + tx + 1;
+
+    cooperative_groups::grid_group cuda_grid = cooperative_groups::this_grid();
+
+    for (int i=2; i<=2*n-2; i++) {
+      //parallel_for (int j=std::max(2,i-n+2); j<=std::min(i,n); j++) {
+      if (MAX(2,i-n+2) <= j && j <= MIN(i,n)) {
+        const int x = i-j+1;
+        const int y = j-1;
+        grid[x*n+y] = grid[(x-1)*n+y] + grid[x*n+(y-1)] - grid[(x-1)*n+(y-1)];
+      }
+      //__threadfence();
+      cuda_grid.sync();
+      //__threadfence();
+    }
+
+    // one thread copies the bottom right corner to the top left corner...
+    if (j == 1) {
+        grid[0*n+0] = -grid[(n-1)*n+(n-1)];
+    }
+    //__threadfence();
+    //cuda_grid.sync(); // required?
+    //__threadfence();
+}
+#endif
+
+int main(int argc, char* argv[])
+{
+  std::cout << "Parallel Research Kernels version " << PRKVERSION << std::endl;
+  std::cout << "C++11/CUDASTF pipeline execution on 2D grid" << std::endl;
+
+  prk::CUDA::info info;
+  info.print();
+
+  //////////////////////////////////////////////////////////////////////
+  // Process and test input parameters
+  //////////////////////////////////////////////////////////////////////
+
+  int iterations;
+  int n;
+  try {
+      if (argc < 3) {
+        throw "Usage: <# iterations> <array dimension>";
+      }
+
+      // number of times to run the pipeline algorithm
+      iterations = std::atoi(argv[1]);
+      if (iterations < 1) {
+        throw "ERROR: iterations must be >= 1";
+      }
+
+      // grid dimensions
+      n = std::atoi(argv[2]);
+      if (n < 1) {
+        throw "ERROR: grid dimensions must be positive";
+      } else if ( n > prk::get_max_matrix_size() ) {
+        throw "ERROR: grid dimension too large - overflow risk";
+      } else if (n % BLOCK_SIZE) {
+        throw "ERROR: grid dimension is not a multiple of BLOCK_SIZE";
+      }
+  }
+  catch (const char * e) {
+    std::cout << e << std::endl;
+    return 1;
+  }
+
+  std::cout << "Number of iterations = " << iterations << std::endl;
+  std::cout << "Grid sizes           = " << n << ", " << n << std::endl;
+
+  //////////////////////////////////////////////////////////////////////
+  // Allocate space and perform the computation
+  //////////////////////////////////////////////////////////////////////
+
+  double pipeline_time{0}; // silence compiler warning
+
+  context ctx;
+  auto grid = ctx.logical_data(shape_of<slice<double, 2>>(n, n));
+
+  ctx.parallel_for(grid.shape(), grid.write())->*[] __device__ (size_t i, size_t j, auto d_grid)
+  {
+      d_grid(i, j) = 0.0;
+  };
+
+  // initialize boundary conditions
+  ctx.parallel_for(box(n), grid.write())->*[] __device__ (size_t i, auto d_grid)
+  {
+      d_grid(i, 0) = static_cast<double>(i);
+      d_grid(0, i) = static_cast<double>(i);
+  };
+
+#ifdef DEBUG
+  ctx.host_launch(grid.read())->*[n](auto h_grid) {
+      std::cout << "B h_grid=\n";
+      for (int i=0; i<n; i++) {
+        for (int j=0; j<n; j++) {
+          std::cout << h_grid(i, j) << " ";
+        }
+        std::cout << "\n";
+      }
+      std::cout << std::endl;
+  };
+#endif
+
+  for (int iter = 0; iter<=iterations; iter++) {
+
+    if (iter==1) {
+        cudaStreamSynchronize(ctx.task_fence());
+        pipeline_time = prk::wtime();
+    }
+
+    auto spec = con(con<BLOCK_SIZE>(hw_scope::block));
+    ctx.launch(spec, grid.rw())->*[n]__device__(auto th, auto d_grid) {
+      for (int i=2; i<=2*n-2; i++) {
+        for (int j = th.rank() + 1; j <= n; j += th.size()) {
+          if (MAX(2,i-n+2) <= j && j <= MIN(i,n)) {
+            const int x = i-j+1;
+            const int y = j-1;
+            d_grid(x, y) = d_grid(x - 1, y) + d_grid(x, y-1) - d_grid(x-1, y-1);
+          }
+        }
+
+        th.sync();
+      }
+
+      // one thread copies the bottom right corner to the top left corner...
+      if (th.rank() == 0) {
+        d_grid(0, 0) = -d_grid(n-1, n-1);
+      }
+    };
+
+#ifdef DEBUG
+    ctx.host_launch(grid.read())->*[n](auto h_grid) {
+        std::cout << "h_grid=\n";
+        for (int i=0; i<n; i++) {
+          for (int j=0; j<n; j++) {
+            std::cout << h_grid(i, j) << " ";
+          }
+          std::cout << "\n";
+        }
+        std::cout << std::endl;
+    };
+#endif
+  }
+
+  cudaStreamSynchronize(ctx.task_fence());
+  pipeline_time = prk::wtime() - pipeline_time;
+
+  //////////////////////////////////////////////////////////////////////
+  // Analyze and output results.
+  //////////////////////////////////////////////////////////////////////
+
+  const auto epsilon = 1.0e-8;
+  const auto corner_val = ((iterations+1.)*(2.*n-2.));
+  double corner_res(0);
+  ctx.host_launch(grid.read())->*[&](auto h_grid) {
+      corner_res = h_grid(n - 1, n - 1);
+  };
+
+  cudaStreamSynchronize(ctx.task_fence());
+
+  if ( (prk::abs(corner_res - corner_val)/corner_val) > epsilon) {
+    std::cout << "ERROR: checksum " << corner_res
+              << " does not match verification value " << corner_val << std::endl;
+    return 1;
+  }
+
+#ifdef VERBOSE
+  std::cout << "Solution validates; verification value = " << corner_val << std::endl;
+#else
+  std::cout << "Solution validates" << std::endl;
+#endif
+  auto avgtime = pipeline_time/iterations;
+  std::cout << "Rate (MFlops/s): "
+            << 2.0e-6 * ( (n-1.)*(n-1.) )/avgtime
+            << " Avg time (s): " << avgtime << std::endl;
+
+  ctx.finalize();
+
+  return 0;
+}
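For readers new to the wavefront trick used by both p2p kernels above: the recurrence is sequential in i and j, but all points on one anti-diagonal (i+j = const) are mutually independent, which is exactly what the i-loop-with-sync structure exploits. The sequential host version being parallelized is just (sketch; grid is assumed to be the n-by-n array with its first row and column pre-filled):

    // Each interior point depends on its west, north, and north-west
    // neighbors, so the anti-diagonals form the parallel units.
    for (int i = 1; i < n; i++) {
      for (int j = 1; j < n; j++) {
        grid[i*n + j] = grid[(i-1)*n + j] + grid[i*n + (j-1)] - grid[(i-1)*n + (j-1)];
      }
    }
    // negated copy back to the corner creates a dependence between iterations
    grid[0] = -grid[(n-1)*n + (n-1)];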
diff --git a/Cxx11/stencil-cudastf.cu b/Cxx11/stencil-cudastf.cu
new file mode 100644
index 000000000..83778f6fc
--- /dev/null
+++ b/Cxx11/stencil-cudastf.cu
@@ -0,0 +1,255 @@
+///
+/// Copyright (c) 2013, Intel Corporation
+/// Copyright (c) 2024, NVIDIA
+///
+/// Redistribution and use in source and binary forms, with or without
+/// modification, are permitted provided that the following conditions
+/// are met:
+///
+/// * Redistributions of source code must retain the above copyright
+///       notice, this list of conditions and the following disclaimer.
+/// * Redistributions in binary form must reproduce the above
+///       copyright notice, this list of conditions and the following
+///       disclaimer in the documentation and/or other materials provided
+///       with the distribution.
+/// * Neither the name of Intel Corporation nor the names of its
+///       contributors may be used to endorse or promote products
+///       derived from this software without specific prior written
+///       permission.
+///
+/// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+/// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+/// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+/// FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+/// COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+/// INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+/// BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+/// LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+/// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+/// LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+/// ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+/// POSSIBILITY OF SUCH DAMAGE.
+
+//////////////////////////////////////////////////////////////////////
+///
+/// NAME:    Stencil
+///
+/// PURPOSE: This program tests the efficiency with which a space-invariant,
+///          linear, symmetric filter (stencil) can be applied to a square
+///          grid or image.
+///
+/// USAGE:   The program takes as input the linear
+///          dimension of the grid, and the number of iterations on the grid
+///
+///                <progname> <# iterations> <grid size>
+///
+///          The output consists of diagnostics to make sure the
+///          algorithm worked, and of timing statistics.
+///
+/// FUNCTIONS CALLED:
+///
+///          Other than standard C functions, the following functions are used in
+///          this program:
+///          wtime()
+///
+/// HISTORY: - Written by Rob Van der Wijngaart, February 2009.
+///            C++11-ification by Jeff Hammond, May 2017.
+///            CUDA STF by Cedric Augonnet, October 2024.
+///
+//////////////////////////////////////////////////////////////////////
+
+#include "prk_util.h"
+#include "prk_cuda.h"
+#include "stencil_cuda.hpp"
+#include <cuda/experimental/stf.cuh>
+
+using namespace cuda::experimental::stf;
+
+__global__ void nothing(const int n, const prk_float * in, prk_float * out)
+{
+    //printf("You are trying to use a stencil that does not exist.\n");
+    //printf("Please generate the new stencil using the code generator.\n");
+    // n will never be zero - this is to silence compiler warnings.
+    //if (n==0) printf("in=%p out=%p\n", in, out);
+    //abort();
+}
+
+int main(int argc, char* argv[])
+{
+  std::cout << "Parallel Research Kernels version " << PRKVERSION << std::endl;
+  std::cout << "C++11/CUDA STF Stencil execution on 2D grid" << std::endl;
+
+  prk::CUDA::info info;
+  info.print();
+
+  //////////////////////////////////////////////////////////////////////
+  // Process and test input parameters
+  //////////////////////////////////////////////////////////////////////
+
+  int iterations, n, radius, tile_size;
+  bool star = true;
+  try {
+      if (argc < 3) {
+        throw "Usage: <# iterations> <array dimension> [<tile_size> <star/grid> <radius>]";
+      }
+
+      // number of times to run the algorithm
+      iterations = std::atoi(argv[1]);
+      if (iterations < 1) {
+        throw "ERROR: iterations must be >= 1";
+      }
+
+      // linear grid dimension
+      n = std::atoi(argv[2]);
+      if (n < 1) {
+        throw "ERROR: grid dimension must be positive";
+      } else if (n > prk::get_max_matrix_size()) {
+        throw "ERROR: grid dimension too large - overflow risk";
+      }
+
+      // default tile size for tiling of local transpose
+      tile_size = 32;
+      if (argc > 3) {
+          tile_size = std::atoi(argv[3]);
+          if (tile_size <= 0) tile_size = n;
+          if (tile_size > n) tile_size = n;
+          if (tile_size > 32) {
+              std::cout << "Warning: tile_size > 32 may lead to incorrect results (observed for CUDA 9.0 on GV100).\n";
+          }
+      }
+
+      // stencil pattern
+      if (argc > 4) {
+          auto stencil = std::string(argv[4]);
+          auto grid = std::string("grid");
+          star = (stencil == grid) ? false : true;
+      }
+
+      // stencil radius
+      radius = 2;
+      if (argc > 5) {
+          radius = std::atoi(argv[5]);
+      }
+
+      if ( (radius < 1) || (2*radius+1 > n) ) {
+        throw "ERROR: Stencil radius negative or too large";
+      }
+  }
+  catch (const char * e) {
+    std::cout << e << std::endl;
+    return 1;
+  }
+
+  std::cout << "Number of iterations = " << iterations << std::endl;
+  std::cout << "Grid size            = " << n << std::endl;
+  std::cout << "Tile size            = " << tile_size << std::endl;
+  std::cout << "Type of stencil      = " << (star ? "star" : "grid") << std::endl;
"star" : "grid") << std::endl; + std::cout << "Radius of stencil = " << radius << std::endl; + + auto stencil = nothing; + if (star) { + switch (radius) { + case 1: stencil = star1; break; + case 2: stencil = star2; break; + case 3: stencil = star3; break; + case 4: stencil = star4; break; + case 5: stencil = star5; break; + } + } else { + switch (radius) { + case 1: stencil = grid1; break; + case 2: stencil = grid2; break; + case 3: stencil = grid3; break; + case 4: stencil = grid4; break; + case 5: stencil = grid5; break; + } + } + + dim3 dimGrid(prk::divceil(n,tile_size),prk::divceil(n,tile_size),1); + dim3 dimBlock(tile_size, tile_size, 1); + info.checkDims(dimBlock, dimGrid); + + ////////////////////////////////////////////////////////////////////// + // Allocate space and perform the computation + ////////////////////////////////////////////////////////////////////// + + double stencil_time{0}; + + context ctx; + + auto in = ctx.logical_data(shape_of>(n, n)); + auto out = ctx.logical_data(shape_of>(n, n)); + + ctx.parallel_for(in.shape(), in.write(), out.write())->*[]__device__(size_t i, size_t j, auto d_in, auto d_out) + { + d_in(i, j) = static_cast(i+j); + d_out(i, j) = static_cast(0); + }; + + for (int iter = 0; iter<=iterations; iter++) { + + if (iter==1) { + cudaStreamSynchronize(ctx.task_fence()); + stencil_time = prk::wtime(); + } + + // Apply the stencil operator + ctx.task(in.read(), out.rw())->*[&](cudaStream_t stream, auto d_in, auto d_out) + { + stencil<<>>(n, d_in.data_handle(), d_out.data_handle()); + }; + + // Add constant to solution to force refresh of neighbor data, if any + ctx.parallel_for(in.shape(), in.rw())->*[] __device__ (size_t i, size_t j, auto d_in) + { + d_in(i, j) += (prk_float)1; + }; + } + + cudaStreamSynchronize(ctx.task_fence()); + + stencil_time = prk::wtime() - stencil_time; + + ////////////////////////////////////////////////////////////////////// + // Analyze and output results. + ////////////////////////////////////////////////////////////////////// + + // interior of grid with respect to stencil + double norm = 0.0; + size_t active_points = static_cast(n-2*radius)*static_cast(n-2*radius); + ctx.host_launch(out.read())->*[&](auto h_out) + { + for (int i=radius; i epsilon) { + std::cout << "ERROR: L1 norm = " << norm + << " Reference L1 norm = " << reference_norm << std::endl; + return 1; + } else { + std::cout << "Solution validates" << std::endl; +#ifdef VERBOSE + std::cout << "L1 norm = " << norm + << " Reference L1 norm = " << reference_norm << std::endl; +#endif + const int stencil_size = star ? 4*radius+1 : (2*radius+1)*(2*radius+1); + size_t flops = (2L*(size_t)stencil_size+1L) * active_points; + auto avgtime = stencil_time/iterations; + std::cout << "Rate (MFlops/s): " << 1.0e-6 * static_cast(flops)/avgtime + << " Avg time (s): " << avgtime << std::endl; + } + + ctx.finalize(); + + return 0; +} diff --git a/Cxx11/transpose-cudastf.cu b/Cxx11/transpose-cudastf.cu new file mode 100644 index 000000000..ade42aced --- /dev/null +++ b/Cxx11/transpose-cudastf.cu @@ -0,0 +1,167 @@ +/// +/// Copyright (c) 2013, Intel Corporation +/// +/// Redistribution and use in source and binary forms, with or without +/// modification, are permitted provided that the following conditions +/// are met: +/// +/// * Redistributions of source code must retain the above copyright +/// notice, this list of conditions and the following disclaimer. 
diff --git a/Cxx11/transpose-cudastf.cu b/Cxx11/transpose-cudastf.cu
new file mode 100644
index 000000000..ade42aced
--- /dev/null
+++ b/Cxx11/transpose-cudastf.cu
@@ -0,0 +1,167 @@
+///
+/// Copyright (c) 2013, Intel Corporation
+///
+/// Redistribution and use in source and binary forms, with or without
+/// modification, are permitted provided that the following conditions
+/// are met:
+///
+/// * Redistributions of source code must retain the above copyright
+///       notice, this list of conditions and the following disclaimer.
+/// * Redistributions in binary form must reproduce the above
+///       copyright notice, this list of conditions and the following
+///       disclaimer in the documentation and/or other materials provided
+///       with the distribution.
+/// * Neither the name of Intel Corporation nor the names of its
+///       contributors may be used to endorse or promote products
+///       derived from this software without specific prior written
+///       permission.
+///
+/// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+/// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+/// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+/// FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+/// COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+/// INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+/// BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+/// LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+/// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+/// LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+/// ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+/// POSSIBILITY OF SUCH DAMAGE.
+
+//////////////////////////////////////////////////////////////////////
+///
+/// NAME:    transpose
+///
+/// PURPOSE: This program measures the time for the transpose of a
+///          column-major stored matrix into a row-major stored matrix.
+///
+/// USAGE:   Program input is the matrix order and the number of times to
+///          repeat the operation:
+///
+///          transpose <# iterations> <matrix order>
+///
+///          The output consists of diagnostics to make sure the
+///          transpose worked and timing statistics.
+///
+/// HISTORY: Written by Rob Van der Wijngaart, February 2009.
+///          Converted to C++11 by Jeff Hammond, February 2016 and May 2017.
+///          CUDASTF version by Cedric Augonnet, November 2024.
+///
+//////////////////////////////////////////////////////////////////////
+
+#include "prk_util.h"
+#include <cuda/experimental/stf.cuh>
+
+using namespace cuda::experimental::stf;
+
+int main(int argc, char * argv[])
+{
+  std::cout << "Parallel Research Kernels version " << PRKVERSION << std::endl;
+  std::cout << "C++17/CUDASTF Matrix transpose: B = A^T" << std::endl;
+
+  //////////////////////////////////////////////////////////////////////
+  // Read and test input parameters
+  //////////////////////////////////////////////////////////////////////
+
+  int iterations;
+  int order;
+  try {
+      if (argc < 3) {
+        throw "Usage: <# iterations> <matrix order>";
+      }
+
+      // number of times to do the transpose
+      iterations = std::atoi(argv[1]);
+      if (iterations < 1) {
+        throw "ERROR: iterations must be >= 1";
+      }
+
+      // order of the matrix
+      order = std::atoi(argv[2]);
+      if (order <= 0) {
+        throw "ERROR: Matrix Order must be greater than 0";
+      } else if (order > prk::get_max_matrix_size()) {
+        throw "ERROR: matrix dimension too large - overflow risk";
+      }
+  }
+  catch (const char * e) {
+    std::cout << e << std::endl;
+    return 1;
+  }
+
+  context ctx;
+
+  std::cout << "Number of iterations = " << iterations << std::endl;
+  std::cout << "Matrix order         = " << order << std::endl;
+
+  //////////////////////////////////////////////////////////////////////
+  // Allocate space and perform the computation
+  //////////////////////////////////////////////////////////////////////
+
+  double trans_time{0};
+
+  auto A = ctx.logical_data(shape_of<slice<double, 2>>(order, order));
+  auto B = ctx.logical_data(shape_of<slice<double, 2>>(order, order));
+
+  ctx.parallel_for(A.shape(), A.write(), B.write())->*[] __device__ (size_t i, size_t j, auto dA, auto dB)
+  {
+      dA(i, j) = static_cast<double>(i*dA.extent(0)+j);
+      dB(i, j) = 0.0;
+  };
+
+  for (int iter = 0; iter<=iterations; iter++) {
+    if (iter==1) {
+        cudaStreamSynchronize(ctx.task_fence());
+        trans_time = prk::wtime();
+    }
+
+    ctx.parallel_for(B.shape(), A.rw(), B.rw())->*[] __device__ (size_t i, size_t j, auto dA, auto dB)
+    {
+        dB(i, j) += dA(j, i);
+        dA(j, i) += 1.0;
+    };
+  }
+
+  cudaStreamSynchronize(ctx.task_fence());
+  trans_time = prk::wtime() - trans_time;
+
+  //////////////////////////////////////////////////////////////////////
+  /// Analyze and output results
+  //////////////////////////////////////////////////////////////////////
+
+  double abserr(0);
+  ctx.host_launch(B.read())->*[&](auto hB) {
+    const auto addit = (iterations+1.) * (iterations/2.);
+    for (int j=0; j<order; j++) {
+      for (int i=0; i<order; i++) {
+        const size_t ij = (size_t)i*(size_t)order+(size_t)j;
+        const auto reference = static_cast<double>(ij)*(1.+iterations)+addit;
+        abserr += prk::abs(hB(j, i) - reference);
+      }
+    }
+  };
+
+  ctx.finalize();
+
+#ifdef VERBOSE
+  std::cout << "Sum of absolute differences: " << abserr << std::endl;
+#endif
+
+  const auto epsilon = 1.0e-8;
+  if (abserr < epsilon) {
+    std::cout << "Solution validates" << std::endl;
+    auto avgtime = trans_time/iterations;
+    auto bytes = (size_t)order * (size_t)order * sizeof(double);
+    std::cout << "Rate (MB/s): " << 1.0e-6 * (2L*bytes)/avgtime
+              << " Avg time (s): " << avgtime << std::endl;
+  } else {
+    std::cout << "ERROR: Aggregate absolute error " << abserr
+              << " exceeds threshold " << epsilon << std::endl;
+    return 1;
+  }
+
+  return 0;
+}
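Why that reference value is right, worked through since it is easy to misread: A is initialized to A(i,j) = i*order + j and incremented by 1 after each of the iterations+1 sweeps, so B(j,i) accumulates iterations+1 copies of the initial A(i,j) plus the extra increments 0 + 1 + ... + iterations. A host-side sketch of the per-element check (helper name is hypothetical):

    // Expected value of B(j,i) after (iterations+1) sweeps of
    // "B += A^T; A += 1" with A(i,j) initialized to i*order + j.
    static double transpose_reference(int order, int iterations, int i, int j)
    {
      const double addit = (iterations + 1.0) * (iterations / 2.0); // 0+1+...+iterations
      return (double)((size_t)i * order + j) * (1.0 + iterations) + addit;
    }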
diff --git a/ci/install-cudastf.sh b/ci/install-cudastf.sh
new file mode 100644
index 000000000..7bb845528
--- /dev/null
+++ b/ci/install-cudastf.sh
@@ -0,0 +1,11 @@
+#!/bin/bash
+
+CI_ROOT="$1"
+
+cd $CI_ROOT
+
+TARGETDIR=$CI_ROOT/stf/
+
+mkdir -p $TARGETDIR
+
+git clone https://github.com/NVIDIA/cccl.git $TARGETDIR/cccl/
diff --git a/common/make.defs.cuda b/common/make.defs.cuda
index 6c573d51e..97e6040c2 100644
--- a/common/make.defs.cuda
+++ b/common/make.defs.cuda
@@ -1,16 +1,24 @@
 #
-# This file shows the CUDA+GCC toolchain options.
+# This file shows the CUDA toolchain options
+# for both NVHPC and GCC.
+NVHPC_PATH=/opt/nvidia/hpc_sdk/Linux_$$(uname -m)/24.9
+#NVHPC_PATH=/proj/nv/Linux_$$(uname -m)/21.11
+#NVHPC_PATH=${HOME}/NVIDIA/hpc_sdk/Linux_$$(uname -m)/2021
+NVHPC_CBIN=${NVHPC_PATH}/compilers/bin/
 #
 # Base compilers and language options
 #
 VERSION=
 # C99 is required in some implementations.
-CC=gcc${VERSION} -std=gnu11
+#CC=gcc${VERSION} -std=gnu11
+CC=${NVHPC_CBIN}nvc -c11 -march=zen4
 #EXTRA_CLIBS=-lrt
 # All of the Fortran code is written for the 2008 standard and requires preprocessing.
-FC=gfortran${VERSION} -std=f2008 -cpp
+#FC=gfortran${VERSION} -std=f2008 -cpp
+FC=${NVHPC_CBIN}nvfortran -DNVHPC -march=zen4
 # C++11 may not be required but does no harm here.
-CXX=g++${VERSION} -std=gnu++17
+#CXX=g++${VERSION} -std=gnu++17
+CXX=${NVHPC_CBIN}nvc++ -std=gnu++20 -march=zen4
 #
 # Compiler flags
 #
@@ -30,16 +38,30 @@ DEFAULT_OPT_FLAGS+=-Wno-ignored-attributes -Wno-deprecated-declarations
 #
 # OpenMP flags
 #
-OPENMPFLAG=-fopenmp
-OPENMPSIMDFLAG=-fopenmp-simd
-OFFLOADFLAG=-foffload="-O3 -v"
+#OPENMPFLAG=-fopenmp
+#OPENMPSIMDFLAG=-fopenmp-simd
+#OFFLOADFLAG=-foffload="-O3 -v"
+#OPENACCFLAG=-fopenacc
+OPENMPFLAG=-mp
+#OPENMPFLAG+=-Minfo=mp,vect
+OPENMPSIMDFLAG=
+OFFLOADFLAG=-mp -target=gpu -gpu=managed
+OFFLOADFLAG+=-Minfo=accel
 OFFLOADFLAG+=-DGPU_SCHEDULE="schedule(static,1)"
-OPENACCFLAG=-fopenacc
+OPENACCFLAG=-acc -target=gpu
+OPENACCFLAG+=-Mlarge_arrays
+OPENACCFLAG+=-Minfo=accel
+STDPARFLAG=-stdpar=gpu -gpu=managed
+STDPARFLAG+=-Minfo=accel
+STDPARFLAG+=-cudalib=cublas,cutensor
+CUFORTFLAG=-cuda -gpu=managed -acc # ACC required for CUF+managed
+CUFORTFLAG+=-Minfo=accel
 #
 # OpenCL flags
 #
-OPENCLDIR=/usr/local/cuda-12.1/targets/x86_64-linux
-OPENCLFLAG=-I${OPENCLDIR}/include -L${OPENCLDIR}/lib64 -lOpenCL
+#OPENCLDIR=/usr/local/cuda-12.6/targets/x86_64-linux
+OPENCLDIR=/usr/local/cuda/targets/$$(uname -m)-linux
+OPENCLFLAG=-I${OPENCLDIR}/include -L${OPENCLDIR}/lib64 -L${OPENCLDIR}/lib -lOpenCL
 #OPENCLFLAG+=-Wno-ignored-attributes -Wno-deprecated-declarations
 #OPENCLFLAG+=-Wno-deprecated-declarations -Wno-missing-braces
 #
@@ -110,6 +132,7 @@ OPENCLFLAG=-I${OPENCLDIR}/include -L${OPENCLDIR}/lib64 -lOpenCL
 #
 TBBDIR=/usr/local/Cellar/tbb/2019_U5_1
 TBBFLAG=-I${TBBDIR}/include -L${TBBDIR}/lib -ltbb
+#TBBFLAG=-L/usr/lib/$$(uname -m)-linux-gnu -ltbb
 #
 # Parallel STL, Boost, etc.
 #
@@ -118,15 +141,17 @@ RANGEFLAG=-DUSE_BOOST_IRANGE ${BOOSTFLAG}
 #RANGEFLAG=-DUSE_RANGES_TS -I./range-v3/include
 PSTLFLAG=${OPENMPSIMDFLAG} ${TBBFLAG} -I./pstl/include ${RANGEFLAG}
 KOKKOSDIR=/opt/kokkos/gcc
+PRK_KOKKOS_BACKEND=Cuda
+KOKKOSCXX=${KOKKOSDIR}/bin/nvcc_wrapper
 KOKKOSFLAG=-I${KOKKOSDIR}/include -L${KOKKOSDIR}/lib -lkokkos ${OPENMPFLAG}
 RAJADIR=/opt/raja/gcc
 RAJAFLAG=-I${RAJADIR}/include -L${RAJADIR}/lib -lRAJA ${OPENMPFLAG} ${TBBFLAG}
-THRUSTDIR=/opt/nvidia/thrust
+THRUSTDIR=/opt/nvidia/hpc_sdk/Linux_$$(uname -m)/21.11/compilers/include-stdpar
 THRUSTFLAG=-I${THRUSTDIR} ${RANGEFLAG}
 #
 # CBLAS for C++ DGEMM
 #
-BLASFLAG=-L/usr/lib/x86_64-linux-gnu/blis-openmp -lblis
+BLASFLAG=-L${NVHPC_PATH}/REDIST/compilers/lib -lblas
 CBLASFLAG=${BLASFLAG}
 #
 # CUDA flags
@@ -136,12 +161,19 @@ CBLASFLAG=${BLASFLAG}
 #
 # Linux w/ NVIDIA CUDA
 # NVCC never supports the latest GCC.
 # Use appropriate arch or code is compiled to ancient features.
-NVCC=/usr/local/cuda-12.1/bin/nvcc
-CUDAFLAGS=-g -O3 -std=c++11
+#NVCC=${NVHPC_CBIN}nvc++
+#NVCC=${NVHPC_CBIN}nvcc
+NVCC=/usr/local/cuda-12.6/bin/nvcc
+CUDAFLAGS=-g -O3 -std=c++20
+CUDAFLAGS+=--extended-lambda
 CUDAFLAGS+=--gpu-architecture=sm_89
-#CUDAFLAGS+=--compiler-bindir=/swtools/gcc/7.5.0/bin
+#CUDAFLAGS+=-allow-unsupported-compiler
+#CUDAFLAGS+=-ccbin=g++-13 -lm #-lstdc++
+#CUDAFLAGS+=--compiler-bindir=/opt/gcc/12.3.0/bin/
 #CUDAFLAGS+=-forward-unknown-to-host-compiler -fopenmp
 CUDAFLAGS+=-rdc=true # FIXES ptxas fatal : Unresolved extern function 'cudaCGGetIntrinsicHandle'
+#CUDAFLAGS+=-I${NVHPC_PATH}/math_libs/12.6/targets/$$(uname -m)-linux/include
+#CUDAFLAGS+=-L${NVHPC_PATH}/math_libs/12.6/targets/$$(uname -m)-linux/lib
 # https://github.com/tensorflow/tensorflow/issues/1066#issuecomment-200574233
 # heavy hammer:
 CUDAFLAGS+=-D_X86INTRIN_H_INCLUDED
@@ -165,16 +197,28 @@ CUDAFLAGS+=-D_X86INTRIN_H_INCLUDED
 #CUDAFLAGS+=-D_AVX512IFMAVLINTRIN_H_INCLUDED
 #CUDAFLAGS+=-D_AVX512ERINTRIN_H_INCLUDED
 #
+# CUDASTF
+#
+CUDASTF_DIR=../deps/stf/
+CUDASTF_CFLAGS=--expt-relaxed-constexpr
+CUDASTF_CFLAGS+=-I${CUDASTF_DIR}/cccl/cudax/include/
+CUDASTF_CFLAGS+=-I${CUDASTF_DIR}/cccl/libcudacxx/include
+CUDASTF_CFLAGS+=-lcuda
+#
 # MPI-3
 #
-MPIDIR=/usr/lib/aarch64-linux-gnu/openmpi
-MPICC=mpicc
-MPICXX=mpicxx
-MPIFC=mpifort
+# mpiicc wraps icc. mpicc and mpigcc wrap gcc.
+#MPIDIR=${NVHPC_PATH}/comm_libs/hpcx
+MPIDIR=${NVHPC_PATH}/comm_libs/12.6/openmpi4/latest
+MPICC=${MPIDIR}/bin/mpicc
+MPICXX=${MPIDIR}/bin/mpicxx
+MPIFORT=${MPIDIR}/bin/mpifort
 MPIINC=-I${MPIDIR}/include
 MPILIB=-L${MPIDIR}/lib -lmpi
+#MPILIB+=-Wl,-rpath -Wl,${MPIDIR}/lib -Wl,--enable-new-dtags # NVCC chokes on -Wl
+MPILIB+=-lopen-pal -lopen-rte
 #MPILIB=-L/usr/local/opt/libevent/lib -L${MPIDIR}/lib -lmpi
-#MPIINC=-I/usr/include/mpich-3.2-x86_64
+#MPIINC=-I/usr/include/mpich-3.2-$$(uname -m)
 #MPILIB=-L/usr/lib64/mpich-3.2/lib -lmpi
 #
 # Global Arrays
@@ -202,4 +246,3 @@ PETSCFLAG+=-Wl,-rpath=${PETSCDIR}/lib
 COARRAYFLAG=-fcoarray=single -lcaf_single
 # multi-node
 # COARRAYFLAG=-fcoarray=lib -lcaf_mpi
-