CPU internal utils (part1) (#9)

* move more internal utils on cpu * test gpu build
Cytnx-dev · Dec 23, 2024 · e7a9530 · e7a9530
1 parent 46b89e3
commit e7a9530
Show file tree

Hide file tree

Showing 13 changed files with 216 additions and 0 deletions.
diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -20,6 +20,8 @@ message(STATUS " Build Target: ${CMAKE_SYSTEM_NAME}-${CMAKE_SYSTEM_PROCESSOR}")
 message(STATUS " Installation Prefix: ${CMAKE_INSTALL_PREFIX}")
 set(CMAKE_POSITION_INDEPENDENT_CODE ON)
 
+
+
 # #####################################################################
 # Project
 # #####################################################################
@@ -109,6 +111,14 @@ else()
   target_link_libraries(${PKG_NAME} PUBLIC ${LAPACK_LIBRARIES})
 endif()
 
+# ###########
+# Options: USE_CUDA (default OFF) pulls in cmake/config_cuda.cmake for the GPU build
+# ###########
+option(USE_CUDA "Build using Nvidia CUDA for GPU library" OFF)
+if(USE_CUDA)
+  include(cmake/config_cuda.cmake)
+endif()
+
 
 ## install
 include(GNUInstallDirs)

diff --git a/README.md b/README.md
@@ -35,6 +35,10 @@ Running pytest:
 
 * most of the deps should be able to install via pypi.
 
+GPU:
+- CUDA Toolkit
+
+
 
 ## Compile directly the C++ package
 

diff --git a/cmake/config_cuda.cmake b/cmake/config_cuda.cmake
@@ -0,0 +1,38 @@
+# Configures ${PKG_NAME} for CUDA; included by the top-level CMakeLists.txt after the
+# target exists. Prints a status line and changes nothing else when USE_CUDA is off.
+if(USE_CUDA)
+    message(STATUS " Enable CUDA Support")
+    set(CYTNX_VARIANT_INFO "${CYTNX_VARIANT_INFO} UNI_CUDA")
+    enable_language(CUDA)
+    find_package(CUDAToolkit REQUIRED)
+    # FindCUDAToolkit reports its result in CUDAToolkit_FOUND (not CUDA_TOOLKIT_FOUND).
+    message(STATUS "CUDA: ${CUDAToolkit_FOUND}")
+    if(NOT DEFINED CMAKE_CUDA_STANDARD)
+        set(CMAKE_CUDA_STANDARD 17)
+        set(CMAKE_CUDA_STANDARD_REQUIRED ON)
+    endif()
+
+    # CMAKE_CUDA_STANDARD only seeds targets created after it is set. ${PKG_NAME} already
+    # exists at this point, so the standard has to be applied to it explicitly.
+    set_target_properties(${PKG_NAME} PROPERTIES
+        CUDA_STANDARD "${CMAKE_CUDA_STANDARD}"
+        CUDA_SEPARABLE_COMPILATION ON
+        CUDA_RESOLVE_DEVICE_SYMBOLS ON
+        CUDA_ARCHITECTURES "80;86;90"
+    )
+    if(CMAKE_CUDA_STANDARD_REQUIRED)
+        set_target_properties(${PKG_NAME} PROPERTIES CUDA_STANDARD_REQUIRED ON)
+    endif()
+
+    # nvcc flags: show diagnostic numbers, emit device line-number info, build 64-bit.
+    set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Xcudafe=--display_error_number -lineinfo -m64")
+
+    target_compile_definitions(${PKG_NAME} PUBLIC UNI_GPU)
+    target_include_directories(${PKG_NAME} PRIVATE ${CUDAToolkit_INCLUDE_DIRS})
+    target_link_libraries(${PKG_NAME} PUBLIC CUDA::toolkit)
+    target_link_libraries(${PKG_NAME} PUBLIC CUDA::cudart CUDA::cublas CUDA::cusparse CUDA::curand CUDA::cusolver)
+    # Link the CUDA device runtime library by name.
+    target_link_libraries(${PKG_NAME} PUBLIC -lcudadevrt)
+else()
+    message( STATUS " Build CUDA Support: NO")
+endif()
diff --git a/pyproject.toml b/pyproject.toml
@@ -17,6 +17,9 @@ minimum-version = "build-system.requires"
 build-dir = "build/{wheel_tag}"
 build.tool-args = ["-j4"]
 
+[tool.scikit-build.cmake.define]
+USE_CUDA = "OFF"
+
 # only use pypi registry
 [[tool.uv.index]]
 name = "pypi"

diff --git a/src/cpp/src/utils_internal/cpu/Alloc_cpu.cpp b/src/cpp/src/utils_internal/cpu/Alloc_cpu.cpp
@@ -0,0 +1,20 @@
+#include "Alloc_cpu.hpp"
+
+using namespace std;
+
+namespace cytnx_core {
+  namespace utils_internal {
+    /**
+     * @brief Allocate a zero-initialized host buffer of `N` elements of `perelem_bytes` bytes.
+     *
+     * Raises through cytnx_error_msg when a non-empty request cannot be satisfied.
+     *
+     * @param N the number of elements
+     * @param perelem_bytes the size of one element in bytes
+     * @return the pointer returned by calloc; it may be null when the request is empty
+     */
+    void* Calloc_cpu(const cytnx_uint64& N, const cytnx_uint64& perelem_bytes) {
+      void* tmp = calloc(N, perelem_bytes);
+      // calloc may return NULL whenever the total size is zero, so a null result only counts
+      // as a failure when both the element count and the element size are non-zero.
+      cytnx_error_msg(((tmp == NULL) && (N > 0) && (perelem_bytes > 0)),
+                      "[ERROR][calloc] Memory allocation failed.%s", "\n");
+      return tmp;
+    }
+    /**
+     * @brief Allocate an uninitialized host buffer of `bytes` bytes.
+     *
+     * Raises through cytnx_error_msg when a non-empty request cannot be satisfied.
+     *
+     * @param bytes the number of bytes to allocate
+     * @return the pointer returned by malloc; it may be null when `bytes` is zero
+     */
+    void* Malloc_cpu(const cytnx_uint64& bytes) {
+      void* tmp = malloc(bytes);
+      // A null result for a zero-byte request is permitted and is not treated as an error.
+      cytnx_error_msg(((tmp == NULL) && (bytes > 0)), "[ERROR][malloc] Memory allocation failed.%s",
+                      "\n");
+      return tmp;
+    }
+  }  // namespace utils_internal
+}  // namespace cytnx_core
diff --git a/src/cpp/src/utils_internal/cpu/Alloc_cpu.hpp b/src/cpp/src/utils_internal/cpu/Alloc_cpu.hpp
@@ -0,0 +1,20 @@
+#ifndef CYTNX_BACKEND_UTILS_INTERNAL_CPU_ALLOC_CPU_H_
+#define CYTNX_BACKEND_UTILS_INTERNAL_CPU_ALLOC_CPU_H_
+
+#include <cstdio>
+#include <cstdlib>
+#include <stdint.h>
+#include <climits>
+#include <cytnx_core/Type.hpp>
+#include <cytnx_core/errors/cytnx_error.hpp>
+
+namespace cytnx_core {
+  namespace utils_internal {
+    // Host allocation helpers, implemented in Alloc_cpu.cpp on top of calloc / malloc.
+    void* Calloc_cpu(const cytnx_uint64& N, const cytnx_uint64& perelem_bytes);
+    void* Malloc_cpu(const cytnx_uint64& bytes);
+    // Both report a failed allocation through cytnx_error_msg.
+  }  // namespace utils_internal
+}  // namespace cytnx_core
+
+#endif  // CYTNX_BACKEND_UTILS_INTERNAL_CPU_ALLOC_CPU_H_
diff --git a/src/cpp/src/utils_internal/cpu/CMakeLists.txt b/src/cpp/src/utils_internal/cpu/CMakeLists.txt
@@ -1,6 +1,11 @@
 target_sources_local(cytnx_core
   PRIVATE
 
+  Alloc_cpu.cpp
+  Alloc_cpu.hpp
   Complexmem_cpu.cpp
   Complexmem_cpu.hpp
+  Fill_cpu.hpp
+  SetZeros_cpu.cpp
+  SetZeros_cpu.hpp
 )
diff --git a/src/cpp/src/utils_internal/cpu/Fill_cpu.hpp b/src/cpp/src/utils_internal/cpu/Fill_cpu.hpp
@@ -0,0 +1,37 @@
+#ifndef CYTNX_BACKEND_UTILS_INTERNAL_CPU_FILL_CPU_H_
+#define CYTNX_BACKEND_UTILS_INTERNAL_CPU_FILL_CPU_H_
+
+#include <cytnx_core/Type.hpp>
+
+#ifdef UNI_OMP
+  #include <omp.h>
+#endif
+
+namespace cytnx_core {
+  namespace utils_internal {
+
+    /**
+     * @brief Write copies of `value` into `count` consecutive `DType` slots starting at `first`.
+     *
+     * The result matches what `std::fill_n` would produce on a `DType` view of `first`. The
+     * loop carries an OpenMP `parallel for` pragma with a static schedule, so the iterations
+     * are split across threads when the code is built with OpenMP.
+     *
+     * @tparam DType the element type stored in the destination range
+     *
+     * @param first pointer to the first element to overwrite
+     * @param value the value copied into every element
+     * @param count how many elements to overwrite
+     */
+    template <typename DType>
+    void FillCpu(void *first, const DType &value, cytnx_uint64 count) {
+      // View the untyped buffer as an array of DType.
+      DType *const dst = static_cast<DType *>(first);
+#pragma omp parallel for schedule(static)
+      for (cytnx_uint64 idx = 0; idx < count; ++idx) {
+        dst[idx] = value;
+      }
+    }
+  }  // namespace utils_internal
+}  // namespace cytnx_core
+
+#endif  // CYTNX_BACKEND_UTILS_INTERNAL_CPU_FILL_CPU_H_
diff --git a/src/cpp/src/utils_internal/cpu/SetZeros_cpu.cpp b/src/cpp/src/utils_internal/cpu/SetZeros_cpu.cpp
@@ -0,0 +1,9 @@
+#include "SetZeros_cpu.hpp"
+
+#include <cstring>
+
+using namespace std;
+
+namespace cytnx_core {
+  namespace utils_internal {
+    /**
+     * @brief Overwrite the first `bytes` bytes of the buffer at `c_ptr` with zero.
+     *
+     * @param c_ptr the beginning of the buffer
+     * @param bytes the number of bytes to clear; nothing is touched when it is zero
+     */
+    void SetZeros(void* c_ptr, const cytnx_uint64& bytes) {
+      // An empty buffer may legitimately come with a null pointer (a zero-byte malloc can
+      // return one), and memset must not be handed a null pointer, so stop early.
+      if (bytes == 0) return;
+      std::memset(c_ptr, 0, bytes);
+    }
+  }  // namespace utils_internal
+}  // namespace cytnx_core
diff --git a/src/cpp/src/utils_internal/cpu/SetZeros_cpu.hpp b/src/cpp/src/utils_internal/cpu/SetZeros_cpu.hpp
@@ -0,0 +1,19 @@
+#ifndef CYTNX_BACKEND_UTILS_INTERNAL_CPU_SETZEROS_CPU_H_
+#define CYTNX_BACKEND_UTILS_INTERNAL_CPU_SETZEROS_CPU_H_
+
+#include <cstdio>
+#include <cstdlib>
+#include <stdint.h>
+#include <climits>
+#include <cytnx_core/Type.hpp>
+#include <cytnx_core/errors/cytnx_error.hpp>
+
+namespace cytnx_core {
+  namespace utils_internal {
+    // Sets the first `bytes` bytes of the buffer at `c_ptr` to zero (see SetZeros_cpu.cpp).
+    void SetZeros(void* c_ptr, const cytnx_uint64& bytes);
+
+  }  // namespace utils_internal
+}  // namespace cytnx_core
+
+#endif  // CYTNX_BACKEND_UTILS_INTERNAL_CPU_SETZEROS_CPU_H_
diff --git a/src/cpp/src/utils_internal/gpu/CMakeLists.txt b/src/cpp/src/utils_internal/gpu/CMakeLists.txt
@@ -0,0 +1,7 @@
+target_sources_local(cytnx_core
+  PRIVATE
+  # CUDA allocation helpers; their contents are compiled only when UNI_GPU is defined.
+  cuAlloc_gpu.cu
+  cuAlloc_gpu.hpp
+
+)
diff --git a/src/cpp/src/utils_internal/gpu/cuAlloc_gpu.cu b/src/cpp/src/utils_internal/gpu/cuAlloc_gpu.cu
@@ -0,0 +1,24 @@
+#include "cuAlloc_gpu.hpp"
+
+using namespace std;
+
+namespace cytnx_core {
+  namespace utils_internal {
+#ifdef UNI_GPU
+    /**
+     * @brief Allocate a zero-filled CUDA managed buffer of `N` elements of `perelem_bytes` bytes.
+     *
+     * An empty request returns nullptr without calling into CUDA, because cudaMallocManaged
+     * rejects a size of 0. CUDA failures are reported through checkCudaErrors.
+     *
+     * @param N the number of elements
+     * @param perelem_bytes the size of one element in bytes
+     * @return the managed pointer, or nullptr when the total size is zero
+     */
+    void* cuCalloc_gpu(const cytnx_uint64& N, const cytnx_uint64& perelem_bytes) {
+      const cytnx_uint64 total_bytes = perelem_bytes * N;
+      // Dividing the product back detects a wrapped multiplication (assumes cytnx_uint64 is
+      // an unsigned integer type, as its name suggests).
+      cytnx_error_msg(((perelem_bytes != 0) && (total_bytes / perelem_bytes != N)),
+                      "[ERROR][cuCalloc] Requested allocation size overflows.%s", "\n");
+      if (total_bytes == 0) return nullptr;
+      void* ptr = nullptr;
+      checkCudaErrors(cudaMallocManaged(&ptr, total_bytes));
+      checkCudaErrors(cudaMemset(ptr, 0, total_bytes));
+      return ptr;
+    }
+    /**
+     * @brief Allocate a CUDA managed buffer of `bytes` bytes without initializing it.
+     *
+     * A zero-byte request returns nullptr without calling into CUDA, because
+     * cudaMallocManaged rejects a size of 0. CUDA failures are reported through
+     * checkCudaErrors.
+     *
+     * @param bytes the number of bytes to allocate
+     * @return the managed pointer, or nullptr when `bytes` is zero
+     */
+    void* cuMalloc_gpu(const cytnx_uint64& bytes) {
+      if (bytes == 0) return nullptr;
+      void* ptr = nullptr;
+      checkCudaErrors(cudaMallocManaged(&ptr, bytes));
+      return ptr;
+    }
+#endif
+  }  // namespace utils_internal
+}  // namespace cytnx_core
diff --git a/src/cpp/src/utils_internal/gpu/cuAlloc_gpu.hpp b/src/cpp/src/utils_internal/gpu/cuAlloc_gpu.hpp
@@ -0,0 +1,20 @@
+#ifndef CYTNX_BACKEND_UTILS_INTERNAL_GPU_CUALLOC_GPU_H_
+#define CYTNX_BACKEND_UTILS_INTERNAL_GPU_CUALLOC_GPU_H_
+
+#include <cstdio>
+#include <cstdlib>
+#include <stdint.h>
+#include <climits>
+#include <cytnx_core/Type.hpp>
+#include <cytnx_core/errors/cytnx_error.hpp>
+namespace cytnx_core {
+  namespace utils_internal {
+    // CUDA managed-memory allocators (cudaMallocManaged); declared only when UNI_GPU is defined.
+#ifdef UNI_GPU
+    void* cuCalloc_gpu(const cytnx_uint64& N, const cytnx_uint64& perelem_bytes);  // zero-filled
+    void* cuMalloc_gpu(const cytnx_uint64& bytes);  // contents not initialized
+#endif  // UNI_GPU
+  }  // namespace utils_internal
+}  // namespace cytnx_core
+
+#endif  // CYTNX_BACKEND_UTILS_INTERNAL_GPU_CUALLOC_GPU_H_
-Original file line number
+Diff line change
@@ Expand Up / @@ -35,6 +35,10 @@ Running pytest: @@
     * most of the deps should be able to install via pypi.
+    GPU:
+    - CUDA Toolkit
     ## Compile directly the C++ package
@@ Expand Down @@