.

zama-ai · Jul 25, 2024 · a4b5851 · a4b5851
1 parent ef33376
commit a4b5851
Show file tree

Hide file tree

Showing 33 changed files with 1,209 additions and 354 deletions.
diff --git a/backends/tfhe-cuda-backend/cuda/include/functions.h b/backends/tfhe-cuda-backend/cuda/include/functions.h
@@ -0,0 +1,16 @@
+#ifndef CUDA_FUNCTIONS_H_
+#define CUDA_FUNCTIONS_H_
+
+#include <cstdint>
+#include "polynomial/parameters.cuh"
+#include "polynomial/functions.cuh"
+
+extern "C" {
+void cuda_glwe_sample_extract_64(
+        void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, void *lwe_array_out,
+        void *glwe_array_in, uint32_t *nth_array, uint32_t num_samples, uint32_t glwe_dimension,
+        uint32_t
+        polynomial_size);
+}
+
+#endif
diff --git a/backends/tfhe-cuda-backend/cuda/include/integer.h b/backends/tfhe-cuda-backend/cuda/include/integer.h
@@ -36,6 +36,11 @@ enum COMPARISON_TYPE {
   MIN = 7,
 };
 
+enum COMPRESSION_MODE {
+  COMPRESS = 0,
+  DECOMPRESS = 1,
+};
+
 enum CMP_ORDERING { IS_INFERIOR = 0, IS_EQUAL = 1, IS_SUPERIOR = 2 };
 
 enum SIGNED_OPERATION { ADDITION = 1, SUBTRACTION = -1 };
@@ -203,6 +208,29 @@ void cuda_scalar_comparison_integer_radix_ciphertext_kb_64(
 void cleanup_cuda_integer_comparison(void **streams, uint32_t *gpu_indexes,
                                      uint32_t gpu_count, int8_t **mem_ptr_void);
 
+void scratch_cuda_compression_integer_radix_ciphertext_64(
+    void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, int8_t **mem_ptr,
+    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t lwe_dimension,
+    uint32_t ks_level, uint32_t ks_base_log, uint32_t pbs_level,
+    uint32_t pbs_base_log, uint32_t grouping_factor, uint32_t num_lwes,
+    uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
+    uint32_t lwe_per_glwe, uint32_t storage_log_modulus, COMPRESSION_MODE mode,
+    bool allocate_gpu_memory);
+
+void cuda_compression_compress_integer_radix_ciphertext_64(
+    void **streams, uint32_t *gpu_indexes, uint32_t gpu_count,
+    void *glwe_array_out, void *lwe_array_in, void **fp_ksk, uint32_t num_lwes,
+    int8_t *mem_ptr);
+
+void cuda_compression_decompress_integer_radix_ciphertext_64(
+    void **streams, uint32_t *gpu_indexes, uint32_t gpu_count,
+    void *glwe_array_out, void *lwe_array_in, void **fp_ksk, uint32_t num_lwes,
+    int8_t *mem_ptr);
+
+void cleanup_cuda_compression_integer_radix_ciphertext_64(
+    void **streams, uint32_t *gpu_indexes, uint32_t gpu_count,
+    int8_t **mem_ptr_void);
+
 void scratch_cuda_integer_radix_bitop_kb_64(
     void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, int8_t **mem_ptr,
     uint32_t glwe_dimension, uint32_t polynomial_size,
@@ -792,6 +820,37 @@ template <typename Torus> struct int_radix_lut {
   }
 };
 
+template <typename Torus> struct int_compression {
+  COMPRESSION_MODE mode;
+  int_radix_params params;
+  uint32_t storage_log_modulus;
+  uint32_t lwe_per_glwe;
+
+  Torus *tmp_lwe_shifted;
+
+  int_compression(cudaStream_t *streams, uint32_t *gpu_indexes,
+                  uint32_t gpu_count, int_radix_params params,
+                  uint32_t num_radix_blocks, uint32_t lwe_per_glwe,
+                  uint32_t storage_log_modulus, COMPRESSION_MODE mode,
+                  bool allocate_gpu_memory) {
+    this->mode = mode;
+    this->params = params;
+    this->lwe_per_glwe = lwe_per_glwe;
+    this->storage_log_modulus = storage_log_modulus;
+
+    if (allocate_gpu_memory) {
+      int glwe_accumulator_size =
+          (params.glwe_dimension + 1) * params.polynomial_size;
+      tmp_lwe_shifted = (Torus *)cuda_malloc_async(
+          num_radix_blocks * (params.big_lwe_dimension + 1), streams[0],
+          gpu_indexes[0]);
+    }
+  }
+  void release(cudaStream_t *streams, uint32_t *gpu_indexes,
+               uint32_t gpu_count) {
+      cuda_drop_async(tmp_lwe_shifted, streams[0], gpu_indexes[0]);
+  }
+};
 template <typename Torus> struct int_bit_extract_luts_buffer {
   int_radix_params params;
   int_radix_lut<Torus> *lut;

diff --git a/backends/tfhe-cuda-backend/cuda/include/keyswitch.h b/backends/tfhe-cuda-backend/cuda/include/keyswitch.h
@@ -30,7 +30,6 @@ void cuda_fp_keyswitch_lwe_list_to_glwe_64(
     void *fp_ksk_array, uint32_t input_lwe_dimension,
     uint32_t output_glwe_dimension, uint32_t output_polynomial_size,
     uint32_t base_log, uint32_t level_count, uint32_t num_lwes);
-    uint32_t base_log, uint32_t level_count, uint32_t num_samples);
 }
 
 #endif // CNCRT_KS_H_
diff --git a/backends/tfhe-cuda-backend/cuda/src/CMakeLists.txt b/backends/tfhe-cuda-backend/cuda/src/CMakeLists.txt
@@ -1,17 +1,3 @@
-set(SOURCES
-    ${CMAKE_SOURCE_DIR}/${INCLUDE_DIR}/bit_extraction.h
-    ${CMAKE_SOURCE_DIR}/${INCLUDE_DIR}/bitwise_ops.h
-    ${CMAKE_SOURCE_DIR}/${INCLUDE_DIR}/bootstrap.h
-    ${CMAKE_SOURCE_DIR}/${INCLUDE_DIR}/bootstrap_multibit.h
-    ${CMAKE_SOURCE_DIR}/${INCLUDE_DIR}/ciphertext.h
-    ${CMAKE_SOURCE_DIR}/${INCLUDE_DIR}/circuit_bootstrap.h
-    ${CMAKE_SOURCE_DIR}/${INCLUDE_DIR}/device.h
-    ${CMAKE_SOURCE_DIR}/${INCLUDE_DIR}/integer.h
-    ${CMAKE_SOURCE_DIR}/${INCLUDE_DIR}/keyswitch.h
-    ${CMAKE_SOURCE_DIR}/${INCLUDE_DIR}/linear_algebra.h
-    ${CMAKE_SOURCE_DIR}/${INCLUDE_DIR}/shifts.h
-    ${CMAKE_SOURCE_DIR}/${INCLUDE_DIR}/vertical_packing.h
-    ${CMAKE_SOURCE_DIR}/${INCLUDE_DIR}/helper_multi_gpu.h)
 file(GLOB_RECURSE SOURCES "*.cu")
 add_library(tfhe_cuda_backend STATIC ${SOURCES})
 set_target_properties(tfhe_cuda_backend PROPERTIES CUDA_SEPARABLE_COMPILATION ON CUDA_RESOLVE_DEVICE_SYMBOLS ON)

diff --git a/backends/tfhe-cuda-backend/cuda/src/crypto/keyswitch.cu b/backends/tfhe-cuda-backend/cuda/src/crypto/keyswitch.cu
@@ -10,7 +10,7 @@ void cuda_keyswitch_lwe_ciphertext_vector_32(
     void *lwe_output_indexes, void *lwe_array_in, void *lwe_input_indexes,
     void *ksk, uint32_t lwe_dimension_in, uint32_t lwe_dimension_out,
     uint32_t base_log, uint32_t level_count, uint32_t num_samples) {
-  cuda_keyswitch_lwe_ciphertext_vector(
+  host_keyswitch_lwe_ciphertext_vector(
       static_cast<cudaStream_t>(stream), gpu_index,
       static_cast<uint32_t *>(lwe_array_out),
       static_cast<uint32_t *>(lwe_output_indexes),
@@ -40,7 +40,7 @@ void cuda_keyswitch_lwe_ciphertext_vector_64(
     void *lwe_output_indexes, void *lwe_array_in, void *lwe_input_indexes,
     void *ksk, uint32_t lwe_dimension_in, uint32_t lwe_dimension_out,
     uint32_t base_log, uint32_t level_count, uint32_t num_samples) {
-  cuda_keyswitch_lwe_ciphertext_vector(
+  host_keyswitch_lwe_ciphertext_vector(
       static_cast<cudaStream_t>(stream), gpu_index,
       static_cast<uint64_t *>(lwe_array_out),
       static_cast<uint64_t *>(lwe_output_indexes),

diff --git a/backends/tfhe-cuda-backend/cuda/src/crypto/keyswitch.cuh b/backends/tfhe-cuda-backend/cuda/src/crypto/keyswitch.cuh
@@ -147,7 +147,7 @@ void execute_keyswitch_async(cudaStream_t *streams, uint32_t *gpu_indexes,
         GET_VARIANT_ELEMENT(lwe_input_indexes, i);
 
     // Compute Keyswitch
-    cuda_keyswitch_lwe_ciphertext_vector<Torus>(
+    host_keyswitch_lwe_ciphertext_vector<Torus>(
         streams[i], gpu_indexes[i], current_lwe_array_out,
         current_lwe_output_indexes, current_lwe_array_in,
         current_lwe_input_indexes, ksks[i], lwe_dimension_in, lwe_dimension_out,

diff --git a/backends/tfhe-cuda-backend/cuda/src/crypto/torus.cuh b/backends/tfhe-cuda-backend/cuda/src/crypto/torus.cuh
@@ -2,7 +2,9 @@
 #define CNCRT_TORUS_CUH
 
 #include "types/int128.cuh"
+#include "utils/kernel_dimensions.cuh"
 #include <limits>
+#include "device.h"
 
 template <typename T>
 __device__ inline void typecast_double_to_torus(double x, T &r) {
@@ -39,19 +41,41 @@ __device__ inline T round_to_closest_multiple(T x, uint32_t base_log,
 }
 
 template <typename T>
-__device__ __forceinline__ void modulus_switch(T input, T &output,
-                                               uint32_t log_modulus) {
+__device__ __forceinline__ void apply_modulus_switch(T input, T &output,
+                                                     uint32_t log_modulus) {
   constexpr uint32_t BITS = sizeof(T) * 8;
 
   output = input + (((T)1) << (BITS - log_modulus - 1));
   output >>= (BITS - log_modulus);
 }
 
 template <typename T>
-__device__ __forceinline__ T modulus_switch(T input, uint32_t log_modulus) {
+__device__ __forceinline__ T apply_modulus_switch(T input,
+                                                  uint32_t log_modulus) {
   T output;
-  modulus_switch(input, output, log_modulus);
+  apply_modulus_switch(input, output, log_modulus);
   return output;
 }
 
+template <typename Torus>
+__global__ void apply_modulus_switch_inplace(Torus *array, int size,
+                                             uint32_t log_modulus) {
+  const int tid = threadIdx.x + blockIdx.x * blockDim.x;
+  if (tid < size) {
+    array[tid] = apply_modulus_switch(array[tid], log_modulus);
+  }
+}
+
+template <typename Torus>
+__host__ void modulus_switch(cudaStream_t stream, uint32_t gpu_index,
+                             Torus *array, int size, uint32_t log_modulus) {
+
+  int num_threads = 0, num_blocks = 0;
+  getNumBlocksAndThreads(size, 1024, num_blocks, num_threads);
+
+  apply_modulus_switch_inplace<<<num_blocks, num_threads, 0, stream>>>(
+      array, size, log_modulus);
+  check_cuda_error(cudaGetLastError());
+}
+
 #endif // CNCRT_TORUS_H
diff --git a/backends/tfhe-cuda-backend/cuda/src/integer/compression/compression.cu b/backends/tfhe-cuda-backend/cuda/src/integer/compression/compression.cu
@@ -0,0 +1,51 @@
+#include "compression.cuh"
+
+void scratch_cuda_compression_integer_radix_ciphertext_64(
+    void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, int8_t **mem_ptr,
+    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t lwe_dimension,
+    uint32_t ks_level, uint32_t ks_base_log, uint32_t pbs_level,
+    uint32_t pbs_base_log, uint32_t grouping_factor, uint32_t num_lwes,
+    uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
+    uint32_t lwe_per_glwe, uint32_t storage_log_modulus, COMPRESSION_MODE mode,
+    bool allocate_gpu_memory) {
+
+  int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
+                          glwe_dimension * polynomial_size, lwe_dimension,
+                          ks_level, ks_base_log, pbs_level, pbs_base_log,
+                          grouping_factor, message_modulus, carry_modulus);
+
+  scratch_cuda_compression_integer_radix_ciphertext_64(
+      (cudaStream_t *)(streams), gpu_indexes, gpu_count,
+      (int_compression<uint64_t> **)mem_ptr, num_lwes, params, lwe_per_glwe,
+      storage_log_modulus, mode, allocate_gpu_memory);
+}
+void cuda_compression_compress_integer_radix_ciphertext_64(
+    void **streams, uint32_t *gpu_indexes, uint32_t gpu_count,
+    void *glwe_array_out, void *lwe_array_in, void **fp_ksk, uint32_t num_lwes,
+    int8_t *mem_ptr) {
+
+  host_integer_compression_compress<uint64_t>(
+      (cudaStream_t *)(streams), gpu_indexes, gpu_count,
+      static_cast<uint64_t *>(glwe_array_out),
+      static_cast<uint64_t *>(lwe_array_in), (uint64_t **)(fp_ksk), num_lwes,
+      (int_compression<uint64_t> *)mem_ptr);
+}
+void cuda_compression_decompress_integer_radix_ciphertext_64(
+    void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, void *lwe_out,
+    void *glwe_array_in, void **bsks, int8_t *mem_ptr) {
+
+    host_integer_compression_decompress<uint64_t>(
+        (cudaStream_t *)(streams), gpu_indexes, gpu_count,
+        static_cast<uint64_t *>(lwe_out),
+        static_cast<uint64_t *>(glwe_array_in),bsks,
+        (int_compression<uint64_t> *)mem_ptr);
+}
+
+void cleanup_cuda_compression_integer_radix_ciphertext_64(
+    void **streams, uint32_t *gpu_indexes, uint32_t gpu_count,
+    int8_t **mem_ptr_void) {
+
+  int_compression<uint64_t> *mem_ptr =
+      (int_compression<uint64_t> *)(*mem_ptr_void);
+  mem_ptr->release((cudaStream_t *)(streams), gpu_indexes, gpu_count);
+}
diff --git a/backends/tfhe-cuda-backend/cuda/src/integer/compression/compression.cuh b/backends/tfhe-cuda-backend/cuda/src/integer/compression/compression.cuh
@@ -0,0 +1,68 @@
+#ifndef CUDA_INTEGER_COMPRESSION_CUH
+#define CUDA_INTEGER_COMPRESSION_CUH
+
+#include "crypto/keyswitch.cuh"
+#include "device.h"
+#include "integer.h"
+#include "integer/integer.cuh"
+#include "linearalgebra/multiplication.cuh"
+#include "polynomial/functions.cuh"
+#include "utils/kernel_dimensions.cuh"
+
+template <typename Torus>
+__host__ void host_integer_compression_compress(
+    cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
+    Torus *glwe_array_out, Torus *lwe_array_in, Torus **fp_ksk,
+    uint32_t num_lwes, int_compression<Torus> *mem_ptr) {
+  auto params = mem_ptr->params;
+
+  // Shift
+  auto lwe_shifted = mem_ptr->tmp_lwe_shifted;
+  host_cleartext_multiplication(streams[0], gpu_indexes[0], lwe_shifted,
+                                lwe_array_in, (uint64_t)params.message_modulus,
+                                params.big_lwe_dimension, num_lwes);
+
+  uint32_t lwe_in_size = params.big_lwe_dimension + 1;
+  uint32_t glwe_out_size = (params.glwe_dimension + 1) * params.polynomial_size;
+  uint32_t num_glwes = num_lwes / mem_ptr->lwe_per_glwe;
+
+  // Keyswitch LWEs to GLWE
+  for (int i = 0; i < num_glwes; i++) {
+    auto lwe_subset = lwe_shifted + i * lwe_in_size;
+    auto glwe_out = glwe_array_out + i * glwe_out_size;
+
+    host_fp_keyswitch_lwe_list_to_glwe(
+        streams[0], gpu_indexes[0], glwe_out, lwe_subset, fp_ksk[0],
+        params.big_lwe_dimension, params.glwe_dimension, params.polynomial_size,
+        params.ks_base_log, params.ks_level, mem_ptr->lwe_per_glwe);
+  }
+
+  // Modulus switch
+  int num_blocks = 0, num_threads = 0;
+  getNumBlocksAndThreads(glwe_out_size, 512, num_blocks, num_threads);
+  apply_modulus_switch_inplace<<<num_blocks, num_threads, 0, streams[0]>>>(
+      glwe_array_out, num_glwes * glwe_out_size, mem_ptr->storage_log_modulus);
+}
+
+
+template <typename Torus>
+__host__ void host_integer_compression_decompress(
+    cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
+    Torus *lwe_out, Torus *glwe_array_in, void **bsks,
+    int_compression<Torus> *mem_ptr) {
+
+}
+
+template <typename Torus>
+__host__ void scratch_cuda_compression_integer_radix_ciphertext_64(
+    cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
+    int_compression<Torus> **mem_ptr, uint32_t num_lwes,
+    int_radix_params params, uint32_t lwe_per_glwe,
+    uint32_t storage_log_modulus, COMPRESSION_MODE mode,
+    bool allocate_gpu_memory) {
+
+  *mem_ptr = new int_compression<Torus>(
+      streams, gpu_indexes, gpu_count, params, num_lwes, lwe_per_glwe,
+      storage_log_modulus, mode, allocate_gpu_memory);
+}
+#endif
diff --git a/backends/tfhe-cuda-backend/cuda/src/linearalgebra/multiplication.cu b/backends/tfhe-cuda-backend/cuda/src/linearalgebra/multiplication.cu
@@ -9,12 +9,12 @@ void cuda_mult_lwe_ciphertext_vector_cleartext_vector_32(
     void *cleartext_array_in, uint32_t input_lwe_dimension,
     uint32_t input_lwe_ciphertext_count) {
 
-  host_cleartext_multiplication(static_cast<cudaStream_t>(stream), gpu_index,
-                                static_cast<uint32_t *>(lwe_array_out),
-                                static_cast<uint32_t *>(lwe_array_in),
-                                static_cast<uint32_t *>(cleartext_array_in),
-                                input_lwe_dimension,
-                                input_lwe_ciphertext_count);
+  host_cleartext_vec_multiplication(
+      static_cast<cudaStream_t>(stream), gpu_index,
+      static_cast<uint32_t *>(lwe_array_out),
+      static_cast<uint32_t *>(lwe_array_in),
+      static_cast<uint32_t *>(cleartext_array_in), input_lwe_dimension,
+      input_lwe_ciphertext_count);
 }
 /*
  * Perform the multiplication of a u64 input LWE ciphertext vector with a u64
@@ -49,10 +49,10 @@ void cuda_mult_lwe_ciphertext_vector_cleartext_vector_64(
     void *cleartext_array_in, uint32_t input_lwe_dimension,
     uint32_t input_lwe_ciphertext_count) {
 
-  host_cleartext_multiplication(static_cast<cudaStream_t>(stream), gpu_index,
-                                static_cast<uint64_t *>(lwe_array_out),
-                                static_cast<uint64_t *>(lwe_array_in),
-                                static_cast<uint64_t *>(cleartext_array_in),
-                                input_lwe_dimension,
-                                input_lwe_ciphertext_count);
+  host_cleartext_vec_multiplication(
+      static_cast<cudaStream_t>(stream), gpu_index,
+      static_cast<uint64_t *>(lwe_array_out),
+      static_cast<uint64_t *>(lwe_array_in),
+      static_cast<uint64_t *>(cleartext_array_in), input_lwe_dimension,
+      input_lwe_ciphertext_count);
 }