feat(gpu): implement CompressedCudaCiphertextList, and public functio…

…nal packing keyswitch
zama-ai · Jul 25, 2024 · 9c29d8d · 9c29d8d
1 parent d3ea654
commit 9c29d8d
Show file tree

Hide file tree

Showing 33 changed files with 1,873 additions and 84 deletions.
diff --git a/backends/tfhe-cuda-backend/cuda/include/integer.h b/backends/tfhe-cuda-backend/cuda/include/integer.h
@@ -36,6 +36,11 @@ enum COMPARISON_TYPE {
   MIN = 7,
 };
 
+enum COMPRESSION_MODE {
+  COMPRESS = 0,
+  DECOMPRESS = 1,
+};
+
 enum CMP_ORDERING { IS_INFERIOR = 0, IS_EQUAL = 1, IS_SUPERIOR = 2 };
 
 enum SIGNED_OPERATION { ADDITION = 1, SUBTRACTION = -1 };
@@ -203,6 +208,29 @@ void cuda_scalar_comparison_integer_radix_ciphertext_kb_64(
 void cleanup_cuda_integer_comparison(void **streams, uint32_t *gpu_indexes,
                                      uint32_t gpu_count, int8_t **mem_ptr_void);
 
+void scratch_cuda_compression_integer_radix_ciphertext_64(
+    void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, int8_t **mem_ptr,
+    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t lwe_dimension,
+    uint32_t ks_level, uint32_t ks_base_log, uint32_t pbs_level,
+    uint32_t pbs_base_log, uint32_t grouping_factor, uint32_t num_lwes,
+    uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
+    uint32_t lwe_per_glwe, uint32_t storage_log_modulus, COMPRESSION_MODE mode,
+    bool allocate_gpu_memory);
+
+void cuda_compression_compress_integer_radix_ciphertext_64(
+    void **streams, uint32_t *gpu_indexes, uint32_t gpu_count,
+    void *glwe_array_out, void *lwe_array_in, void **fp_ksk, uint32_t num_lwes,
+    int8_t *mem_ptr);
+
+void cuda_compression_decompress_integer_radix_ciphertext_64(
+    void **streams, uint32_t *gpu_indexes, uint32_t gpu_count,
+    void *glwe_array_out, void *lwe_array_in, void **fp_ksk, uint32_t num_lwes,
+    int8_t *mem_ptr);
+
+void cleanup_cuda_compression_integer_radix_ciphertext_64(
+    void **streams, uint32_t *gpu_indexes, uint32_t gpu_count,
+    int8_t **mem_ptr_void);
+
 void scratch_cuda_integer_radix_bitop_kb_64(
     void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, int8_t **mem_ptr,
     uint32_t glwe_dimension, uint32_t polynomial_size,
@@ -792,6 +820,37 @@ template <typename Torus> struct int_radix_lut {
   }
 };
 
+template <typename Torus> struct int_compression {
+  COMPRESSION_MODE mode;
+  int_radix_params params;
+  uint32_t storage_log_modulus;
+  uint32_t lwe_per_glwe;
+
+  Torus *tmp_lwe_shifted;
+
+  int_compression(cudaStream_t *streams, uint32_t *gpu_indexes,
+                  uint32_t gpu_count, int_radix_params params,
+                  uint32_t num_radix_blocks, uint32_t lwe_per_glwe,
+                  uint32_t storage_log_modulus, COMPRESSION_MODE mode,
+                  bool allocate_gpu_memory) {
+    this->mode = mode;
+    this->params = params;
+    this->lwe_per_glwe = lwe_per_glwe;
+    this->storage_log_modulus = storage_log_modulus;
+
+    if (allocate_gpu_memory) {
+      int glwe_accumulator_size =
+          (params.glwe_dimension + 1) * params.polynomial_size;
+      tmp_lwe_shifted = (Torus *)cuda_malloc_async(
+          num_radix_blocks * (params.big_lwe_dimension + 1), streams[0],
+          gpu_indexes[0]);
+    }
+  }
+  void release(cudaStream_t *streams, uint32_t *gpu_indexes,
+               uint32_t gpu_count) {
+    cuda_drop_async(tmp_lwe_shifted, streams[0], gpu_indexes[0]);
+  }
+};
 template <typename Torus> struct int_bit_extract_luts_buffer {
   int_radix_params params;
   int_radix_lut<Torus> *lut;

diff --git a/backends/tfhe-cuda-backend/cuda/include/keyswitch.h b/backends/tfhe-cuda-backend/cuda/include/keyswitch.h
@@ -16,6 +16,20 @@ void cuda_keyswitch_lwe_ciphertext_vector_64(
     void *lwe_output_indexes, void *lwe_array_in, void *lwe_input_indexes,
     void *ksk, uint32_t lwe_dimension_in, uint32_t lwe_dimension_out,
     uint32_t base_log, uint32_t level_count, uint32_t num_samples);
+
+void cuda_fp_keyswitch_lwe_to_glwe_64(void *v_stream, uint32_t gpu_index,
+                                      void *glwe_array_out, void *lwe_array_in,
+                                      void *fp_ksk_array,
+                                      uint32_t input_lwe_dimension,
+                                      uint32_t output_glwe_dimension,
+                                      uint32_t output_polynomial_size,
+                                      uint32_t base_log, uint32_t level_count);
+
+void cuda_fp_keyswitch_lwe_list_to_glwe_64(
+    void *stream, uint32_t gpu_index, void *glwe_array_out, void *lwe_array_in,
+    void *fp_ksk_array, uint32_t input_lwe_dimension,
+    uint32_t output_glwe_dimension, uint32_t output_polynomial_size,
+    uint32_t base_log, uint32_t level_count, uint32_t num_lwes);
 }
 
 #endif // CNCRT_KS_H_
diff --git a/backends/tfhe-cuda-backend/cuda/src/CMakeLists.txt b/backends/tfhe-cuda-backend/cuda/src/CMakeLists.txt
@@ -1,17 +1,3 @@
-set(SOURCES
-    ${CMAKE_SOURCE_DIR}/${INCLUDE_DIR}/bit_extraction.h
-    ${CMAKE_SOURCE_DIR}/${INCLUDE_DIR}/bitwise_ops.h
-    ${CMAKE_SOURCE_DIR}/${INCLUDE_DIR}/bootstrap.h
-    ${CMAKE_SOURCE_DIR}/${INCLUDE_DIR}/bootstrap_multibit.h
-    ${CMAKE_SOURCE_DIR}/${INCLUDE_DIR}/ciphertext.h
-    ${CMAKE_SOURCE_DIR}/${INCLUDE_DIR}/circuit_bootstrap.h
-    ${CMAKE_SOURCE_DIR}/${INCLUDE_DIR}/device.h
-    ${CMAKE_SOURCE_DIR}/${INCLUDE_DIR}/integer.h
-    ${CMAKE_SOURCE_DIR}/${INCLUDE_DIR}/keyswitch.h
-    ${CMAKE_SOURCE_DIR}/${INCLUDE_DIR}/linear_algebra.h
-    ${CMAKE_SOURCE_DIR}/${INCLUDE_DIR}/shifts.h
-    ${CMAKE_SOURCE_DIR}/${INCLUDE_DIR}/vertical_packing.h
-    ${CMAKE_SOURCE_DIR}/${INCLUDE_DIR}/helper_multi_gpu.h)
 file(GLOB_RECURSE SOURCES "*.cu")
 add_library(tfhe_cuda_backend STATIC ${SOURCES})
 set_target_properties(tfhe_cuda_backend PROPERTIES CUDA_SEPARABLE_COMPILATION ON CUDA_RESOLVE_DEVICE_SYMBOLS ON)

diff --git a/backends/tfhe-cuda-backend/cuda/src/crypto/keyswitch.cu b/backends/tfhe-cuda-backend/cuda/src/crypto/keyswitch.cu
@@ -10,7 +10,7 @@ void cuda_keyswitch_lwe_ciphertext_vector_32(
     void *lwe_output_indexes, void *lwe_array_in, void *lwe_input_indexes,
     void *ksk, uint32_t lwe_dimension_in, uint32_t lwe_dimension_out,
     uint32_t base_log, uint32_t level_count, uint32_t num_samples) {
-  cuda_keyswitch_lwe_ciphertext_vector(
+  host_keyswitch_lwe_ciphertext_vector(
       static_cast<cudaStream_t>(stream), gpu_index,
       static_cast<uint32_t *>(lwe_array_out),
       static_cast<uint32_t *>(lwe_output_indexes),
@@ -40,11 +40,63 @@ void cuda_keyswitch_lwe_ciphertext_vector_64(
     void *lwe_output_indexes, void *lwe_array_in, void *lwe_input_indexes,
     void *ksk, uint32_t lwe_dimension_in, uint32_t lwe_dimension_out,
     uint32_t base_log, uint32_t level_count, uint32_t num_samples) {
-  cuda_keyswitch_lwe_ciphertext_vector(
+  host_keyswitch_lwe_ciphertext_vector(
       static_cast<cudaStream_t>(stream), gpu_index,
       static_cast<uint64_t *>(lwe_array_out),
       static_cast<uint64_t *>(lwe_output_indexes),
       static_cast<uint64_t *>(lwe_array_in),
       static_cast<uint64_t *>(lwe_input_indexes), static_cast<uint64_t *>(ksk),
       lwe_dimension_in, lwe_dimension_out, base_log, level_count, num_samples);
 }
+
+/* Perform functional packing keyswitch on a batch of 64 bits input LWE
+ * ciphertexts.
+ *
+ * - `v_stream` is a void pointer to the Cuda stream to be used in the kernel
+ * launch
+ * - `gpu_index` is the index of the GPU to be used in the kernel launch
+ * - `glwe_array_out`: output batch of keyswitched ciphertexts
+ * - `lwe_array_in`: input batch of num_samples LWE ciphertexts, containing
+ * lwe_dimension_in mask values + 1 body value
+ *  - `fp_ksk_array`: the functional packing keyswitch keys to be used in the
+ * operation
+ *  - `base log`: the log of the base used in the decomposition (should be the
+ * one used to create the ksk)
+ *  - `level_count`: the number of levels used in the decomposition (should be
+ * the one used to  create the fp_ksks).
+ *  - `number_of_input_lwe`: the number of inputs
+ *  - `number_of_keys`: the number of fp_ksks
+ *
+ * This function calls a wrapper to a device kernel that performs the functional
+ * packing keyswitch.
+ */
+void cuda_fp_keyswitch_lwe_to_glwe_64(void *stream, uint32_t gpu_index,
+                                      void *glwe_array_out, void *lwe_array_in,
+                                      void *fp_ksk_array,
+                                      uint32_t input_lwe_dimension,
+                                      uint32_t output_glwe_dimension,
+                                      uint32_t output_polynomial_size,
+                                      uint32_t base_log, uint32_t level_count) {
+
+  host_fp_keyswitch_lwe_to_glwe(static_cast<cudaStream_t>(stream), gpu_index,
+                                static_cast<uint64_t *>(glwe_array_out),
+                                static_cast<uint64_t *>(lwe_array_in),
+                                static_cast<uint64_t *>(fp_ksk_array),
+                                input_lwe_dimension, output_glwe_dimension,
+                                output_polynomial_size, base_log, level_count);
+}
+
+void cuda_fp_keyswitch_lwe_list_to_glwe_64(
+    void *stream, uint32_t gpu_index, void *glwe_array_out, void *lwe_array_in,
+    void *fp_ksk_array, uint32_t input_lwe_dimension,
+    uint32_t output_glwe_dimension, uint32_t output_polynomial_size,
+    uint32_t base_log, uint32_t level_count, uint32_t num_lwes) {
+
+  host_fp_keyswitch_lwe_list_to_glwe(
+      static_cast<cudaStream_t>(stream), gpu_index,
+      static_cast<uint64_t *>(glwe_array_out),
+      static_cast<uint64_t *>(lwe_array_in),
+      static_cast<uint64_t *>(fp_ksk_array), input_lwe_dimension,
+      output_glwe_dimension, output_polynomial_size, base_log, level_count,
+      num_lwes);
+}