Skip to content

Commit

Permalink
feat(gpu): implement CompressedCudaCiphertextList, and public functio…
Browse files Browse the repository at this point in the history
…nal packing keyswitch
  • Loading branch information
pdroalves committed Jul 25, 2024
1 parent d3ea654 commit 9c29d8d
Show file tree
Hide file tree
Showing 33 changed files with 1,873 additions and 84 deletions.
59 changes: 59 additions & 0 deletions backends/tfhe-cuda-backend/cuda/include/integer.h
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,11 @@ enum COMPARISON_TYPE {
MIN = 7,
};

// Mode tag accepted by scratch_cuda_compression_integer_radix_ciphertext_64
// and stored in int_compression::mode. Values are explicit because the enum
// is part of the C-facing header.
enum COMPRESSION_MODE {
COMPRESS = 0,
DECOMPRESS = 1,
};

// Three distinct ordering codes (0, 1, 2); presumably the outcome of a
// comparison between two values — confirm against the comparison code.
enum CMP_ORDERING { IS_INFERIOR = 0, IS_EQUAL = 1, IS_SUPERIOR = 2 };

// Operation tag whose numeric value is +1 for ADDITION and -1 for
// SUBTRACTION.
enum SIGNED_OPERATION { ADDITION = 1, SUBTRACTION = -1 };
Expand Down Expand Up @@ -203,6 +208,29 @@ void cuda_scalar_comparison_integer_radix_ciphertext_kb_64(
// Cleanup entry point for the integer comparison scratch state.
// `mem_ptr_void` is a pointer to the opaque scratch handle; presumably the
// handle is released on the given streams/GPUs — implementation not visible
// here, confirm.
void cleanup_cuda_integer_comparison(void **streams, uint32_t *gpu_indexes,
uint32_t gpu_count, int8_t **mem_ptr_void);

// Scratch (setup) entry point for compression of 64-bit integer radix
// ciphertexts.
//
// - `streams`, `gpu_indexes`, `gpu_count`: CUDA streams and the GPUs they
//   belong to.
// - `mem_ptr`: out-parameter receiving the opaque scratch handle; presumably
//   an int_compression<uint64_t> — confirm against the implementation.
// - `glwe_dimension` ... `carry_modulus`, `pbs_type`: cryptographic
//   parameters forwarded to the scratch state.
// - `lwe_per_glwe`, `storage_log_modulus`, `mode`: compression-specific
//   settings (see int_compression, which stores fields of the same names).
// - `allocate_gpu_memory`: when false, int_compression performs no device
//   allocation.
void scratch_cuda_compression_integer_radix_ciphertext_64(
void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, int8_t **mem_ptr,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t lwe_dimension,
uint32_t ks_level, uint32_t ks_base_log, uint32_t pbs_level,
uint32_t pbs_base_log, uint32_t grouping_factor, uint32_t num_lwes,
uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
uint32_t lwe_per_glwe, uint32_t storage_log_modulus, COMPRESSION_MODE mode,
bool allocate_gpu_memory);

// Compression entry point for 64-bit integer radix ciphertexts.
//
// - `glwe_array_out`: output buffer (GLWE, going by the name).
// - `lwe_array_in`: input buffer of `num_lwes` LWE ciphertexts (going by the
//   names).
// - `fp_ksk`: array of functional packing keyswitch key pointers; presumably
//   one per GPU — confirm.
// - `mem_ptr`: opaque scratch handle; presumably the one produced by
//   scratch_cuda_compression_integer_radix_ciphertext_64 — confirm.
void cuda_compression_compress_integer_radix_ciphertext_64(
void **streams, uint32_t *gpu_indexes, uint32_t gpu_count,
void *glwe_array_out, void *lwe_array_in, void **fp_ksk, uint32_t num_lwes,
int8_t *mem_ptr);

// Decompression entry point for 64-bit integer radix ciphertexts.
//
// NOTE(review): the parameter names (`glwe_array_out`, `lwe_array_in`,
// `fp_ksk`) are identical to the compress entry point. For a decompression
// one would expect a GLWE input and an LWE output — confirm against the
// implementation and rename if this is a copy-paste leftover.
//
// - `mem_ptr`: opaque scratch handle; presumably the one produced by
//   scratch_cuda_compression_integer_radix_ciphertext_64 — confirm.
void cuda_compression_decompress_integer_radix_ciphertext_64(
void **streams, uint32_t *gpu_indexes, uint32_t gpu_count,
void *glwe_array_out, void *lwe_array_in, void **fp_ksk, uint32_t num_lwes,
int8_t *mem_ptr);

// Cleanup entry point for the compression scratch state.
// `mem_ptr_void` is a pointer to the opaque scratch handle; presumably it
// ends up calling int_compression::release — implementation not visible
// here, confirm.
void cleanup_cuda_compression_integer_radix_ciphertext_64(
void **streams, uint32_t *gpu_indexes, uint32_t gpu_count,
int8_t **mem_ptr_void);

void scratch_cuda_integer_radix_bitop_kb_64(
void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, int8_t **mem_ptr,
uint32_t glwe_dimension, uint32_t polynomial_size,
Expand Down Expand Up @@ -792,6 +820,37 @@ template <typename Torus> struct int_radix_lut {
}
};

/* Scratch state for compressing / decompressing integer radix ciphertexts.
 *
 * Holds the parameters chosen at scratch time plus one temporary device
 * buffer, `tmp_lwe_shifted`, sized for `num_radix_blocks` LWE ciphertexts of
 * dimension `params.big_lwe_dimension` (mask + 1 body element each).
 *
 * The buffer is only allocated when `allocate_gpu_memory` is true; otherwise
 * `tmp_lwe_shifted` stays nullptr and `release` is a no-op.
 */
template <typename Torus> struct int_compression {
  COMPRESSION_MODE mode;
  int_radix_params params;
  uint32_t storage_log_modulus;
  uint32_t lwe_per_glwe;

  // Device buffer owned by this struct; nullptr when nothing was allocated,
  // so that release() never hands an indeterminate pointer to the driver.
  Torus *tmp_lwe_shifted = nullptr;

  /* - `streams` / `gpu_indexes`: only entry 0 is used for the allocation.
   * - `gpu_count`: currently unused by this struct.
   * - `params`: radix parameters; `big_lwe_dimension` fixes the LWE size.
   * - `num_radix_blocks`: number of LWE ciphertexts the buffer must hold.
   * - `lwe_per_glwe`, `storage_log_modulus`, `mode`: stored as-is.
   * - `allocate_gpu_memory`: when false no device memory is requested.
   */
  int_compression(cudaStream_t *streams, uint32_t *gpu_indexes,
                  uint32_t gpu_count, int_radix_params params,
                  uint32_t num_radix_blocks, uint32_t lwe_per_glwe,
                  uint32_t storage_log_modulus, COMPRESSION_MODE mode,
                  bool allocate_gpu_memory) {
    this->mode = mode;
    this->params = params;
    this->lwe_per_glwe = lwe_per_glwe;
    this->storage_log_modulus = storage_log_modulus;

    if (allocate_gpu_memory) {
      // Element count is computed in 64 bits so that large block counts
      // cannot wrap around in 32-bit arithmetic.
      uint64_t lwe_elements = static_cast<uint64_t>(num_radix_blocks) *
                              (params.big_lwe_dimension + 1);
      // The allocator expects a size in BYTES, hence the sizeof(Torus)
      // factor; passing the bare element count under-allocates the buffer.
      tmp_lwe_shifted = (Torus *)cuda_malloc_async(
          lwe_elements * sizeof(Torus), streams[0], gpu_indexes[0]);
    }
  }

  /* Frees the temporary buffer on streams[0] / gpu_indexes[0]. Safe to call
   * when no memory was allocated, and safe to call more than once.
   */
  void release(cudaStream_t *streams, uint32_t *gpu_indexes,
               uint32_t gpu_count) {
    if (tmp_lwe_shifted != nullptr) {
      cuda_drop_async(tmp_lwe_shifted, streams[0], gpu_indexes[0]);
      tmp_lwe_shifted = nullptr;
    }
  }
};
template <typename Torus> struct int_bit_extract_luts_buffer {
int_radix_params params;
int_radix_lut<Torus> *lut;
Expand Down
14 changes: 14 additions & 0 deletions backends/tfhe-cuda-backend/cuda/include/keyswitch.h
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,20 @@ void cuda_keyswitch_lwe_ciphertext_vector_64(
void *lwe_output_indexes, void *lwe_array_in, void *lwe_input_indexes,
void *ksk, uint32_t lwe_dimension_in, uint32_t lwe_dimension_out,
uint32_t base_log, uint32_t level_count, uint32_t num_samples);

// Functional packing keyswitch, 64-bit torus, LWE -> GLWE.
//
// - `v_stream`: opaque CUDA stream handle (cast to cudaStream_t by the
//   definition). NOTE(review): the definition in keyswitch.cu names this
//   parameter `stream`; consider aligning the two.
// - `gpu_index`: GPU on which the operation is launched.
// - `glwe_array_out` / `lwe_array_in` / `fp_ksk_array`: opaque pointers,
//   reinterpreted as uint64_t* by the definition.
// - `input_lwe_dimension`, `output_glwe_dimension`,
//   `output_polynomial_size`: sizes of the input and output ciphertexts.
// - `base_log`, `level_count`: decomposition parameters of the key.
void cuda_fp_keyswitch_lwe_to_glwe_64(void *v_stream, uint32_t gpu_index,
void *glwe_array_out, void *lwe_array_in,
void *fp_ksk_array,
uint32_t input_lwe_dimension,
uint32_t output_glwe_dimension,
uint32_t output_polynomial_size,
uint32_t base_log, uint32_t level_count);

// Functional packing keyswitch of a list of `num_lwes` 64-bit LWE
// ciphertexts into GLWE output.
//
// Same parameters as cuda_fp_keyswitch_lwe_to_glwe_64, plus:
// - `num_lwes`: number of input LWE ciphertexts in `lwe_array_in`.
void cuda_fp_keyswitch_lwe_list_to_glwe_64(
void *stream, uint32_t gpu_index, void *glwe_array_out, void *lwe_array_in,
void *fp_ksk_array, uint32_t input_lwe_dimension,
uint32_t output_glwe_dimension, uint32_t output_polynomial_size,
uint32_t base_log, uint32_t level_count, uint32_t num_lwes);
}

#endif // CNCRT_KS_H_
14 changes: 0 additions & 14 deletions backends/tfhe-cuda-backend/cuda/src/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -1,17 +1,3 @@
set(SOURCES
${CMAKE_SOURCE_DIR}/${INCLUDE_DIR}/bit_extraction.h
${CMAKE_SOURCE_DIR}/${INCLUDE_DIR}/bitwise_ops.h
${CMAKE_SOURCE_DIR}/${INCLUDE_DIR}/bootstrap.h
${CMAKE_SOURCE_DIR}/${INCLUDE_DIR}/bootstrap_multibit.h
${CMAKE_SOURCE_DIR}/${INCLUDE_DIR}/ciphertext.h
${CMAKE_SOURCE_DIR}/${INCLUDE_DIR}/circuit_bootstrap.h
${CMAKE_SOURCE_DIR}/${INCLUDE_DIR}/device.h
${CMAKE_SOURCE_DIR}/${INCLUDE_DIR}/integer.h
${CMAKE_SOURCE_DIR}/${INCLUDE_DIR}/keyswitch.h
${CMAKE_SOURCE_DIR}/${INCLUDE_DIR}/linear_algebra.h
${CMAKE_SOURCE_DIR}/${INCLUDE_DIR}/shifts.h
${CMAKE_SOURCE_DIR}/${INCLUDE_DIR}/vertical_packing.h
${CMAKE_SOURCE_DIR}/${INCLUDE_DIR}/helper_multi_gpu.h)
file(GLOB_RECURSE SOURCES "*.cu")
add_library(tfhe_cuda_backend STATIC ${SOURCES})
set_target_properties(tfhe_cuda_backend PROPERTIES CUDA_SEPARABLE_COMPILATION ON CUDA_RESOLVE_DEVICE_SYMBOLS ON)
Expand Down
56 changes: 54 additions & 2 deletions backends/tfhe-cuda-backend/cuda/src/crypto/keyswitch.cu
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ void cuda_keyswitch_lwe_ciphertext_vector_32(
void *lwe_output_indexes, void *lwe_array_in, void *lwe_input_indexes,
void *ksk, uint32_t lwe_dimension_in, uint32_t lwe_dimension_out,
uint32_t base_log, uint32_t level_count, uint32_t num_samples) {
cuda_keyswitch_lwe_ciphertext_vector(
host_keyswitch_lwe_ciphertext_vector(
static_cast<cudaStream_t>(stream), gpu_index,
static_cast<uint32_t *>(lwe_array_out),
static_cast<uint32_t *>(lwe_output_indexes),
Expand Down Expand Up @@ -40,11 +40,63 @@ void cuda_keyswitch_lwe_ciphertext_vector_64(
void *lwe_output_indexes, void *lwe_array_in, void *lwe_input_indexes,
void *ksk, uint32_t lwe_dimension_in, uint32_t lwe_dimension_out,
uint32_t base_log, uint32_t level_count, uint32_t num_samples) {
cuda_keyswitch_lwe_ciphertext_vector(
host_keyswitch_lwe_ciphertext_vector(
static_cast<cudaStream_t>(stream), gpu_index,
static_cast<uint64_t *>(lwe_array_out),
static_cast<uint64_t *>(lwe_output_indexes),
static_cast<uint64_t *>(lwe_array_in),
static_cast<uint64_t *>(lwe_input_indexes), static_cast<uint64_t *>(ksk),
lwe_dimension_in, lwe_dimension_out, base_log, level_count, num_samples);
}

/* Perform functional packing keyswitch on a batch of 64 bits input LWE
* ciphertexts.
*
* - `v_stream` is a void pointer to the Cuda stream to be used in the kernel
* launch
* - `gpu_index` is the index of the GPU to be used in the kernel launch
* - `glwe_array_out`: output batch of keyswitched ciphertexts
* - `lwe_array_in`: input batch of num_samples LWE ciphertexts, containing
* lwe_dimension_in mask values + 1 body value
* - `fp_ksk_array`: the functional packing keyswitch keys to be used in the
* operation
* - `base log`: the log of the base used in the decomposition (should be the
* one used to create the ksk)
* - `level_count`: the number of levels used in the decomposition (should be
* the one used to create the fp_ksks).
* - `number_of_input_lwe`: the number of inputs
* - `number_of_keys`: the number of fp_ksks
*
* This function calls a wrapper to a device kernel that performs the functional
* packing keyswitch.
*/
void cuda_fp_keyswitch_lwe_to_glwe_64(void *stream, uint32_t gpu_index,
void *glwe_array_out, void *lwe_array_in,
void *fp_ksk_array,
uint32_t input_lwe_dimension,
uint32_t output_glwe_dimension,
uint32_t output_polynomial_size,
uint32_t base_log, uint32_t level_count) {

host_fp_keyswitch_lwe_to_glwe(static_cast<cudaStream_t>(stream), gpu_index,
static_cast<uint64_t *>(glwe_array_out),
static_cast<uint64_t *>(lwe_array_in),
static_cast<uint64_t *>(fp_ksk_array),
input_lwe_dimension, output_glwe_dimension,
output_polynomial_size, base_log, level_count);
}

void cuda_fp_keyswitch_lwe_list_to_glwe_64(
void *stream, uint32_t gpu_index, void *glwe_array_out, void *lwe_array_in,
void *fp_ksk_array, uint32_t input_lwe_dimension,
uint32_t output_glwe_dimension, uint32_t output_polynomial_size,
uint32_t base_log, uint32_t level_count, uint32_t num_lwes) {

host_fp_keyswitch_lwe_list_to_glwe(
static_cast<cudaStream_t>(stream), gpu_index,
static_cast<uint64_t *>(glwe_array_out),
static_cast<uint64_t *>(lwe_array_in),
static_cast<uint64_t *>(fp_ksk_array), input_lwe_dimension,
output_glwe_dimension, output_polynomial_size, base_log, level_count,
num_lwes);
}
Loading

0 comments on commit 9c29d8d

Please sign in to comment.