From 9c29d8d216449b3fc5ced63ee22513c898f205c9 Mon Sep 17 00:00:00 2001 From: Pedro Alves Date: Wed, 17 Jul 2024 17:14:54 -0300 Subject: [PATCH] feat(gpu): implement CompressedCudaCiphertextList, and public functional packing keyswitch --- .../tfhe-cuda-backend/cuda/include/integer.h | 59 ++++ .../cuda/include/keyswitch.h | 14 + .../tfhe-cuda-backend/cuda/src/CMakeLists.txt | 14 - .../cuda/src/crypto/keyswitch.cu | 56 +++- .../cuda/src/crypto/keyswitch.cuh | 183 ++++++++++- .../cuda/src/crypto/torus.cuh | 31 +- .../src/integer/compression/compression.cu | 50 +++ .../src/integer/compression/compression.cuh | 65 ++++ .../cuda/src/linearalgebra/multiplication.cu | 24 +- .../cuda/src/linearalgebra/multiplication.cuh | 44 ++- .../pbs/programmable_bootstrap_amortized.cuh | 8 +- .../pbs/programmable_bootstrap_cg_classic.cuh | 6 +- .../programmable_bootstrap_cg_multibit.cuh | 4 +- .../pbs/programmable_bootstrap_classic.cuh | 9 +- .../pbs/programmable_bootstrap_multibit.cuh | 8 +- .../programmable_bootstrap_tbc_classic.cuh | 9 +- .../programmable_bootstrap_tbc_multibit.cuh | 4 +- .../cuda/src/polynomial/polynomial_math.cuh | 44 ++- backends/tfhe-cuda-backend/src/cuda_bind.rs | 101 +++++- .../gpu/algorithms/lwe_packing_keyswitch.rs | 62 ++++ tfhe/src/core_crypto/gpu/algorithms/mod.rs | 6 +- .../algorithms/test/lwe_packing_keyswitch.rs | 239 ++++++++++++++ .../core_crypto/gpu/algorithms/test/mod.rs | 9 + .../gpu/entities/lwe_ciphertext_list.rs | 63 +++- .../gpu/entities/lwe_packing_keyswitch_key.rs | 87 +++++ tfhe/src/core_crypto/gpu/entities/mod.rs | 1 + tfhe/src/core_crypto/gpu/mod.rs | 66 ++++ tfhe/src/integer/client_key/radix.rs | 32 ++ .../ciphertext/compressed_ciphertext_list.rs | 196 ++++++++++++ tfhe/src/integer/gpu/ciphertext/mod.rs | 1 + tfhe/src/integer/gpu/list_compression/mod.rs | 1 + .../gpu/list_compression/server_keys.rs | 300 ++++++++++++++++++ tfhe/src/integer/gpu/mod.rs | 161 +++++++++- 33 files changed, 1873 insertions(+), 84 deletions(-) create mode 100644 backends/tfhe-cuda-backend/cuda/src/integer/compression/compression.cu create mode 100644 backends/tfhe-cuda-backend/cuda/src/integer/compression/compression.cuh create mode 100644 tfhe/src/core_crypto/gpu/algorithms/lwe_packing_keyswitch.rs create mode 100644 tfhe/src/core_crypto/gpu/algorithms/test/lwe_packing_keyswitch.rs create mode 100644 tfhe/src/core_crypto/gpu/entities/lwe_packing_keyswitch_key.rs create mode 100644 tfhe/src/integer/gpu/ciphertext/compressed_ciphertext_list.rs create mode 100644 tfhe/src/integer/gpu/list_compression/mod.rs create mode 100644 tfhe/src/integer/gpu/list_compression/server_keys.rs diff --git a/backends/tfhe-cuda-backend/cuda/include/integer.h b/backends/tfhe-cuda-backend/cuda/include/integer.h index df4eb39df0..77515a641f 100644 --- a/backends/tfhe-cuda-backend/cuda/include/integer.h +++ b/backends/tfhe-cuda-backend/cuda/include/integer.h @@ -36,6 +36,11 @@ enum COMPARISON_TYPE { MIN = 7, }; +enum COMPRESSION_MODE { + COMPRESS = 0, + DECOMPRESS = 1, +}; + enum CMP_ORDERING { IS_INFERIOR = 0, IS_EQUAL = 1, IS_SUPERIOR = 2 }; enum SIGNED_OPERATION { ADDITION = 1, SUBTRACTION = -1 }; @@ -203,6 +208,29 @@ void cuda_scalar_comparison_integer_radix_ciphertext_kb_64( void cleanup_cuda_integer_comparison(void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, int8_t **mem_ptr_void); +void scratch_cuda_compression_integer_radix_ciphertext_64( + void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, int8_t **mem_ptr, + uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t lwe_dimension, + 
uint32_t ks_level, uint32_t ks_base_log, uint32_t pbs_level, + uint32_t pbs_base_log, uint32_t grouping_factor, uint32_t num_lwes, + uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type, + uint32_t lwe_per_glwe, uint32_t storage_log_modulus, COMPRESSION_MODE mode, + bool allocate_gpu_memory); + +void cuda_compression_compress_integer_radix_ciphertext_64( + void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, + void *glwe_array_out, void *lwe_array_in, void **fp_ksk, uint32_t num_lwes, + int8_t *mem_ptr); + +void cuda_compression_decompress_integer_radix_ciphertext_64( + void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, + void *glwe_array_out, void *lwe_array_in, void **fp_ksk, uint32_t num_lwes, + int8_t *mem_ptr); + +void cleanup_cuda_compression_integer_radix_ciphertext_64( + void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, + int8_t **mem_ptr_void); + void scratch_cuda_integer_radix_bitop_kb_64( void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, int8_t **mem_ptr, uint32_t glwe_dimension, uint32_t polynomial_size, @@ -792,6 +820,37 @@ template struct int_radix_lut { } }; +template struct int_compression { + COMPRESSION_MODE mode; + int_radix_params params; + uint32_t storage_log_modulus; + uint32_t lwe_per_glwe; + + Torus *tmp_lwe_shifted; + + int_compression(cudaStream_t *streams, uint32_t *gpu_indexes, + uint32_t gpu_count, int_radix_params params, + uint32_t num_radix_blocks, uint32_t lwe_per_glwe, + uint32_t storage_log_modulus, COMPRESSION_MODE mode, + bool allocate_gpu_memory) { + this->mode = mode; + this->params = params; + this->lwe_per_glwe = lwe_per_glwe; + this->storage_log_modulus = storage_log_modulus; + + if (allocate_gpu_memory) { + int glwe_accumulator_size = + (params.glwe_dimension + 1) * params.polynomial_size; + tmp_lwe_shifted = (Torus *)cuda_malloc_async( + num_radix_blocks * (params.big_lwe_dimension + 1), streams[0], + gpu_indexes[0]); + } + } + void release(cudaStream_t *streams, uint32_t *gpu_indexes, + uint32_t gpu_count) { + cuda_drop_async(tmp_lwe_shifted, streams[0], gpu_indexes[0]); + } +}; template struct int_bit_extract_luts_buffer { int_radix_params params; int_radix_lut *lut; diff --git a/backends/tfhe-cuda-backend/cuda/include/keyswitch.h b/backends/tfhe-cuda-backend/cuda/include/keyswitch.h index 924ec131f9..08edc2b8e5 100644 --- a/backends/tfhe-cuda-backend/cuda/include/keyswitch.h +++ b/backends/tfhe-cuda-backend/cuda/include/keyswitch.h @@ -16,6 +16,20 @@ void cuda_keyswitch_lwe_ciphertext_vector_64( void *lwe_output_indexes, void *lwe_array_in, void *lwe_input_indexes, void *ksk, uint32_t lwe_dimension_in, uint32_t lwe_dimension_out, uint32_t base_log, uint32_t level_count, uint32_t num_samples); + +void cuda_fp_keyswitch_lwe_to_glwe_64(void *v_stream, uint32_t gpu_index, + void *glwe_array_out, void *lwe_array_in, + void *fp_ksk_array, + uint32_t input_lwe_dimension, + uint32_t output_glwe_dimension, + uint32_t output_polynomial_size, + uint32_t base_log, uint32_t level_count); + +void cuda_fp_keyswitch_lwe_list_to_glwe_64( + void *stream, uint32_t gpu_index, void *glwe_array_out, void *lwe_array_in, + void *fp_ksk_array, uint32_t input_lwe_dimension, + uint32_t output_glwe_dimension, uint32_t output_polynomial_size, + uint32_t base_log, uint32_t level_count, uint32_t num_lwes); } #endif // CNCRT_KS_H_ diff --git a/backends/tfhe-cuda-backend/cuda/src/CMakeLists.txt b/backends/tfhe-cuda-backend/cuda/src/CMakeLists.txt index 3190501fc5..5fec699fbc 100644 --- 
a/backends/tfhe-cuda-backend/cuda/src/CMakeLists.txt
+++ b/backends/tfhe-cuda-backend/cuda/src/CMakeLists.txt
@@ -1,17 +1,3 @@
-set(SOURCES
-    ${CMAKE_SOURCE_DIR}/${INCLUDE_DIR}/bit_extraction.h
-    ${CMAKE_SOURCE_DIR}/${INCLUDE_DIR}/bitwise_ops.h
-    ${CMAKE_SOURCE_DIR}/${INCLUDE_DIR}/bootstrap.h
-    ${CMAKE_SOURCE_DIR}/${INCLUDE_DIR}/bootstrap_multibit.h
-    ${CMAKE_SOURCE_DIR}/${INCLUDE_DIR}/ciphertext.h
-    ${CMAKE_SOURCE_DIR}/${INCLUDE_DIR}/circuit_bootstrap.h
-    ${CMAKE_SOURCE_DIR}/${INCLUDE_DIR}/device.h
-    ${CMAKE_SOURCE_DIR}/${INCLUDE_DIR}/integer.h
-    ${CMAKE_SOURCE_DIR}/${INCLUDE_DIR}/keyswitch.h
-    ${CMAKE_SOURCE_DIR}/${INCLUDE_DIR}/linear_algebra.h
-    ${CMAKE_SOURCE_DIR}/${INCLUDE_DIR}/shifts.h
-    ${CMAKE_SOURCE_DIR}/${INCLUDE_DIR}/vertical_packing.h
-    ${CMAKE_SOURCE_DIR}/${INCLUDE_DIR}/helper_multi_gpu.h)
 file(GLOB_RECURSE SOURCES "*.cu")
 add_library(tfhe_cuda_backend STATIC ${SOURCES})
 set_target_properties(tfhe_cuda_backend PROPERTIES CUDA_SEPARABLE_COMPILATION ON
                                                    CUDA_RESOLVE_DEVICE_SYMBOLS ON)
diff --git a/backends/tfhe-cuda-backend/cuda/src/crypto/keyswitch.cu b/backends/tfhe-cuda-backend/cuda/src/crypto/keyswitch.cu
index 524a1fa45e..5d890a6a6e 100644
--- a/backends/tfhe-cuda-backend/cuda/src/crypto/keyswitch.cu
+++ b/backends/tfhe-cuda-backend/cuda/src/crypto/keyswitch.cu
@@ -10,7 +10,7 @@ void cuda_keyswitch_lwe_ciphertext_vector_32(
     void *lwe_output_indexes, void *lwe_array_in, void *lwe_input_indexes,
     void *ksk, uint32_t lwe_dimension_in, uint32_t lwe_dimension_out,
     uint32_t base_log, uint32_t level_count, uint32_t num_samples) {
-  cuda_keyswitch_lwe_ciphertext_vector(
+  host_keyswitch_lwe_ciphertext_vector(
      static_cast<cudaStream_t>(stream), gpu_index,
      static_cast<uint32_t *>(lwe_array_out),
      static_cast<uint32_t *>(lwe_output_indexes),
@@ -40,7 +40,7 @@ void cuda_keyswitch_lwe_ciphertext_vector_64(
     void *lwe_output_indexes, void *lwe_array_in, void *lwe_input_indexes,
     void *ksk, uint32_t lwe_dimension_in, uint32_t lwe_dimension_out,
     uint32_t base_log, uint32_t level_count, uint32_t num_samples) {
-  cuda_keyswitch_lwe_ciphertext_vector(
+  host_keyswitch_lwe_ciphertext_vector(
      static_cast<cudaStream_t>(stream), gpu_index,
      static_cast<uint64_t *>(lwe_array_out),
      static_cast<uint64_t *>(lwe_output_indexes),
@@ -48,3 +48,55 @@ void cuda_keyswitch_lwe_ciphertext_vector_64(
      static_cast<uint64_t *>(lwe_input_indexes), static_cast<uint64_t *>(ksk),
      lwe_dimension_in, lwe_dimension_out, base_log, level_count, num_samples);
 }
+
+/* Perform a functional packing keyswitch on a 64-bit input LWE ciphertext.
+ *
+ * - `stream` is a void pointer to the Cuda stream to be used in the kernel
+ * launch
+ * - `gpu_index` is the index of the GPU to be used in the kernel launch
+ * - `glwe_array_out`: output keyswitched GLWE ciphertext
+ * - `lwe_array_in`: input LWE ciphertext, containing `input_lwe_dimension`
+ * mask values + 1 body value
+ * - `fp_ksk_array`: the functional packing keyswitch key to be used in the
+ * operation
+ * - `input_lwe_dimension`: the dimension of the input LWE ciphertext
+ * - `output_glwe_dimension`: the dimension of the output GLWE ciphertext
+ * - `output_polynomial_size`: the polynomial size of the output GLWE
+ * ciphertext
+ * - `base_log`: the log of the base used in the decomposition (should be the
+ * one used to create the fp_ksk)
+ * - `level_count`: the number of levels used in the decomposition (should be
+ * the one used to create the fp_ksk)
+ *
+ * This function calls a wrapper to a device kernel that performs the
+ * functional packing keyswitch.
+ */
+void cuda_fp_keyswitch_lwe_to_glwe_64(void *stream, uint32_t gpu_index,
+                                      void *glwe_array_out, void *lwe_array_in,
+                                      void *fp_ksk_array,
+                                      uint32_t input_lwe_dimension,
+                                      uint32_t output_glwe_dimension,
+                                      uint32_t output_polynomial_size,
+                                      uint32_t base_log, uint32_t level_count) {
+
+  host_fp_keyswitch_lwe_to_glwe(static_cast<cudaStream_t>(stream), gpu_index,
+                                static_cast<uint64_t *>(glwe_array_out),
+                                static_cast<uint64_t *>(lwe_array_in),
+                                static_cast<uint64_t *>(fp_ksk_array),
+                                input_lwe_dimension, output_glwe_dimension,
+                                output_polynomial_size, base_log, level_count);
+}
+
+void cuda_fp_keyswitch_lwe_list_to_glwe_64(
+    void *stream, uint32_t gpu_index, void *glwe_array_out, void *lwe_array_in,
+    void *fp_ksk_array, uint32_t input_lwe_dimension,
+    uint32_t output_glwe_dimension, uint32_t output_polynomial_size,
+    uint32_t base_log, uint32_t level_count, uint32_t num_lwes) {
+
+  host_fp_keyswitch_lwe_list_to_glwe(
+      static_cast<cudaStream_t>(stream), gpu_index,
+      static_cast<uint64_t *>(glwe_array_out),
+      static_cast<uint64_t *>(lwe_array_in),
+      static_cast<uint64_t *>(fp_ksk_array), input_lwe_dimension,
+      output_glwe_dimension, output_polynomial_size, base_log, level_count,
+      num_lwes);
+}
diff --git a/backends/tfhe-cuda-backend/cuda/src/crypto/keyswitch.cuh b/backends/tfhe-cuda-backend/cuda/src/crypto/keyswitch.cuh
index db78104a98..53f4d667b0 100644
--- a/backends/tfhe-cuda-backend/cuda/src/crypto/keyswitch.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/crypto/keyswitch.cuh
@@ -7,6 +7,7 @@
 #include "polynomial/functions.cuh"
 #include "polynomial/polynomial_math.cuh"
 #include "torus.cuh"
+#include "utils/helper.cuh"
 #include "utils/kernel_dimensions.cuh"
 #include
 #include
@@ -98,7 +99,7 @@ keyswitch(Torus *lwe_array_out, const Torus *__restrict__ lwe_output_indexes,
 }
 
 template <typename Torus>
-__host__ void cuda_keyswitch_lwe_ciphertext_vector(
+__host__ void host_keyswitch_lwe_ciphertext_vector(
     cudaStream_t stream, uint32_t gpu_index, Torus *lwe_array_out,
     Torus *lwe_output_indexes, Torus *lwe_array_in, Torus *lwe_input_indexes,
     Torus *ksk, uint32_t lwe_dimension_in, uint32_t lwe_dimension_out,
@@ -146,7 +147,7 @@ void execute_keyswitch_async(cudaStream_t *streams, uint32_t *gpu_indexes,
         GET_VARIANT_ELEMENT(lwe_input_indexes, i);
 
     // Compute Keyswitch
-    cuda_keyswitch_lwe_ciphertext_vector(
+    host_keyswitch_lwe_ciphertext_vector(
         streams[i], gpu_indexes[i], current_lwe_array_out,
         current_lwe_output_indexes, current_lwe_array_in,
         current_lwe_input_indexes, ksks[i], lwe_dimension_in, lwe_dimension_out,
@@ -154,4 +155,182 @@ void execute_keyswitch_async(cudaStream_t *streams, uint32_t *gpu_indexes,
   }
 }
 
+// chunk_count = glwe_size * polynomial_size / threads.
+// each thread is responsible for processing only lwe_size multiplications
+template <typename Torus>
+__device__ void keyswitch_lwe_ciphertext_into_glwe_ciphertext(
+    Torus *glwe_out, Torus *lwe_in, Torus *fp_ksk, uint32_t lwe_dimension_in,
+    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log,
+    uint32_t level_count) {
+
+  extern __shared__ int8_t sharedmem[];
+
+  // result accumulator, shared memory is used because of frequent access
+  Torus *local_glwe_chunk = (Torus *)sharedmem;
+
+  const int tid = threadIdx.x + blockIdx.x * blockDim.x;
+  const int shmem_index = threadIdx.x;
+  // the output_glwe is split in chunks and each x-block takes one of them
+  size_t chunk_id = blockIdx.x;
+  size_t coef_per_block = blockDim.x;
+
+  // dimensions
+  size_t glwe_size = (glwe_dimension + 1);
+  size_t lwe_size = (lwe_dimension_in + 1);
+  size_t ksk_size = lwe_size * level_count * glwe_size * polynomial_size;
+  // number of coefficients inside fp-ksk block for each lwe_input coefficient
+  size_t ksk_block_size = glwe_size * polynomial_size * level_count;
+
+  // initialize accumulator to 0
+  local_glwe_chunk[shmem_index] =
+      SEL(0, lwe_in[lwe_dimension_in], tid == glwe_dimension * polynomial_size);
+
+  // Iterate through all lwe elements
+  for (int i = 0; i < lwe_dimension_in; i++) {
+    // Round and prepare decomposition
+    Torus a_i = round_to_closest_multiple(lwe_in[i], base_log, level_count);
+
+    Torus state = a_i >> (sizeof(Torus) * 8 - base_log * level_count);
+    Torus mod_b_mask = (1ll << base_log) - 1ll;
+
+    // block of key for current lwe coefficient (cur_input_lwe[i])
+    auto ksk_block = &fp_ksk[i * ksk_block_size];
+    for (int j = 0; j < level_count; j++) {
+      auto ksk_glwe = &ksk_block[j * glwe_size * polynomial_size];
+      // Iterate through each level and multiply by the ksk piece
+      auto ksk_glwe_chunk = &ksk_glwe[chunk_id * coef_per_block];
+      Torus decomposed = decompose_one(state, mod_b_mask, base_log);
+      local_glwe_chunk[shmem_index] -= decomposed * ksk_glwe_chunk[shmem_index];
+    }
+  }
+
+  // Persist
+  glwe_out[tid] = local_glwe_chunk[shmem_index];
+}
+
+// public functional packing keyswitch
+//
+// chunk_count = glwe_size * polynomial_size / threads.
+template <typename Torus>
+__global__ void
+fp_keyswitch_lwe_to_glwe(Torus *glwe_out, Torus *lwe_in, Torus *fp_ksk,
+                         uint32_t lwe_dimension_in, uint32_t glwe_dimension,
+                         uint32_t polynomial_size, uint32_t base_log,
+                         uint32_t level_count) {
+
+  keyswitch_lwe_ciphertext_into_glwe_ciphertext(
+      glwe_out, lwe_in, fp_ksk, lwe_dimension_in, glwe_dimension,
+      polynomial_size, base_log, level_count);
+}
+
+template <typename Torus>
+__host__ void
+host_fp_keyswitch_lwe_to_glwe(cudaStream_t stream, uint32_t gpu_index,
+                              Torus *glwe_array_out, Torus *lwe_array_in,
+                              Torus *fp_ksk_array, uint32_t lwe_dimension_in,
+                              uint32_t glwe_dimension, uint32_t polynomial_size,
+                              uint32_t base_log, uint32_t level_count) {
+  cudaSetDevice(gpu_index);
+  int glwe_accumulator_size = (glwe_dimension + 1) * polynomial_size;
+
+  int num_threads = 0, num_blocks = 0;
+  getNumBlocksAndThreads(glwe_accumulator_size, 512, num_blocks, num_threads);
+  int shared_mem = sizeof(Torus) * num_threads;
+  fp_keyswitch_lwe_to_glwe<<<num_blocks, num_threads, shared_mem, stream>>>(
+      glwe_array_out, lwe_array_in, fp_ksk_array, lwe_dimension_in,
+      glwe_dimension, polynomial_size, base_log, level_count);
+  check_cuda_error(cudaGetLastError());
+}
+
+// public functional packing keyswitch
+//
+// blockIdx.y - input
+// chunk_count = glwe_size * polynomial_size / threads.
+template <typename Torus>
+__global__ void
+fp_keyswitch_lwe_list_to_glwe(Torus *glwe_array_out, Torus *lwe_array_in,
+                              Torus *fp_ksk, uint32_t lwe_dimension_in,
+                              uint32_t glwe_dimension, uint32_t polynomial_size,
+                              uint32_t base_log, uint32_t level_count,
+                              uint32_t num_samples, Torus *d_mem) {
+  int glwe_accumulator_size = (glwe_dimension + 1) * polynomial_size;
+
+  const int input_id = threadIdx.y + blockIdx.y * blockDim.y;
+  const int degree = input_id;
+
+  // Select an input
+  auto input_d_mem = d_mem + input_id * glwe_accumulator_size;
+
+  // KS LWE to GLWE
+  auto ks_glwe_out = input_d_mem;
+  auto lwe_in = lwe_array_in + (lwe_dimension_in + 1) * input_id;
+  keyswitch_lwe_ciphertext_into_glwe_ciphertext(
+      ks_glwe_out, lwe_in, fp_ksk, lwe_dimension_in, glwe_dimension,
+      polynomial_size, base_log, level_count);
+
+  // P * X^degree
+  auto glwe_out =
+      glwe_array_out + (glwe_dimension + 1) * polynomial_size * input_id;
+  polynomial_accumulate_monic_monomial_mul_batch(
+      glwe_out, ks_glwe_out, degree, polynomial_size, glwe_dimension + 1, true);
+}
+
+template <typename Torus>
+__global__ void
+accumulate_glwes(Torus *glwe_out, Torus *glwe_array_in, uint32_t glwe_dimension,
+                 uint32_t polynomial_size, uint32_t num_samples) {
+  const int tid = threadIdx.x + blockIdx.x * blockDim.x;
+
+  extern __shared__ int8_t sharedmem[];
+  const int shmem_index = threadIdx.x;
+
+  Torus *glwe_acc = (Torus *)sharedmem;
+  glwe_acc[shmem_index] = glwe_array_in[tid];
+
+  // Accumulate
+  for (int i = 1; i < num_samples; i++) {
+    auto glwe_in = glwe_array_in + i * (glwe_dimension + 1) * polynomial_size;
+    glwe_acc[shmem_index] += glwe_in[tid];
+  }
+
+  glwe_out[tid] = glwe_acc[shmem_index];
+}
+
+template <typename Torus>
+__host__ void host_fp_keyswitch_lwe_list_to_glwe(
+    cudaStream_t stream, uint32_t gpu_index, Torus *glwe_out,
+    Torus *lwe_array_in, Torus *fp_ksk_array, uint32_t lwe_dimension_in,
+    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log,
+    uint32_t level_count, uint32_t num_samples) {
+  cudaSetDevice(gpu_index);
+  int glwe_accumulator_size = (glwe_dimension + 1) * polynomial_size;
+
+  int num_blocks = 0, num_threads_x = 0;
+  getNumBlocksAndThreads(glwe_accumulator_size, 512, num_blocks, num_threads_x);
+
+  auto shared_mem = sizeof(Torus) * num_threads_x;
+  dim3 grid(num_blocks, num_samples);
+  dim3 threads(num_threads_x);
+
+  auto d_mem = (Torus *)cuda_malloc_async(
+      num_samples * glwe_accumulator_size * sizeof(Torus), stream, gpu_index);
+  auto d_tmp_glwe_array_out = (Torus *)cuda_malloc_async(
+      num_samples * glwe_accumulator_size * sizeof(Torus), stream, gpu_index);
+
+  // individually keyswitch each lwe
+  fp_keyswitch_lwe_list_to_glwe<<<grid, threads, shared_mem, stream>>>(
+      d_tmp_glwe_array_out, lwe_array_in, fp_ksk_array, lwe_dimension_in,
+      glwe_dimension, polynomial_size, base_log, level_count, num_samples,
+      d_mem);
+  check_cuda_error(cudaGetLastError());
+  // accumulate to a single glwe
+  accumulate_glwes<<<num_blocks, num_threads_x, shared_mem, stream>>>(
+      glwe_out, d_tmp_glwe_array_out, glwe_dimension, polynomial_size,
+      num_samples);
+  check_cuda_error(cudaGetLastError());
+
+  cuda_drop_async(d_mem, stream, gpu_index);
+  cuda_drop_async(d_tmp_glwe_array_out, stream, gpu_index);
+}
+
 #endif
diff --git a/backends/tfhe-cuda-backend/cuda/src/crypto/torus.cuh b/backends/tfhe-cuda-backend/cuda/src/crypto/torus.cuh
index 9fc4ad1d8a..e5ba7e6b88 100644
--- a/backends/tfhe-cuda-backend/cuda/src/crypto/torus.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/crypto/torus.cuh
@@ -3,6 +3,7 @@
 #include "device.h"
 #include "types/int128.cuh"
+#include "utils/kernel_dimensions.cuh"
 #include
 
 template <typename T>
@@ -40,8 +41,8 @@ __device__ inline T round_to_closest_multiple(T x, uint32_t base_log,
 }
 
 template <typename T>
-__device__ __forceinline__ void modulus_switch(T input, T &output,
-                                               uint32_t log_modulus) {
+__device__ __forceinline__ void apply_modulus_switch(T input, T &output,
+                                                     uint32_t log_modulus) {
   constexpr uint32_t BITS = sizeof(T) * 8;
 
   output = input + (((T)1) << (BITS - log_modulus - 1));
@@ -49,10 +50,32 @@ __device__ __forceinline__ void modulus_switch(T input, T &output,
 }
 
 template <typename T>
-__device__ __forceinline__ T modulus_switch(T input, uint32_t log_modulus) {
+__device__ __forceinline__ T apply_modulus_switch(T input,
+                                                  uint32_t log_modulus) {
   T output;
-  modulus_switch(input, output, log_modulus);
+  apply_modulus_switch(input, output, log_modulus);
   return output;
 }
 
+template <typename Torus>
+__global__ void apply_modulus_switch_inplace(Torus *array, int size,
+                                             uint32_t log_modulus) {
+  const int tid = threadIdx.x + blockIdx.x * blockDim.x;
+  if (tid < size) {
+    array[tid] = apply_modulus_switch(array[tid], log_modulus);
+  }
+}
+
+template <typename Torus>
+__host__ void modulus_switch(cudaStream_t stream, uint32_t gpu_index,
+                             Torus *array, int size, uint32_t log_modulus) {
+
+  int num_threads = 0, num_blocks = 0;
+  getNumBlocksAndThreads(size, 1024, num_blocks, num_threads);
+
+  apply_modulus_switch_inplace<<<num_blocks, num_threads, 0, stream>>>(
+      array, size, log_modulus);
+  check_cuda_error(cudaGetLastError());
+}
+
 #endif // CNCRT_TORUS_H
diff --git a/backends/tfhe-cuda-backend/cuda/src/integer/compression/compression.cu b/backends/tfhe-cuda-backend/cuda/src/integer/compression/compression.cu
new file mode 100644
index 0000000000..d0c2dd69d9
--- /dev/null
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/compression/compression.cu
@@ -0,0 +1,50 @@
+#include "compression.cuh"
+
+void scratch_cuda_compression_integer_radix_ciphertext_64(
+    void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, int8_t **mem_ptr,
+    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t lwe_dimension,
+    uint32_t ks_level, uint32_t ks_base_log, uint32_t pbs_level,
+    uint32_t pbs_base_log, uint32_t grouping_factor, uint32_t num_lwes,
+    uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
+    uint32_t lwe_per_glwe, uint32_t storage_log_modulus, COMPRESSION_MODE mode,
+    bool allocate_gpu_memory) {
+
+  int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
+                          glwe_dimension * polynomial_size, lwe_dimension,
+                          ks_level, ks_base_log, pbs_level, pbs_base_log,
+                          grouping_factor, message_modulus, carry_modulus);
+
+  scratch_cuda_compression_integer_radix_ciphertext_64(
+      (cudaStream_t *)(streams), gpu_indexes, gpu_count,
+      (int_compression<uint64_t> **)mem_ptr, num_lwes, params, lwe_per_glwe,
+      storage_log_modulus, mode, allocate_gpu_memory);
+}
+void cuda_compression_compress_integer_radix_ciphertext_64(
+    void **streams, uint32_t *gpu_indexes, uint32_t gpu_count,
+    void *glwe_array_out, void *lwe_array_in, void **fp_ksk, uint32_t num_lwes,
+    int8_t *mem_ptr) {
+
+  host_integer_compression_compress(
+      (cudaStream_t *)(streams), gpu_indexes, gpu_count,
+      static_cast<uint64_t *>(glwe_array_out),
+      static_cast<uint64_t *>(lwe_array_in), (uint64_t **)(fp_ksk), num_lwes,
+      (int_compression<uint64_t> *)mem_ptr);
+}
+void cuda_compression_decompress_integer_radix_ciphertext_64(
+    void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, void *lwe_out,
+    void *glwe_array_in, void **bsks, int8_t *mem_ptr) {
+
+  host_integer_compression_decompress(
+      (cudaStream_t *)(streams), gpu_indexes, gpu_count,
+      static_cast<uint64_t *>(lwe_out), static_cast<uint64_t *>(glwe_array_in),
+      bsks, (int_compression<uint64_t> *)mem_ptr);
+}
+
+void 
cleanup_cuda_compression_integer_radix_ciphertext_64(
+    void **streams, uint32_t *gpu_indexes, uint32_t gpu_count,
+    int8_t **mem_ptr_void) {
+
+  int_compression<uint64_t> *mem_ptr =
+      (int_compression<uint64_t> *)(*mem_ptr_void);
+  mem_ptr->release((cudaStream_t *)(streams), gpu_indexes, gpu_count);
+}
diff --git a/backends/tfhe-cuda-backend/cuda/src/integer/compression/compression.cuh b/backends/tfhe-cuda-backend/cuda/src/integer/compression/compression.cuh
new file mode 100644
index 0000000000..a7a85b31ed
--- /dev/null
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/compression/compression.cuh
@@ -0,0 +1,65 @@
+#ifndef CUDA_INTEGER_COMPRESSION_CUH
+#define CUDA_INTEGER_COMPRESSION_CUH
+
+#include "crypto/keyswitch.cuh"
+#include "device.h"
+#include "integer.h"
+#include "integer/integer.cuh"
+#include "linearalgebra/multiplication.cuh"
+#include "polynomial/functions.cuh"
+#include "utils/kernel_dimensions.cuh"
+
+template <typename Torus>
+__host__ void host_integer_compression_compress(
+    cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
+    Torus *glwe_array_out, Torus *lwe_array_in, Torus **fp_ksk,
+    uint32_t num_lwes, int_compression<Torus> *mem_ptr) {
+  auto params = mem_ptr->params;
+
+  // Shift
+  auto lwe_shifted = mem_ptr->tmp_lwe_shifted;
+  host_cleartext_multiplication(streams[0], gpu_indexes[0], lwe_shifted,
+                                lwe_array_in, (uint64_t)params.message_modulus,
+                                params.big_lwe_dimension, num_lwes);
+
+  uint32_t lwe_in_size = params.big_lwe_dimension + 1;
+  uint32_t glwe_out_size = (params.glwe_dimension + 1) * params.polynomial_size;
+  uint32_t num_glwes = num_lwes / mem_ptr->lwe_per_glwe;
+
+  // Keyswitch LWEs to GLWE
+  for (int i = 0; i < num_glwes; i++) {
+    auto lwe_subset = lwe_shifted + i * lwe_in_size;
+    auto glwe_out = glwe_array_out + i * glwe_out_size;
+
+    host_fp_keyswitch_lwe_list_to_glwe(
+        streams[0], gpu_indexes[0], glwe_out, lwe_subset, fp_ksk[0],
+        params.big_lwe_dimension, params.glwe_dimension, params.polynomial_size,
+        params.ks_base_log, params.ks_level, mem_ptr->lwe_per_glwe);
+  }
+
+  // Modulus switch
+  int num_blocks = 0, num_threads = 0;
+  getNumBlocksAndThreads(glwe_out_size, 512, num_blocks, num_threads);
+  apply_modulus_switch_inplace<<<num_blocks, num_threads, 0, streams[0]>>>(
+      glwe_array_out, num_glwes * glwe_out_size, mem_ptr->storage_log_modulus);
+}
+
+template <typename Torus>
+__host__ void host_integer_compression_decompress(
+    cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
+    Torus *lwe_out, Torus *glwe_array_in, void **bsks,
+    int_compression<Torus> *mem_ptr) {}
+
+template <typename Torus>
+__host__ void scratch_cuda_compression_integer_radix_ciphertext_64(
+    cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
+    int_compression<Torus> **mem_ptr, uint32_t num_lwes,
+    int_radix_params params, uint32_t lwe_per_glwe,
+    uint32_t storage_log_modulus, COMPRESSION_MODE mode,
+    bool allocate_gpu_memory) {
+
+  *mem_ptr = new int_compression<Torus>(
+      streams, gpu_indexes, gpu_count, params, num_lwes, lwe_per_glwe,
+      storage_log_modulus, mode, allocate_gpu_memory);
+}
+#endif
diff --git a/backends/tfhe-cuda-backend/cuda/src/linearalgebra/multiplication.cu b/backends/tfhe-cuda-backend/cuda/src/linearalgebra/multiplication.cu
index 2a936d7cdf..a64c15378d 100644
--- a/backends/tfhe-cuda-backend/cuda/src/linearalgebra/multiplication.cu
+++ b/backends/tfhe-cuda-backend/cuda/src/linearalgebra/multiplication.cu
@@ -9,12 +9,12 @@ void cuda_mult_lwe_ciphertext_vector_cleartext_vector_32(
     void *cleartext_array_in, uint32_t input_lwe_dimension,
    uint32_t input_lwe_ciphertext_count) {
 
-  host_cleartext_multiplication(static_cast<cudaStream_t>(stream), gpu_index,
-                                
static_cast(lwe_array_out), - static_cast(lwe_array_in), - static_cast(cleartext_array_in), - input_lwe_dimension, - input_lwe_ciphertext_count); + host_cleartext_vec_multiplication( + static_cast(stream), gpu_index, + static_cast(lwe_array_out), + static_cast(lwe_array_in), + static_cast(cleartext_array_in), input_lwe_dimension, + input_lwe_ciphertext_count); } /* * Perform the multiplication of a u64 input LWE ciphertext vector with a u64 @@ -49,10 +49,10 @@ void cuda_mult_lwe_ciphertext_vector_cleartext_vector_64( void *cleartext_array_in, uint32_t input_lwe_dimension, uint32_t input_lwe_ciphertext_count) { - host_cleartext_multiplication(static_cast(stream), gpu_index, - static_cast(lwe_array_out), - static_cast(lwe_array_in), - static_cast(cleartext_array_in), - input_lwe_dimension, - input_lwe_ciphertext_count); + host_cleartext_vec_multiplication( + static_cast(stream), gpu_index, + static_cast(lwe_array_out), + static_cast(lwe_array_in), + static_cast(cleartext_array_in), input_lwe_dimension, + input_lwe_ciphertext_count); } diff --git a/backends/tfhe-cuda-backend/cuda/src/linearalgebra/multiplication.cuh b/backends/tfhe-cuda-backend/cuda/src/linearalgebra/multiplication.cuh index 9fd9be9419..c02e9236f1 100644 --- a/backends/tfhe-cuda-backend/cuda/src/linearalgebra/multiplication.cuh +++ b/backends/tfhe-cuda-backend/cuda/src/linearalgebra/multiplication.cuh @@ -13,9 +13,47 @@ #include #include +template +__global__ void cleartext_vec_multiplication(T *output, T *lwe_input, + T *cleartext_input, + uint32_t input_lwe_dimension, + uint32_t num_entries) { + + int tid = threadIdx.x; + int index = blockIdx.x * blockDim.x + tid; + if (index < num_entries) { + int cleartext_index = index / (input_lwe_dimension + 1); + // Here we take advantage of the wrapping behaviour of uint + output[index] = lwe_input[index] * cleartext_input[cleartext_index]; + } +} + +template +__host__ void +host_cleartext_vec_multiplication(cudaStream_t stream, uint32_t gpu_index, + T *output, T *lwe_input, T *cleartext_input, + uint32_t input_lwe_dimension, + uint32_t input_lwe_ciphertext_count) { + + cudaSetDevice(gpu_index); + // lwe_size includes the presence of the body + // whereas lwe_dimension is the number of elements in the mask + int lwe_size = input_lwe_dimension + 1; + // Create a 1-dimensional grid of threads + int num_blocks = 0, num_threads = 0; + int num_entries = input_lwe_ciphertext_count * lwe_size; + getNumBlocksAndThreads(num_entries, 512, num_blocks, num_threads); + dim3 grid(num_blocks, 1, 1); + dim3 thds(num_threads, 1, 1); + + cleartext_vec_multiplication<<>>( + output, lwe_input, cleartext_input, input_lwe_dimension, num_entries); + check_cuda_error(cudaGetLastError()); +} + template __global__ void -cleartext_multiplication(T *output, T *lwe_input, T *cleartext_input, +cleartext_multiplication(T *output, T *lwe_input, T cleartext_input, uint32_t input_lwe_dimension, uint32_t num_entries) { int tid = threadIdx.x; @@ -23,14 +61,14 @@ cleartext_multiplication(T *output, T *lwe_input, T *cleartext_input, if (index < num_entries) { int cleartext_index = index / (input_lwe_dimension + 1); // Here we take advantage of the wrapping behaviour of uint - output[index] = lwe_input[index] * cleartext_input[cleartext_index]; + output[index] = lwe_input[index] * cleartext_input; } } template __host__ void host_cleartext_multiplication(cudaStream_t stream, uint32_t gpu_index, - T *output, T *lwe_input, T *cleartext_input, + T *output, T *lwe_input, T cleartext_input, uint32_t input_lwe_dimension, 
uint32_t input_lwe_ciphertext_count) { diff --git a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_amortized.cuh b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_amortized.cuh index 668883e044..b1ef7f087b 100644 --- a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_amortized.cuh +++ b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_amortized.cuh @@ -89,8 +89,8 @@ __global__ void device_programmable_bootstrap_amortized( // Put "b", the body, in [0, 2N[ Torus b_hat = 0; - modulus_switch(block_lwe_array_in[lwe_dimension], b_hat, - params::log2_degree + 1); + apply_modulus_switch(block_lwe_array_in[lwe_dimension], b_hat, + params::log2_degree + 1); divide_by_monomial_negacyclic_inplace( @@ -105,8 +105,8 @@ __global__ void device_programmable_bootstrap_amortized( // Put "a" in [0, 2N[ instead of Zq Torus a_hat = 0; - modulus_switch(block_lwe_array_in[iteration], a_hat, - params::log2_degree + 1); + apply_modulus_switch(block_lwe_array_in[iteration], a_hat, + params::log2_degree + 1); // Perform ACC * (X^ä - 1) multiply_by_monomial_negacyclic_and_sub_polynomial< diff --git a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_cg_classic.cuh b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_cg_classic.cuh index 3b0eecce7b..c47febd6a8 100644 --- a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_cg_classic.cuh +++ b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_cg_classic.cuh @@ -93,8 +93,8 @@ __global__ void device_programmable_bootstrap_cg( // Put "b" in [0, 2N[ Torus b_hat = 0; - modulus_switch(block_lwe_array_in[lwe_dimension], b_hat, - params::log2_degree + 1); + apply_modulus_switch(block_lwe_array_in[lwe_dimension], b_hat, + params::log2_degree + 1); divide_by_monomial_negacyclic_inplace( @@ -106,7 +106,7 @@ __global__ void device_programmable_bootstrap_cg( // Put "a" in [0, 2N[ Torus a_hat = 0; - modulus_switch(block_lwe_array_in[i], a_hat, params::log2_degree + 1); + apply_modulus_switch(block_lwe_array_in[i], a_hat, params::log2_degree + 1); // Perform ACC * (X^ä - 1) multiply_by_monomial_negacyclic_and_sub_polynomial< diff --git a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_cg_multibit.cuh b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_cg_multibit.cuh index d8e17c936f..27d398cf99 100644 --- a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_cg_multibit.cuh +++ b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_cg_multibit.cuh @@ -79,8 +79,8 @@ __global__ void device_multi_bit_programmable_bootstrap_cg_accumulate( if (lwe_offset == 0) { // Put "b" in [0, 2N[ Torus b_hat = 0; - modulus_switch(block_lwe_array_in[lwe_dimension], b_hat, - params::log2_degree + 1); + apply_modulus_switch(block_lwe_array_in[lwe_dimension], b_hat, + params::log2_degree + 1); divide_by_monomial_negacyclic_inplace( diff --git a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_classic.cuh b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_classic.cuh index 8a60c6b778..4c315192d4 100644 --- a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_classic.cuh +++ b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_classic.cuh @@ -74,8 +74,8 @@ __global__ void device_programmable_bootstrap_step_one( // First iteration // Put "b" in [0, 2N[ Torus b_hat = 0; - modulus_switch(block_lwe_array_in[lwe_dimension], b_hat, - params::log2_degree + 1); + 
apply_modulus_switch(block_lwe_array_in[lwe_dimension], b_hat, + params::log2_degree + 1); // The y-dimension is used to select the element of the GLWE this block will // compute divide_by_monomial_negacyclic_inplace @@ -102,7 +102,7 @@ __global__ void device_multi_bit_programmable_bootstrap_keybundle( synchronize_threads_in_block(); // Multiply by the bsk element - polynomial_product_accumulate_by_monomial( + polynomial_accumulate_monic_monomial_mul( accumulator, bsk_poly, monomial_degree, false); } @@ -203,8 +203,8 @@ __global__ void device_multi_bit_programmable_bootstrap_accumulate_step_one( // Initializes the accumulator with the body of LWE // Put "b" in [0, 2N[ Torus b_hat = 0; - modulus_switch(block_lwe_array_in[lwe_dimension], b_hat, - params::log2_degree + 1); + apply_modulus_switch(block_lwe_array_in[lwe_dimension], b_hat, + params::log2_degree + 1); divide_by_monomial_negacyclic_inplace( diff --git a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_tbc_classic.cuh b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_tbc_classic.cuh index b19cde44d9..abd35ee8fe 100644 --- a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_tbc_classic.cuh +++ b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_tbc_classic.cuh @@ -96,8 +96,8 @@ __global__ void device_programmable_bootstrap_tbc( // Put "b" in [0, 2N[ Torus b_hat = 0; - modulus_switch(block_lwe_array_in[lwe_dimension], b_hat, - params::log2_degree + 1); + apply_modulus_switch(block_lwe_array_in[lwe_dimension], b_hat, + params::log2_degree + 1); divide_by_monomial_negacyclic_inplace( @@ -109,8 +109,9 @@ __global__ void device_programmable_bootstrap_tbc( // Put "a" in [0, 2N[ Torus a_hat = 0; - modulus_switch(block_lwe_array_in[i], a_hat, - params::log2_degree + 1); // 2 * params::log2_degree + 1); + apply_modulus_switch(block_lwe_array_in[i], a_hat, + params::log2_degree + + 1); // 2 * params::log2_degree + 1); // Perform ACC * (X^ä - 1) multiply_by_monomial_negacyclic_and_sub_polynomial< diff --git a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_tbc_multibit.cuh b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_tbc_multibit.cuh index fd143e494e..49087e92a6 100644 --- a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_tbc_multibit.cuh +++ b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_tbc_multibit.cuh @@ -86,8 +86,8 @@ __global__ void device_multi_bit_programmable_bootstrap_tbc_accumulate( if (lwe_offset == 0) { // Put "b" in [0, 2N[ Torus b_hat = 0; - modulus_switch(block_lwe_array_in[lwe_dimension], b_hat, - params::log2_degree + 1); + apply_modulus_switch(block_lwe_array_in[lwe_dimension], b_hat, + params::log2_degree + 1); divide_by_monomial_negacyclic_inplace( diff --git a/backends/tfhe-cuda-backend/cuda/src/polynomial/polynomial_math.cuh b/backends/tfhe-cuda-backend/cuda/src/polynomial/polynomial_math.cuh index a6d7901f20..8e3b2d9c94 100644 --- a/backends/tfhe-cuda-backend/cuda/src/polynomial/polynomial_math.cuh +++ b/backends/tfhe-cuda-backend/cuda/src/polynomial/polynomial_math.cuh @@ -55,13 +55,14 @@ __device__ void polynomial_product_accumulate_in_fourier_domain( } } -// If init_accumulator is set, assumes that result was not initialized and does -// that with the outcome of first * second +// This method expects to work with polynomial_size / params::opt threads in the +// x-block If init_accumulator is set, assumes that result was not initialized +// and does that with the outcome of first * second template 
__device__ void -polynomial_product_accumulate_by_monomial(T *result, const T *__restrict__ poly, - uint64_t monomial_degree, - bool init_accumulator = false) { +polynomial_accumulate_monic_monomial_mul(T *result, const T *__restrict__ poly, + uint64_t monomial_degree, + bool init_accumulator = false) { // monomial_degree \in [0, 2 * params::degree) int full_cycles_count = monomial_degree / params::degree; int remainder_degrees = monomial_degree % params::degree; @@ -82,4 +83,37 @@ polynomial_product_accumulate_by_monomial(T *result, const T *__restrict__ poly, } } +// This method expects to work with num_poly * polynomial_size threads in the +// grid +template +__device__ void polynomial_accumulate_monic_monomial_mul_batch( + T *result_array, T *poly_array, uint64_t monomial_degree, + uint32_t polynomial_size, uint32_t num_poly, + bool init_accumulator = false) { + // monomial_degree \in [0, 2 * params::degree) + int full_cycles_count = monomial_degree / polynomial_size; + int remainder_degrees = monomial_degree % polynomial_size; + + auto tid = threadIdx.x + blockIdx.x * blockDim.x; + int pos = tid % polynomial_size; + + // Select a input + auto poly = poly_array + (tid / polynomial_size) * polynomial_size; + auto result = result_array + (tid / polynomial_size) * polynomial_size; + + // Calculate the rotation + T element = poly[pos]; + int new_pos = (pos + monomial_degree) % polynomial_size; + + // Calculate the new coefficient + T x = SEL(element, -element, full_cycles_count % 2); // monomial coefficient + x = SEL(-x, x, new_pos >= remainder_degrees); + + // Write result + if (init_accumulator) + result[new_pos] = x; + else + result[new_pos] += x; +} + #endif // CNCRT_POLYNOMIAL_MATH_H diff --git a/backends/tfhe-cuda-backend/src/cuda_bind.rs b/backends/tfhe-cuda-backend/src/cuda_bind.rs index a0f0d317d5..2ced0e87e1 100644 --- a/backends/tfhe-cuda-backend/src/cuda_bind.rs +++ b/backends/tfhe-cuda-backend/src/cuda_bind.rs @@ -342,6 +342,33 @@ extern "C" { num_samples: u32, ); + pub fn cuda_fp_keyswitch_lwe_to_glwe_64( + stream: *mut c_void, + gpu_index: u32, + glwe_array_out: *mut c_void, + lwe_array_in: *const c_void, + fp_ksk_array: *const c_void, + input_lwe_dimension: u32, + output_glwe_dimension: u32, + polynomial_size: u32, + base_log: u32, + level_count: u32, + ); + + pub fn cuda_fp_keyswitch_lwe_list_to_glwe_64( + stream: *mut c_void, + gpu_index: u32, + glwe_array_out: *mut c_void, + lwe_array_in: *const c_void, + fp_ksk_array: *const c_void, + input_lwe_dimension: u32, + output_glwe_dimension: u32, + polynomial_size: u32, + base_log: u32, + level_count: u32, + num_lwes: u32, + ); + /// Perform the negation of a u64 input LWE ciphertext vector. 
/// - `v_stream` is a void pointer to the Cuda stream to be used in the kernel launch /// - `gpu_index` is the index of the GPU to be used in the kernel launch @@ -516,6 +543,69 @@ extern "C" { mem_ptr: *mut *mut i8, ); + pub fn scratch_cuda_compression_integer_radix_ciphertext_64( + streams: *const *mut c_void, + gpu_indexes: *const u32, + gpu_count: u32, + mem_ptr: *mut *mut i8, + glwe_dimension: u32, + polynomial_size: u32, + lwe_dimension: u32, + ks_level: u32, + ks_base_log: u32, + pbs_level: u32, + pbs_base_log: u32, + grouping_factor: u32, + num_lwes: u32, + message_modulus: u32, + carry_modulus: u32, + pbs_type: u32, + lwe_per_glwe: u32, + storage_log_modulus: u32, + mode: u32, + allocate_gpu_memory: bool, + ); + + pub fn cuda_compression_compress_integer_radix_ciphertext_64( + streams: *const *mut c_void, + gpu_indexes: *const u32, + gpu_count: u32, + glwe_array_out: *mut c_void, + lwe_array_in: *const c_void, + fp_ksk: *const *mut c_void, + num_lwes: u32, + mem_ptr: *mut i8, + ); + + pub fn cuda_compression_decompress_integer_radix_ciphertext_64( + streams: *const *mut c_void, + gpu_indexes: *const u32, + gpu_count: u32, + lwe_out: *mut c_void, + glwe_array_in: *const c_void, + bsks: *const *mut c_void, + mem_ptr: *mut i8, + ); + + pub fn cleanup_cuda_compression_integer_radix_ciphertext_64( + streams: *const *mut c_void, + gpu_indexes: *const u32, + gpu_count: u32, + mem_ptr: *mut *mut i8, + ); + + pub fn cuda_glwe_sample_extract_64( + streams: *const *mut c_void, + gpu_indexes: *const u32, + gpu_count: u32, + lwe_array_out: *mut c_void, + glwe_in: *const c_void, + nth_array: *const u32, + num_samples: u32, + glwe_dimension: u32, + polynomial_size: u32, + ); + pub fn cuda_scalar_addition_integer_radix_ciphertext_64_inplace( streams: *const *mut c_void, gpu_indexes: *const u32, @@ -628,17 +718,6 @@ extern "C" { gpu_count: u32, mem_ptr: *mut *mut i8, ); - pub fn cuda_glwe_sample_extract_64( - streams: *const *mut c_void, - gpu_indexes: *const u32, - gpu_count: u32, - lwe_array_out: *mut c_void, - glwe_in: *const c_void, - nth_array: *const u32, - num_samples: u32, - glwe_dimension: u32, - polynomial_size: u32, - ); pub fn scratch_cuda_integer_radix_comparison_kb_64( streams: *const *mut c_void, diff --git a/tfhe/src/core_crypto/gpu/algorithms/lwe_packing_keyswitch.rs b/tfhe/src/core_crypto/gpu/algorithms/lwe_packing_keyswitch.rs new file mode 100644 index 0000000000..5b4565cf1e --- /dev/null +++ b/tfhe/src/core_crypto/gpu/algorithms/lwe_packing_keyswitch.rs @@ -0,0 +1,62 @@ +use crate::core_crypto::gpu::glwe_ciphertext::CudaGlweCiphertext; +use crate::core_crypto::gpu::lwe_ciphertext::CudaLweCiphertext; +use crate::core_crypto::gpu::lwe_ciphertext_list::CudaLweCiphertextList; +use crate::core_crypto::gpu::lwe_packing_keyswitch_key::CudaLwePackingKeyswitchKey; +use crate::core_crypto::gpu::{packing_keyswitch_async, packing_keyswitch_list_async, CudaStreams}; +use crate::core_crypto::prelude::{CastInto, UnsignedTorus}; + +pub fn cuda_keyswitch_lwe_ciphertext_into_glwe_ciphertext( + lwe_pksk: &CudaLwePackingKeyswitchKey, + input_lwe_ciphertext: &CudaLweCiphertext, + output_glwe_ciphertext: &mut CudaGlweCiphertext, + streams: &CudaStreams, +) where + // CastInto required for PBS modulus switch which returns a usize + Scalar: UnsignedTorus + CastInto, +{ + let input_lwe_dimension = input_lwe_ciphertext.lwe_dimension(); + let output_glwe_dimension = output_glwe_ciphertext.glwe_dimension(); + let output_polynomial_size = output_glwe_ciphertext.polynomial_size(); + + unsafe { + 
packing_keyswitch_async( + streams, + &mut output_glwe_ciphertext.0.d_vec, + &input_lwe_ciphertext.0.d_vec, + input_lwe_dimension, + output_glwe_dimension, + output_polynomial_size, + &lwe_pksk.d_vec, + lwe_pksk.decomposition_base_log(), + lwe_pksk.decomposition_level_count(), + ); + } +} +pub fn cuda_keyswitch_lwe_ciphertext_list_into_glwe_ciphertext( + lwe_pksk: &CudaLwePackingKeyswitchKey, + input_lwe_ciphertext_list: &CudaLweCiphertextList, + output_glwe_ciphertext: &mut CudaGlweCiphertext, + streams: &CudaStreams, +) where + // CastInto required for PBS modulus switch which returns a usize + Scalar: UnsignedTorus + CastInto, +{ + let input_lwe_dimension = input_lwe_ciphertext_list.lwe_dimension(); + let output_glwe_dimension = output_glwe_ciphertext.glwe_dimension(); + let output_polynomial_size = output_glwe_ciphertext.polynomial_size(); + + unsafe { + packing_keyswitch_list_async( + streams, + &mut output_glwe_ciphertext.0.d_vec, + &input_lwe_ciphertext_list.0.d_vec, + input_lwe_dimension, + output_glwe_dimension, + output_polynomial_size, + &lwe_pksk.d_vec, + lwe_pksk.decomposition_base_log(), + lwe_pksk.decomposition_level_count(), + input_lwe_ciphertext_list.lwe_ciphertext_count(), + ); + } +} diff --git a/tfhe/src/core_crypto/gpu/algorithms/mod.rs b/tfhe/src/core_crypto/gpu/algorithms/mod.rs index ee6ddd2b1d..489bbb1565 100644 --- a/tfhe/src/core_crypto/gpu/algorithms/mod.rs +++ b/tfhe/src/core_crypto/gpu/algorithms/mod.rs @@ -1,13 +1,15 @@ +pub mod glwe_sample_extraction; +pub mod lwe_keyswitch; pub mod lwe_linear_algebra; pub mod lwe_multi_bit_programmable_bootstrapping; +pub mod lwe_packing_keyswitch; pub mod lwe_programmable_bootstrapping; -pub mod glwe_sample_extraction; -mod lwe_keyswitch; #[cfg(test)] mod test; pub use lwe_keyswitch::*; pub use lwe_linear_algebra::*; pub use lwe_multi_bit_programmable_bootstrapping::*; +pub use lwe_packing_keyswitch::*; pub use lwe_programmable_bootstrapping::*; diff --git a/tfhe/src/core_crypto/gpu/algorithms/test/lwe_packing_keyswitch.rs b/tfhe/src/core_crypto/gpu/algorithms/test/lwe_packing_keyswitch.rs new file mode 100644 index 0000000000..ec5f309f69 --- /dev/null +++ b/tfhe/src/core_crypto/gpu/algorithms/test/lwe_packing_keyswitch.rs @@ -0,0 +1,239 @@ +use super::*; +use crate::core_crypto::gpu::algorithms::lwe_packing_keyswitch::cuda_keyswitch_lwe_ciphertext_list_into_glwe_ciphertext; +use crate::core_crypto::gpu::glwe_ciphertext::CudaGlweCiphertext; +use crate::core_crypto::gpu::lwe_ciphertext::CudaLweCiphertext; +use crate::core_crypto::gpu::lwe_ciphertext_list::CudaLweCiphertextList; +use crate::core_crypto::gpu::{cuda_keyswitch_lwe_ciphertext_into_glwe_ciphertext, CudaStreams}; +use serde::de::DeserializeOwned; +use serde::Serialize; + +const NB_TESTS: usize = 10; +fn generate_keys( + params: PackingKeySwitchTestParams, + streams: &CudaStreams, + rsc: &mut TestResources, +) -> CudaPackingKeySwitchKeys { + let lwe_sk = allocate_and_generate_new_binary_lwe_secret_key( + params.lwe_dimension, + &mut rsc.secret_random_generator, + ); + + let glwe_sk = allocate_and_generate_new_binary_glwe_secret_key( + params.glwe_dimension, + params.polynomial_size, + &mut rsc.secret_random_generator, + ); + + let pksk = allocate_and_generate_new_lwe_packing_keyswitch_key( + &lwe_sk, + &glwe_sk, + params.pbs_base_log, + params.pbs_level, + params.glwe_noise_distribution, + params.ciphertext_modulus, + &mut rsc.encryption_random_generator, + ); + + assert!(check_encrypted_content_respects_mod( + &pksk, + params.ciphertext_modulus + )); + + let 
cuda_pksk = CudaLwePackingKeyswitchKey::from_lwe_packing_keyswitch_key(&pksk, &streams); + + CudaPackingKeySwitchKeys { + lwe_sk, + glwe_sk, + pksk: cuda_pksk, + h_pksk: pksk, + } +} + +fn lwe_encrypt_pks_to_glwe_decrypt_custom_mod(params: P) +where + Scalar: UnsignedTorus + CastInto + Serialize + DeserializeOwned, + P: Into>, + PackingKeySwitchTestParams: KeyCacheAccess>, +{ + let params = params.into(); + + let lwe_noise_distribution = params.lwe_noise_distribution; + let ciphertext_modulus = params.ciphertext_modulus; + let message_modulus_log = params.message_modulus_log; + let encoding_with_padding = get_encoding_with_padding(ciphertext_modulus); + + let mut rsc = TestResources::new(); + + let msg_modulus = Scalar::ONE.shl(message_modulus_log.0); + let mut msg = msg_modulus; + let delta: Scalar = encoding_with_padding / msg_modulus; + + let gpu_index = 0; + let stream = CudaStreams::new_single_gpu(gpu_index); + + while msg != Scalar::ZERO { + msg = msg.wrapping_sub(Scalar::ONE); + for _ in 0..NB_TESTS { + let keys = generate_keys(params, &stream, &mut rsc); + let (h_pksk, pksk, lwe_sk, glwe_sk) = + (keys.h_pksk, keys.pksk, keys.lwe_sk, keys.glwe_sk); + + let plaintext = Plaintext(msg * delta); + + let input_lwe = allocate_and_encrypt_new_lwe_ciphertext( + &lwe_sk, + plaintext, + lwe_noise_distribution, + ciphertext_modulus, + &mut rsc.encryption_random_generator, + ); + + let d_input_lwe = CudaLweCiphertext::from_lwe_ciphertext(&input_lwe, &stream); + + assert!(check_encrypted_content_respects_mod( + &input_lwe, + ciphertext_modulus + )); + + let mut d_output_glwe = CudaGlweCiphertext::new( + glwe_sk.glwe_dimension(), + glwe_sk.polynomial_size(), + ciphertext_modulus, + &stream, + ); + + cuda_keyswitch_lwe_ciphertext_into_glwe_ciphertext( + &pksk, + &d_input_lwe, + &mut d_output_glwe, + &stream, + ); + + let output_glwe = d_output_glwe.to_glwe_ciphertext(&stream); + + let mut decrypted_plaintext_list = PlaintextList::new( + Scalar::ZERO, + PlaintextCount(output_glwe.polynomial_size().0), + ); + + decrypt_glwe_ciphertext(&glwe_sk, &output_glwe, &mut decrypted_plaintext_list); + + let decoded = round_decode(*decrypted_plaintext_list.get(0).0, delta) % msg_modulus; + + assert_eq!(msg, decoded); + } + + // In coverage, we break after one while loop iteration, changing message values does not + // yield higher coverage + #[cfg(tarpaulin)] + break; + } +} + +fn lwe_list_encrypt_pks_to_glwe_decrypt_custom_mod(params: P) +where + Scalar: UnsignedTorus + CastInto + Serialize + DeserializeOwned, + P: Into>, + PackingKeySwitchTestParams: KeyCacheAccess>, +{ + let params = params.into(); + + let lwe_noise_distribution = params.lwe_noise_distribution; + let ciphertext_modulus = params.ciphertext_modulus; + let message_modulus_log = params.message_modulus_log; + let encoding_with_padding = get_encoding_with_padding(ciphertext_modulus); + + let mut rsc = TestResources::new(); + + let msg_modulus = Scalar::ONE.shl(message_modulus_log.0); + let mut msg = msg_modulus; + let delta: Scalar = encoding_with_padding / msg_modulus; + + let gpu_index = 0; + let stream = CudaStreams::new_single_gpu(gpu_index); + + // while msg != Scalar::ZERO { + msg = msg.wrapping_sub(Scalar::ONE); + for _ in 0..NB_TESTS { + let keys = generate_keys(params, &stream, &mut rsc); + let (h_pksk, pksk, lwe_sk, glwe_sk) = (keys.h_pksk, keys.pksk, keys.lwe_sk, keys.glwe_sk); + + let mut input_lwe_list = LweCiphertextList::new( + Scalar::ZERO, + lwe_sk.lwe_dimension().to_lwe_size(), + LweCiphertextCount(glwe_sk.polynomial_size().0), 
+ ciphertext_modulus, + ); + + let mut input_plaintext_list = + PlaintextList::new(msg * delta, PlaintextCount(glwe_sk.polynomial_size().0)); + + encrypt_lwe_ciphertext_list( + &lwe_sk, + &mut input_lwe_list, + &input_plaintext_list, + lwe_noise_distribution, + &mut rsc.encryption_random_generator, + ); + + let d_input_lwe_list = + CudaLweCiphertextList::from_lwe_ciphertext_list(&input_lwe_list, &stream); + + assert!(check_encrypted_content_respects_mod( + &input_lwe_list, + ciphertext_modulus + )); + + let mut d_output_glwe = CudaGlweCiphertext::new( + glwe_sk.glwe_dimension(), + glwe_sk.polynomial_size(), + ciphertext_modulus, + &stream, + ); + + cuda_keyswitch_lwe_ciphertext_list_into_glwe_ciphertext( + &pksk, + &d_input_lwe_list, + &mut d_output_glwe, + &stream, + ); + + let mut reference_output_glwe = GlweCiphertext::new( + Scalar::ZERO, + glwe_sk.glwe_dimension().to_glwe_size(), + glwe_sk.polynomial_size(), + ciphertext_modulus, + ); + + keyswitch_lwe_ciphertext_list_and_pack_in_glwe_ciphertext( + &h_pksk, + &input_lwe_list, + &mut reference_output_glwe, + ); + + let output_glwe = d_output_glwe.to_glwe_ciphertext(&stream); + + let mut decrypted_plaintext_list = PlaintextList::new( + Scalar::ZERO, + PlaintextCount(output_glwe.polynomial_size().0), + ); + + decrypt_glwe_ciphertext(&glwe_sk, &output_glwe, &mut decrypted_plaintext_list); + + decrypted_plaintext_list + .iter_mut() + .for_each(|x| *x.0 = round_decode(*x.0, delta) % msg_modulus); + input_plaintext_list.iter_mut().for_each(|x| *x.0 /= delta); + + assert_eq!(decrypted_plaintext_list, input_plaintext_list); + } + + // In coverage, we break after one while loop iteration, changing message values does not + // yield higher coverage + #[cfg(tarpaulin)] + break; + // } +} + +create_gpu_parametrized_test!(lwe_encrypt_pks_to_glwe_decrypt_custom_mod); +create_gpu_parametrized_test!(lwe_list_encrypt_pks_to_glwe_decrypt_custom_mod); diff --git a/tfhe/src/core_crypto/gpu/algorithms/test/mod.rs b/tfhe/src/core_crypto/gpu/algorithms/test/mod.rs index 59ba20af18..7235d3e1ac 100644 --- a/tfhe/src/core_crypto/gpu/algorithms/test/mod.rs +++ b/tfhe/src/core_crypto/gpu/algorithms/test/mod.rs @@ -4,8 +4,16 @@ mod glwe_sample_extraction; mod lwe_keyswitch; mod lwe_linear_algebra; mod lwe_multi_bit_programmable_bootstrapping; +mod lwe_packing_keyswitch; mod lwe_programmable_bootstrapping; +pub struct CudaPackingKeySwitchKeys { + pub lwe_sk: LweSecretKey>, + pub glwe_sk: GlweSecretKey>, + pub pksk: CudaLwePackingKeyswitchKey, + pub h_pksk: LwePackingKeyswitchKeyOwned, +} + // Macro to generate tests for all parameter sets macro_rules! create_gpu_parametrized_test{ ($name:ident { $($param:ident),* }) => { @@ -47,4 +55,5 @@ macro_rules! 
create_gpu_multi_bit_parametrized_test{ }; } +use crate::core_crypto::gpu::lwe_packing_keyswitch_key::CudaLwePackingKeyswitchKey; use {create_gpu_multi_bit_parametrized_test, create_gpu_parametrized_test}; diff --git a/tfhe/src/core_crypto/gpu/entities/lwe_ciphertext_list.rs b/tfhe/src/core_crypto/gpu/entities/lwe_ciphertext_list.rs index 9defa579f6..9fc0fad460 100644 --- a/tfhe/src/core_crypto/gpu/entities/lwe_ciphertext_list.rs +++ b/tfhe/src/core_crypto/gpu/entities/lwe_ciphertext_list.rs @@ -4,6 +4,7 @@ use crate::core_crypto::prelude::{ CiphertextModulus, Container, LweCiphertext, LweCiphertextCount, LweCiphertextList, LweDimension, LweSize, UnsignedInteger, }; +use std::cmp::min; use tfhe_cuda_backend::cuda_bind::cuda_memcpy_async_gpu_to_gpu; /// A structure representing a vector of LWE ciphertexts with 64 bits of precision on the GPU. @@ -229,11 +230,7 @@ impl CudaLweCiphertextList { let ciphertext_modulus = self.ciphertext_modulus(); // Copy to the GPU - let mut d_vec = CudaVec::new(self.0.d_vec.len(), streams, 0); - unsafe { - d_vec.copy_from_gpu_async(&self.0.d_vec, streams, 0); - } - streams.synchronize(); + let d_vec = unsafe { self.0.d_vec.duplicate(streams, 0) }; let cuda_lwe_list = CudaLweList { d_vec, @@ -244,6 +241,62 @@ impl CudaLweCiphertextList { Self(cuda_lwe_list) } + // Retrieve [start,end) lwe ciphertexts in the list + pub fn retrieve_subset( + &self, + start: usize, + end: usize, + streams: &CudaStreams, + gpu_index: u32, + ) -> Option { + if end < start { + None + } else { + let lwe_dimension = self.lwe_dimension(); + let lwe_ciphertext_count = + LweCiphertextCount(min(end - start, self.lwe_ciphertext_count().0)); + let ciphertext_modulus = self.ciphertext_modulus(); + + // Copy to the GPU + let d_vec = unsafe { + let mut d_vec = CudaVec::new_async( + lwe_ciphertext_count.0 * lwe_dimension.to_lwe_size().0, + streams, + gpu_index, + ); + // Todo: We might use copy_src_range_gpu_to_gpu_async here + let src_ptr = self + .0 + .d_vec + .as_c_ptr(gpu_index) + .add(start * lwe_dimension.to_lwe_size().0 * std::mem::size_of::()); + let size = lwe_ciphertext_count.0 + * lwe_dimension.to_lwe_size().0 + * std::mem::size_of::(); + cuda_memcpy_async_gpu_to_gpu( + d_vec.as_mut_c_ptr(gpu_index), + src_ptr, + size as u64, + streams.ptr[gpu_index as usize], + streams.gpu_indexes[gpu_index as usize], + ); + + d_vec + }; + + streams.synchronize(); + + let cuda_lwe_list = CudaLweList { + d_vec, + lwe_ciphertext_count, + lwe_dimension, + ciphertext_modulus, + }; + + Some(CudaLweCiphertextList { 0: cuda_lwe_list }) + } + } + pub(crate) fn lwe_dimension(&self) -> LweDimension { self.0.lwe_dimension } diff --git a/tfhe/src/core_crypto/gpu/entities/lwe_packing_keyswitch_key.rs b/tfhe/src/core_crypto/gpu/entities/lwe_packing_keyswitch_key.rs new file mode 100644 index 0000000000..19de6d7a18 --- /dev/null +++ b/tfhe/src/core_crypto/gpu/entities/lwe_packing_keyswitch_key.rs @@ -0,0 +1,87 @@ +use crate::core_crypto::gpu::vec::CudaVec; +use crate::core_crypto::gpu::{convert_lwe_keyswitch_key_async, CudaStreams}; +use crate::core_crypto::prelude::{ + lwe_packing_keyswitch_key_input_key_element_encrypted_size, CiphertextModulus, + DecompositionBaseLog, DecompositionLevelCount, GlweSize, LweDimension, + LwePackingKeyswitchKeyOwned, PolynomialSize, UnsignedInteger, +}; + +/// A keyswitching key allowing to keyswitch [`an LWE ciphertext`](super::LweCiphertext) to +/// [`a GLWE ciphertext`](super::GlweCiphertext) allowing to pack several LWE ciphertexts into a +/// GLWE ciphertext. 
+#[derive(Debug)]
+pub struct CudaLwePackingKeyswitchKey<T: UnsignedInteger> {
+    pub(crate) d_vec: CudaVec<T>,
+    decomp_base_log: DecompositionBaseLog,
+    decomp_level_count: DecompositionLevelCount,
+    output_glwe_size: GlweSize,
+    output_polynomial_size: PolynomialSize,
+    ciphertext_modulus: CiphertextModulus<T>,
+}
+
+impl<T: UnsignedInteger> CudaLwePackingKeyswitchKey<T> {
+    pub fn from_lwe_packing_keyswitch_key(
+        h_ksk: &LwePackingKeyswitchKeyOwned<T>,
+        streams: &CudaStreams,
+    ) -> Self {
+        let decomp_base_log = h_ksk.decomposition_base_log();
+        let decomp_level_count = h_ksk.decomposition_level_count();
+        let input_lwe_size = h_ksk.input_key_lwe_dimension().to_lwe_size();
+        let output_glwe_size = h_ksk.output_key_glwe_dimension().to_glwe_size();
+        let output_polynomial_size = h_ksk.output_polynomial_size();
+        let ciphertext_modulus = h_ksk.ciphertext_modulus();
+
+        // Allocate memory
+        let mut d_vec = CudaVec::<T>::new_multi_gpu(
+            input_lwe_size.to_lwe_dimension().0
+                * lwe_packing_keyswitch_key_input_key_element_encrypted_size(
+                    decomp_level_count,
+                    output_glwe_size,
+                    output_polynomial_size,
+                ),
+            streams,
+        );
+
+        unsafe {
+            convert_lwe_keyswitch_key_async(streams, &mut d_vec, h_ksk.as_ref());
+        }
+
+        streams.synchronize();
+
+        Self {
+            d_vec,
+            decomp_base_log,
+            decomp_level_count,
+            output_glwe_size,
+            output_polynomial_size,
+            ciphertext_modulus,
+        }
+    }
+
+    pub(crate) fn decomposition_base_log(&self) -> DecompositionBaseLog {
+        self.decomp_base_log
+    }
+    pub(crate) fn decomposition_level_count(&self) -> DecompositionLevelCount {
+        self.decomp_level_count
+    }
+
+    pub(crate) fn output_glwe_size(&self) -> GlweSize {
+        self.output_glwe_size
+    }
+    pub(crate) fn ciphertext_modulus(&self) -> CiphertextModulus<T> {
+        self.ciphertext_modulus
+    }
+    pub(crate) fn output_polynomial_size(&self) -> PolynomialSize {
+        self.output_polynomial_size
+    }
+    pub fn input_key_lwe_dimension(&self) -> LweDimension {
+        LweDimension(
+            self.d_vec.len
+                / lwe_packing_keyswitch_key_input_key_element_encrypted_size(
+                    self.decomp_level_count,
+                    self.output_glwe_size,
+                    self.output_polynomial_size,
+                ),
+        )
+    }
+}
diff --git a/tfhe/src/core_crypto/gpu/entities/mod.rs b/tfhe/src/core_crypto/gpu/entities/mod.rs
index 38a3e27bab..f76f8b5bde 100644
--- a/tfhe/src/core_crypto/gpu/entities/mod.rs
+++ b/tfhe/src/core_crypto/gpu/entities/mod.rs
@@ -5,3 +5,4 @@ pub mod lwe_ciphertext;
 pub mod lwe_ciphertext_list;
 pub mod lwe_keyswitch_key;
 pub mod lwe_multi_bit_bootstrap_key;
+pub mod lwe_packing_keyswitch_key;
diff --git a/tfhe/src/core_crypto/gpu/mod.rs b/tfhe/src/core_crypto/gpu/mod.rs
index 22edcfba73..a6e57c11c8 100644
--- a/tfhe/src/core_crypto/gpu/mod.rs
+++ b/tfhe/src/core_crypto/gpu/mod.rs
@@ -275,6 +275,72 @@ pub unsafe fn convert_lwe_keyswitch_key_async(
     dest.copy_from_cpu_multi_gpu_async(src, streams);
 }
 
+/// Discarding packing keyswitch on a single LWE ciphertext
+///
+/// # Safety
+///
+/// [CudaStreams::synchronize] __must__ be called as soon as synchronization is
+/// required
+#[allow(clippy::too_many_arguments)]
+pub unsafe fn packing_keyswitch_async<T: UnsignedInteger>(
+    streams: &CudaStreams,
+    glwe_array_out: &mut CudaVec<T>,
+    lwe_array_in: &CudaVec<T>,
+    input_lwe_dimension: LweDimension,
+    output_glwe_dimension: GlweDimension,
+    output_polynomial_size: PolynomialSize,
+    fp_keyswitch_key: &CudaVec<T>,
+    base_log: DecompositionBaseLog,
+    l_gadget: DecompositionLevelCount,
+) {
+    cuda_fp_keyswitch_lwe_to_glwe_64(
+        streams.ptr[0],
+        streams.gpu_indexes[0],
+        glwe_array_out.as_mut_c_ptr(0),
+        lwe_array_in.as_c_ptr(0),
+        fp_keyswitch_key.as_c_ptr(0),
+        input_lwe_dimension.0
as u32, + output_glwe_dimension.0 as u32, + output_polynomial_size.0 as u32, + base_log.0 as u32, + l_gadget.0 as u32, + ); +} + +/// Discarding packing keyswitch on a vector of LWE ciphertexts +/// +/// # Safety +/// +/// [CudaStreams::synchronize] __must__ be called as soon as synchronization is +/// required +#[allow(clippy::too_many_arguments)] +pub unsafe fn packing_keyswitch_list_async( + streams: &CudaStreams, + glwe_array_out: &mut CudaVec, + lwe_array_in: &CudaVec, + input_lwe_dimension: LweDimension, + output_glwe_dimension: GlweDimension, + output_polynomial_size: PolynomialSize, + fp_keyswitch_key: &CudaVec, + base_log: DecompositionBaseLog, + l_gadget: DecompositionLevelCount, + num_lwes: LweCiphertextCount, +) { + cuda_fp_keyswitch_lwe_list_to_glwe_64( + streams.ptr[0], + streams.gpu_indexes[0], + glwe_array_out.as_mut_c_ptr(0), + lwe_array_in.as_c_ptr(0), + fp_keyswitch_key.as_c_ptr(0), + input_lwe_dimension.0 as u32, + output_glwe_dimension.0 as u32, + output_polynomial_size.0 as u32, + base_log.0 as u32, + l_gadget.0 as u32, + num_lwes.0 as u32, + ); +} + /// Convert programmable bootstrap key /// /// # Safety diff --git a/tfhe/src/integer/client_key/radix.rs b/tfhe/src/integer/client_key/radix.rs index 36333ddbc2..56d6493a2b 100644 --- a/tfhe/src/integer/client_key/radix.rs +++ b/tfhe/src/integer/client_key/radix.rs @@ -1,10 +1,16 @@ //! Definition of the client key for radix decomposition use super::{ClientKey, RecomposableSignedInteger, SecretEncryptionKeyView}; +#[cfg(feature = "gpu")] +use crate::core_crypto::gpu::CudaStreams; use crate::core_crypto::prelude::{SignedNumeric, UnsignedNumeric}; use crate::integer::backward_compatibility::client_key::RadixClientKeyVersions; use crate::integer::block_decomposition::{DecomposableInto, RecomposableFrom}; use crate::integer::ciphertext::{RadixCiphertext, SignedRadixCiphertext}; +#[cfg(feature = "gpu")] +use crate::integer::gpu::list_compression::server_keys::{ + CudaCompressionKey, CudaDecompressionKey, +}; use crate::integer::BooleanBlock; use crate::shortint::{Ciphertext as ShortintCiphertext, PBSParameters as ShortintParameters}; use serde::{Deserialize, Serialize}; @@ -131,6 +137,32 @@ impl RadixClientKey { pub fn num_blocks(&self) -> usize { self.num_blocks } + + pub fn new_compression_private_key( + &self, + params: CompressionParameters, + ) -> CompressionPrivateKeys { + self.key.key.new_compression_private_key(params) + } + + pub fn new_compression_decompression_keys( + &self, + private_compression_key: &CompressionPrivateKeys, + ) -> (CompressionKey, DecompressionKey) { + self.key + .key + .new_compression_decompression_keys(private_compression_key) + } + #[cfg(feature = "gpu")] + pub fn new_cuda_compression_decompression_keys( + &self, + private_compression_key: &CompressionPrivateKeys, + streams: &CudaStreams, + ) -> (CudaCompressionKey, CudaDecompressionKey) { + self.key + .key + .new_cuda_compression_decompression_keys(private_compression_key, streams) + } } impl From<(ClientKey, usize)> for RadixClientKey { diff --git a/tfhe/src/integer/gpu/ciphertext/compressed_ciphertext_list.rs b/tfhe/src/integer/gpu/ciphertext/compressed_ciphertext_list.rs new file mode 100644 index 0000000000..3d2fa6bd20 --- /dev/null +++ b/tfhe/src/integer/gpu/ciphertext/compressed_ciphertext_list.rs @@ -0,0 +1,196 @@ +use crate::core_crypto::gpu::glwe_ciphertext_list::CudaGlweCiphertextList; +use crate::core_crypto::gpu::lwe_ciphertext_list::CudaLweCiphertextList; +use crate::core_crypto::gpu::vec::CudaVec; +use 
crate::core_crypto::gpu::CudaStreams; +use crate::core_crypto::prelude::LweCiphertextCount; +use crate::integer::ciphertext::DataKind; +use crate::integer::gpu::ciphertext::info::{CudaBlockInfo, CudaRadixCiphertextInfo}; +use crate::integer::gpu::ciphertext::{ + CudaIntegerRadixCiphertext, CudaRadixCiphertext, CudaSignedRadixCiphertext, + CudaUnsignedRadixCiphertext, +}; +use crate::integer::gpu::list_compression::server_keys::{ + CudaCompressionKey, CudaDecompressionKey, +}; +use itertools::Itertools; +use tfhe_cuda_backend::cuda_bind::cuda_memcpy_async_gpu_to_gpu; + +pub struct CudaCompressedCiphertextList { + pub(crate) packed_list: (CudaGlweCiphertextList, Vec), + info: Vec, +} +impl CudaCompressedCiphertextList { + pub fn len(&self) -> usize { + self.info.len() + } + + pub fn is_empty(&self) -> bool { + self.info.len() == 0 + } + + pub fn blocks_of( + &self, + index: usize, + decomp_key: &CudaDecompressionKey, + streams: &CudaStreams, + gpu_index: u32, + ) -> Option<(CudaRadixCiphertext, DataKind)> { + let preceding_infos = self.info.get(..index)?; + let current_info = self.info.get(index).copied()?; + + let start_block_index: usize = preceding_infos + .iter() + .copied() + .map(DataKind::num_blocks) + .sum(); + + let end_block_index = start_block_index + current_info.num_blocks(); + + Some(( + decomp_key.unpack( + &self.packed_list, + start_block_index, + end_block_index, + streams, + gpu_index, + ), + current_info, + )) + } +} + +pub trait CudaCompressible { + fn compress_into( + self, + messages: &mut Vec, + streams: &CudaStreams, + ) -> DataKind; +} + +// Todo: Can we combine these two impl using CudaIntegerRadixCiphertext? +impl CudaCompressible for CudaSignedRadixCiphertext { + fn compress_into( + self, + messages: &mut Vec, + streams: &CudaStreams, + ) -> DataKind { + let x = self.ciphertext.duplicate(streams); + + let copy = x.duplicate(streams); + messages.push(copy); + + let num_blocks = x.d_blocks.lwe_ciphertext_count().0; + DataKind::Signed(num_blocks) + } +} +impl CudaCompressible for CudaUnsignedRadixCiphertext { + fn compress_into( + self, + messages: &mut Vec, + streams: &CudaStreams, + ) -> DataKind { + let x = self.ciphertext.duplicate(streams); + + let copy = x.duplicate(streams); + messages.push(copy); + + let num_blocks = x.d_blocks.lwe_ciphertext_count().0; + + DataKind::Unsigned(num_blocks) + } +} + +pub struct CompressedCudaCiphertextListBuilder { + pub(crate) ciphertexts: Vec, + pub(crate) info: Vec, +} + +impl CompressedCudaCiphertextListBuilder { + #[allow(clippy::new_without_default)] + pub fn new() -> Self { + Self { + ciphertexts: vec![], + info: vec![], + } + } + + /// ```rust + /// use tfhe::CompressedCiphertextListBuilder;use tfhe::core_crypto::gpu::CudaStreams;use tfhe::integer::gpu::ciphertext::{CudaSignedRadixCiphertext, CudaUnsignedRadixCiphertext}; + /// use tfhe::integer::gpu::gen_keys_radix_gpu;use tfhe::integer::{IntegerCiphertext, RadixCiphertext, RadixClientKey, SignedRadixCiphertext}; + /// use tfhe::integer::gpu::ciphertext::compressed_ciphertext_list::CompressedCudaCiphertextListBuilder; + /// use tfhe::shortint::parameters::list_compression::COMP_PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M64; + /// use tfhe::shortint::prelude::PARAM_MESSAGE_2_CARRY_2_KS_PBS; + /// use tfhe::integer::gpu::list_compression::server_keys::*; + /// + /// let gpu_index = 0; + /// let mut streams = CudaStreams::new_single_gpu(gpu_index); + /// + /// // Generate the client key and the server key: + /// let num_blocks = 4; + /// let (cks, sks) = 
gen_keys_radix_gpu(PARAM_MESSAGE_2_CARRY_2_KS_PBS, num_blocks, &mut streams); /// + /// + /// let private_compression_key = + /// cks.new_compression_private_key(COMP_PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M64); + /// + /// let (cuda_compression_key, cuda_decompression_key) = + /// cks.new_cuda_compression_decompression_keys(&private_compression_key, &streams); + /// + /// let ct1 = cks.encrypt(3_u32); + /// let ct2 = cks.encrypt(2_u32); + /// let ct3 = cks.encrypt_signed(-2); + /// // Copy to GPU + /// let d_ct1 = CudaUnsignedRadixCiphertext::from_radix_ciphertext(&ct1, &mut streams); + /// let d_ct2 = CudaUnsignedRadixCiphertext::from_radix_ciphertext(&ct2, &mut streams); + /// let d_ct3 = CudaSignedRadixCiphertext::from_signed_radix_ciphertext(&ct3, &mut streams); + /// + /// let compressed = CompressedCudaCiphertextListBuilder::new() + /// .push(d_ct1, &streams) + /// .push(d_ct2, &streams).push(d_ct3, &streams). + /// build(&cuda_compression_key, &streams,0); + /// + /// let a = CudaUnsignedRadixCiphertext {ciphertext: compressed.blocks_of(0, &cuda_decompression_key, + /// &streams, 0).unwrap().0}; + /// + /// let result = a.to_radix_ciphertext(&streams); + /// let decrypted: u32 = cks.decrypt(&result); + /// assert_eq!(decrypted, 3_u32); + /// + /// let b = CudaUnsignedRadixCiphertext {ciphertext: compressed.blocks_of(1, + /// &cuda_decompression_key, &streams, 0).unwrap().0}; + /// + /// let result = b.to_radix_ciphertext(&streams); + /// let decrypted: u32 = + /// cks.decrypt(&result); + /// assert_eq!(decrypted, 2_u32); + /// + /// let c = CudaSignedRadixCiphertext {ciphertext: compressed.blocks_of(2, + /// &cuda_decompression_key, &streams, 0).unwrap().0}; + /// + /// let result = c.to_signed_radix_ciphertext(&streams); + /// let decrypted: i32 = + /// cks.decrypt_signed(&result); + /// assert_eq!(decrypted, -2); + pub fn push(&mut self, data: T, streams: &CudaStreams) -> &mut Self { + let kind = data.compress_into(&mut self.ciphertexts, streams); + + if kind.num_blocks() != 0 { + self.info.push(kind); + } + + self + } + + pub fn build( + &self, + comp_key: &CudaCompressionKey, + streams: &CudaStreams, + gpu_index: u32, + ) -> CudaCompressedCiphertextList { + let packed_list = + comp_key.compress_ciphertexts_into_list(&self.ciphertexts, streams, gpu_index); + CudaCompressedCiphertextList { + packed_list: packed_list, + info: self.info.clone(), + } + } +} diff --git a/tfhe/src/integer/gpu/ciphertext/mod.rs b/tfhe/src/integer/gpu/ciphertext/mod.rs index 7f0408eddb..51e702d0c8 100644 --- a/tfhe/src/integer/gpu/ciphertext/mod.rs +++ b/tfhe/src/integer/gpu/ciphertext/mod.rs @@ -1,4 +1,5 @@ pub mod boolean_value; +pub mod compressed_ciphertext_list; pub mod info; use crate::core_crypto::gpu::lwe_ciphertext_list::CudaLweCiphertextList; diff --git a/tfhe/src/integer/gpu/list_compression/mod.rs b/tfhe/src/integer/gpu/list_compression/mod.rs new file mode 100644 index 0000000000..7cee096af1 --- /dev/null +++ b/tfhe/src/integer/gpu/list_compression/mod.rs @@ -0,0 +1 @@ +pub mod server_keys; diff --git a/tfhe/src/integer/gpu/list_compression/server_keys.rs b/tfhe/src/integer/gpu/list_compression/server_keys.rs new file mode 100644 index 0000000000..d21a4f162c --- /dev/null +++ b/tfhe/src/integer/gpu/list_compression/server_keys.rs @@ -0,0 +1,300 @@ +use crate::core_crypto::entities::LweCiphertext; +use crate::core_crypto::gpu::entities::lwe_packing_keyswitch_key::CudaLwePackingKeyswitchKey; +use crate::core_crypto::gpu::glwe_ciphertext_list::CudaGlweCiphertextList; +use 
crate::core_crypto::gpu::lwe_bootstrap_key::CudaLweBootstrapKey; +use crate::core_crypto::gpu::lwe_ciphertext_list::CudaLweCiphertextList; +use crate::core_crypto::gpu::vec::CudaVec; +use crate::core_crypto::gpu::CudaStreams; +use crate::core_crypto::prelude::{ + allocate_and_generate_new_lwe_packing_keyswitch_key, + par_allocate_and_generate_new_lwe_bootstrap_key, CiphertextModulusLog, GlweCiphertextCount, + LweBootstrapKeyOwned, LweCiphertextCount, LweCiphertextList, LweSize, +}; +use crate::integer::ciphertext::DataKind; +use crate::integer::gpu::ciphertext::compressed_ciphertext_list::CudaCompressedCiphertextList; +use crate::integer::gpu::ciphertext::info::{CudaBlockInfo, CudaRadixCiphertextInfo}; +use crate::integer::gpu::ciphertext::CudaRadixCiphertext; +use crate::integer::gpu::server_key::CudaBootstrappingKey; +use crate::integer::gpu::{ + cuda_memcpy_async_gpu_to_gpu, unchecked_compression_compress_integer_radix_async, PBSType, +}; +use crate::shortint::client_key::ClientKey; +use crate::shortint::engine::ShortintEngine; +use crate::shortint::list_compression::{CompressionKey, CompressionPrivateKeys}; +use crate::shortint::{ + CiphertextModulus, ClassicPBSParameters, EncryptionKeyChoice, PBSParameters, +}; +use itertools::Itertools; + +#[derive(Debug)] +pub struct CudaCompressionKey { + pub packing_key_switching_key: CudaLwePackingKeyswitchKey, + pub lwe_per_glwe: LweCiphertextCount, + pub storage_log_modulus: CiphertextModulusLog, +} + +pub struct CudaDecompressionKey { + pub blind_rotate_key: CudaBootstrappingKey, + pub lwe_per_glwe: LweCiphertextCount, + pub ciphertext_modulus: CiphertextModulus, //TODO: Remove this +} + +impl CudaCompressionKey { + pub fn from_compression_key(compression_key: &CompressionKey, streams: &CudaStreams) -> Self { + CudaCompressionKey { + packing_key_switching_key: CudaLwePackingKeyswitchKey::from_lwe_packing_keyswitch_key( + &compression_key.packing_key_switching_key, + streams, + ), + lwe_per_glwe: compression_key.lwe_per_glwe, + storage_log_modulus: compression_key.storage_log_modulus, + } + } + + fn flatten( + &self, + vec_ciphertexts: &Vec, + streams: &CudaStreams, + gpu_index: u32, + ) -> (CudaLweCiphertextList, Vec) { + let first_ct = &vec_ciphertexts.first().unwrap().d_blocks; + + // We assume all ciphertexts will have the same lwe dimension + let lwe_dimension = first_ct.lwe_dimension(); + let ciphertext_modulus = first_ct.ciphertext_modulus(); + + // Compute total number of lwe ciphertexts we will be handling + let total_num_blocks: usize = vec_ciphertexts + .iter() + .map(|x| x.d_blocks.lwe_ciphertext_count().0) + .sum(); + + let lwe_ciphertext_count = LweCiphertextCount(total_num_blocks); + + let d_vec = unsafe { + let mut d_vec = CudaVec::new_async( + lwe_dimension.to_lwe_size().0 * lwe_ciphertext_count.0, + streams, + gpu_index, + ); + let mut offset: usize = 0; + for ciphertext in vec_ciphertexts { + // Todo: We might use copy_self_range_gpu_to_gpu_async here + let dest_ptr = d_vec + .as_mut_c_ptr(gpu_index) + .add(offset * std::mem::size_of::()); + let size = ciphertext.d_blocks.0.d_vec.len * std::mem::size_of::(); + cuda_memcpy_async_gpu_to_gpu( + dest_ptr, + ciphertext.d_blocks.0.d_vec.as_c_ptr(gpu_index), + size as u64, + streams.ptr[gpu_index as usize], + streams.gpu_indexes[gpu_index as usize], + ); + + offset += ciphertext.d_blocks.0.d_vec.len; + } + + streams.synchronize(); + d_vec + }; + + let flattened_ciphertexts = + CudaLweCiphertextList::from_cuda_vec(d_vec, lwe_ciphertext_count, ciphertext_modulus); + + let info = 
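+            // Keep the per-block metadata (message and carry modulus, etc.) next to the packed
+            // data: `unpack` clones it back into the `CudaRadixCiphertextInfo` of the
+            // decompressed ciphertext.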
vec_ciphertexts + .iter() + .flat_map(|x| x.info.blocks.clone()) + .collect_vec(); + + (flattened_ciphertexts, info) + } + + pub fn compress_ciphertexts_into_list( + &self, + ciphertexts: &Vec, + streams: &CudaStreams, + gpu_index: u32, + ) -> (CudaGlweCiphertextList, Vec) { + let lwe_pksk = &self.packing_key_switching_key; + + let polynomial_size = lwe_pksk.output_polynomial_size(); + let ciphertext_modulus = lwe_pksk.ciphertext_modulus(); + let glwe_size = lwe_pksk.output_glwe_size(); + let lwe_size = lwe_pksk.input_key_lwe_dimension().to_lwe_size(); + println!("compress lwe_size: {:?}", lwe_size); + println!("compress polynomial_size: {:?}", polynomial_size); + + let first_ct_info = ciphertexts.first().unwrap().info.blocks.first().unwrap(); + let message_modulus = first_ct_info.message_modulus; + let carry_modulus = first_ct_info.carry_modulus; + + let num_lwes: usize = ciphertexts + .iter() + .map(|x| x.d_blocks.lwe_ciphertext_count().0) + .sum(); + + let mut output_glwe = CudaGlweCiphertextList::new( + glwe_size.to_glwe_dimension(), + polynomial_size, + GlweCiphertextCount(ciphertexts.len()), + ciphertext_modulus, + streams, + ); + + let (input_lwes, info) = self.flatten(ciphertexts, streams, gpu_index); + + unsafe { + unchecked_compression_compress_integer_radix_async( + streams, + &mut output_glwe.0.d_vec, + &input_lwes.0.d_vec, + &self.packing_key_switching_key.d_vec, + message_modulus, + carry_modulus, + glwe_size.to_glwe_dimension(), + polynomial_size, + lwe_size.to_lwe_dimension(), + lwe_pksk.decomposition_base_log(), + lwe_pksk.decomposition_level_count(), + self.lwe_per_glwe.0 as u32, + self.storage_log_modulus.0 as u32, + num_lwes as u32, + ); + } + + (output_glwe, info) + } +} + +impl CudaDecompressionKey { + pub fn unpack( + &self, + packed_list: &(CudaGlweCiphertextList, Vec), + start_block_index: usize, + end_block_index: usize, + streams: &CudaStreams, + gpu_index: u32, + ) -> CudaRadixCiphertext { + let glwe_dimension = packed_list.0.glwe_dimension(); + let polynomial_size = packed_list.0.polynomial_size(); + let lwe_ciphertext_count = LweCiphertextCount(end_block_index - start_block_index); + //let lwe_size = self.blind_rotate_key.output_lwe_dimension().to_lwe_size(); + let lwe_size = LweSize(0); + println!("decompress lwe_size: {:?}", lwe_size); + println!( + "decompress lwe_ciphertext_count: {:?}", + lwe_ciphertext_count + ); + println!("decompress polynomial_size: {:?}", polynomial_size); + let output_lwe = CudaLweCiphertextList::new( + lwe_size.to_lwe_dimension(), + lwe_ciphertext_count, + self.ciphertext_modulus, + streams, + ); + unsafe { + // unchecked_compression_compress_integer_radix_async(streams, + // &mut output_glwe.0.d_vec, + // &input_lwes.0.d_vec, + // + // &self.packing_key_switching_key.d_vec, + // message_modulus, + // carry_modulus, + // glwe_size.to_glwe_dimension(), + // polynomial_size, + // lwe_size.to_lwe_dimension(), + // lwe_pksk.decomposition_base_log(), + // lwe_pksk.decomposition_level_count(), + // self.lwe_per_glwe.0 as u32, + // self.storage_log_modulus.0 as u32, + // num_lwes as u32, ); + } + + CudaRadixCiphertext { + d_blocks: output_lwe, + info: CudaRadixCiphertextInfo { + blocks: packed_list.1.clone(), + }, + } + } +} + +impl ClientKey { + pub fn new_cuda_compression_decompression_keys( + &self, + private_compression_key: &CompressionPrivateKeys, + streams: &CudaStreams, + ) -> (CudaCompressionKey, CudaDecompressionKey) { + let params = &private_compression_key.params; + let cks_params: ClassicPBSParameters = match 
self.parameters.pbs_parameters().unwrap() { + PBSParameters::PBS(a) => a, + PBSParameters::MultiBitPBS(_) => { + panic!("Compression is currently not compatible with Multi Bit PBS") + } + }; + + assert_eq!( + cks_params.encryption_key_choice, + EncryptionKeyChoice::Big, + "Compression is only compatible with ciphertext in post PBS dimension" + ); + + let packing_key_switching_key = ShortintEngine::with_thread_local_mut(|engine| { + allocate_and_generate_new_lwe_packing_keyswitch_key( + &self.large_lwe_secret_key(), + &private_compression_key.post_packing_ks_key, + params.packing_ks_base_log, + params.packing_ks_level, + params.packing_ks_key_noise_distribution, + self.parameters.ciphertext_modulus(), + &mut engine.encryption_generator, + ) + }); + + assert!( + private_compression_key.params.storage_log_modulus.0 + <= cks_params + .polynomial_size + .to_blind_rotation_input_modulus_log() + .0, + "Compression parameters say to store more bits than useful" + ); + + let glwe_compression_key = CompressionKey { + packing_key_switching_key, + lwe_per_glwe: params.lwe_per_glwe, + storage_log_modulus: private_compression_key.params.storage_log_modulus, + }; + + let mut engine = ShortintEngine::new(); + let h_bootstrap_key: LweBootstrapKeyOwned = + par_allocate_and_generate_new_lwe_bootstrap_key( + &private_compression_key + .post_packing_ks_key + .as_lwe_secret_key(), + &self.glwe_secret_key, + private_compression_key.params.br_base_log, + private_compression_key.params.br_level, + self.parameters.glwe_noise_distribution(), + self.parameters.ciphertext_modulus(), + &mut engine.encryption_generator, + ); + + let d_bootstrap_key = + CudaLweBootstrapKey::from_lwe_bootstrap_key(&h_bootstrap_key, streams); + + let blind_rotate_key = CudaBootstrappingKey::Classic(d_bootstrap_key); + + let cuda_glwe_decompression_key = CudaDecompressionKey { + blind_rotate_key, + lwe_per_glwe: params.lwe_per_glwe, + ciphertext_modulus: self.parameters.ciphertext_modulus(), + }; + + ( + CudaCompressionKey::from_compression_key(&glwe_compression_key, streams), + cuda_glwe_decompression_key, + ) + } +} diff --git a/tfhe/src/integer/gpu/mod.rs b/tfhe/src/integer/gpu/mod.rs index 654ddce257..4406b72a15 100644 --- a/tfhe/src/integer/gpu/mod.rs +++ b/tfhe/src/integer/gpu/mod.rs @@ -1,6 +1,6 @@ pub mod ciphertext; +pub mod list_compression; pub mod server_key; - use crate::core_crypto::gpu::slice::{CudaSlice, CudaSliceMut}; use crate::core_crypto::gpu::vec::CudaVec; use crate::core_crypto::gpu::{get_max_shared_memory, CudaStreams}; @@ -12,6 +12,7 @@ use crate::integer::{ClientKey, RadixClientKey}; use crate::shortint::{CarryModulus, MessageModulus}; pub use server_key::CudaServerKey; use std::cmp::min; + use tfhe_cuda_backend::cuda_bind::*; #[repr(u32)] @@ -32,6 +33,13 @@ pub enum PBSType { Classical = 1, } +#[allow(dead_code)] +#[repr(u32)] +pub enum CompressionType { + Compress = 0, + Decompress = 1, +} + #[repr(u32)] pub enum ShiftRotateType { LeftShift = 0, @@ -273,6 +281,157 @@ pub unsafe fn unchecked_scalar_mul_integer_radix_kb_async( + streams: &CudaStreams, + glwe_array_out: &mut CudaVec, + lwe_array_in: &CudaVec, + fp_keyswitch_key: &CudaVec, + message_modulus: MessageModulus, + carry_modulus: CarryModulus, + glwe_dimension: GlweDimension, + polynomial_size: PolynomialSize, + lwe_dimension: LweDimension, + ks_base_log: DecompositionBaseLog, + ks_level: DecompositionLevelCount, + lwe_per_glwe: u32, + storage_log_modulus: u32, + num_blocks: u32, +) { + assert_eq!( + streams.gpu_indexes[0], + lwe_array_in.gpu_index(0), + 
"GPU error: all data should reside on the same GPU." + ); + assert_eq!( + streams.gpu_indexes[0], + fp_keyswitch_key.gpu_index(0), + "GPU error: all data should reside on the same GPU." + ); + let mut mem_ptr: *mut i8 = std::ptr::null_mut(); + scratch_cuda_compression_integer_radix_ciphertext_64( + streams.ptr.as_ptr(), + streams.gpu_indexes.as_ptr(), + streams.len() as u32, + std::ptr::addr_of_mut!(mem_ptr), + glwe_dimension.0 as u32, + polynomial_size.0 as u32, + lwe_dimension.0 as u32, + ks_level.0 as u32, + ks_base_log.0 as u32, + 0 as u32, + 0 as u32, + 0 as u32, + num_blocks, + message_modulus.0 as u32, + carry_modulus.0 as u32, + PBSType::Classical as u32, + lwe_per_glwe, + storage_log_modulus, + CompressionType::Compress as u32, + true, + ); + + cuda_compression_compress_integer_radix_ciphertext_64( + streams.ptr.as_ptr(), + streams.gpu_indexes.as_ptr(), + streams.len() as u32, + glwe_array_out.as_mut_c_ptr(0), + lwe_array_in.as_c_ptr(0), + fp_keyswitch_key.ptr.as_ptr(), + num_blocks, + mem_ptr, + ); + + cleanup_cuda_compression_integer_radix_ciphertext_64( + streams.ptr.as_ptr(), + streams.gpu_indexes.as_ptr(), + streams.len() as u32, + std::ptr::addr_of_mut!(mem_ptr), + ); +} + +#[allow(clippy::too_many_arguments)] +/// # Safety +/// +/// - [CudaStreams::synchronize] __must__ be called after this function as soon as synchronization +/// is required +pub unsafe fn unchecked_compression_decompress_integer_radix_async< + T: UnsignedInteger, + B: Numeric, +>( + streams: &CudaStreams, + lwe_out: &mut CudaVec, + glwe_array_in: &CudaVec, + bootstrapping_key: &CudaVec, + message_modulus: MessageModulus, + carry_modulus: CarryModulus, + glwe_dimension: GlweDimension, + polynomial_size: PolynomialSize, + lwe_dimension: LweDimension, + pbs_base_log: DecompositionBaseLog, + pbs_level: DecompositionLevelCount, + lwe_per_glwe: u32, + storage_log_modulus: u32, + num_blocks: u32, + grouping_factor: LweBskGroupingFactor, +) { + assert_eq!( + streams.gpu_indexes[0], + glwe_array_in.gpu_index(0), + "GPU error: all data should reside on the same GPU." + ); + assert_eq!( + streams.gpu_indexes[0], + bootstrapping_key.gpu_index(0), + "GPU error: all data should reside on the same GPU." + ); + let mut mem_ptr: *mut i8 = std::ptr::null_mut(); + scratch_cuda_compression_integer_radix_ciphertext_64( + streams.ptr.as_ptr(), + streams.gpu_indexes.as_ptr(), + streams.len() as u32, + std::ptr::addr_of_mut!(mem_ptr), + glwe_dimension.0 as u32, + polynomial_size.0 as u32, + lwe_dimension.0 as u32, + 0 as u32, + 0 as u32, + pbs_level.0 as u32, + pbs_base_log.0 as u32, + grouping_factor.0 as u32, + num_blocks, + message_modulus.0 as u32, + carry_modulus.0 as u32, + PBSType::Classical as u32, + lwe_per_glwe, + storage_log_modulus, + CompressionType::Decompress as u32, + true, + ); + + cuda_compression_decompress_integer_radix_ciphertext_64( + streams.ptr.as_ptr(), + streams.gpu_indexes.as_ptr(), + streams.len() as u32, + lwe_out.as_mut_c_ptr(0), + glwe_array_in.as_c_ptr(0), + bootstrapping_key.ptr.as_ptr(), + mem_ptr, + ); + + cleanup_cuda_compression_integer_radix_ciphertext_64( + streams.ptr.as_ptr(), + streams.gpu_indexes.as_ptr(), + streams.len() as u32, + std::ptr::addr_of_mut!(mem_ptr), + ); +} + #[allow(clippy::too_many_arguments)] /// # Safety ///