diff --git a/backends/tfhe-cuda-backend/cuda/include/compression.h b/backends/tfhe-cuda-backend/cuda/include/compression.h new file mode 100644 index 0000000000..26cef85dd6 --- /dev/null +++ b/backends/tfhe-cuda-backend/cuda/include/compression.h @@ -0,0 +1,155 @@ +#ifndef CUDA_INTEGER_COMPRESSION_H +#define CUDA_INTEGER_COMPRESSION_H + +#include "integer.h" + +extern "C" { +void scratch_cuda_integer_compress_radix_ciphertext_64( + void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, int8_t **mem_ptr, + uint32_t compression_glwe_dimension, uint32_t compression_polynomial_size, + uint32_t lwe_dimension, uint32_t ks_level, uint32_t ks_base_log, + uint32_t num_lwes, uint32_t message_modulus, uint32_t carry_modulus, + PBS_TYPE pbs_type, uint32_t lwe_per_glwe, uint32_t storage_log_modulus, + bool allocate_gpu_memory); +void scratch_cuda_integer_decompress_radix_ciphertext_64( + void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, int8_t **mem_ptr, + uint32_t encryption_glwe_dimension, uint32_t encryption_polynomial_size, + uint32_t compression_glwe_dimension, uint32_t compression_polynomial_size, + uint32_t lwe_dimension, uint32_t pbs_level, uint32_t pbs_base_log, + uint32_t grouping_factor, uint32_t num_lwes, uint32_t message_modulus, + uint32_t carry_modulus, PBS_TYPE pbs_type, uint32_t lwe_per_glwe, + uint32_t storage_log_modulus, bool allocate_gpu_memory); + +void cuda_integer_compress_radix_ciphertext_64( + void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, + void *glwe_array_out, void *lwe_array_in, void **fp_ksk, uint32_t num_nths, + int8_t *mem_ptr); + +void cuda_integer_decompress_radix_ciphertext_64( + void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, + void *lwe_array_out, void *glwe_in, void *indexes_array, + uint32_t indexes_array_size, void **bsks, int8_t *mem_ptr); + +void cleanup_cuda_integer_compress_radix_ciphertext_64(void **streams, + uint32_t *gpu_indexes, + uint32_t gpu_count, + int8_t **mem_ptr_void); +void cleanup_cuda_integer_decompress_radix_ciphertext_64(void **streams, + uint32_t *gpu_indexes, + uint32_t gpu_count, + int8_t **mem_ptr_void); +} + +template struct int_compression { + int_radix_params compression_params; + uint32_t storage_log_modulus; + uint32_t lwe_per_glwe; + + uint32_t body_count; + + // Compression + int8_t *fp_ks_buffer; + Torus *tmp_lwe; + Torus *tmp_glwe_array_out; + + int_compression(cudaStream_t *streams, uint32_t *gpu_indexes, + uint32_t gpu_count, int_radix_params compression_params, + uint32_t num_radix_blocks, uint32_t lwe_per_glwe, + uint32_t storage_log_modulus, bool allocate_gpu_memory) { + this->compression_params = compression_params; + this->lwe_per_glwe = lwe_per_glwe; + this->storage_log_modulus = storage_log_modulus; + this->body_count = num_radix_blocks; + + if (allocate_gpu_memory) { + Torus glwe_accumulator_size = (compression_params.glwe_dimension + 1) * + compression_params.polynomial_size; + + tmp_lwe = (Torus *)cuda_malloc_async( + num_radix_blocks * (compression_params.small_lwe_dimension + 1) * + sizeof(Torus), + streams[0], gpu_indexes[0]); + tmp_glwe_array_out = (Torus *)cuda_malloc_async( + glwe_accumulator_size * sizeof(Torus), streams[0], gpu_indexes[0]); + + scratch_fp_keyswitch_lwe_list_to_glwe_64( + streams[0], gpu_indexes[0], &fp_ks_buffer, + compression_params.glwe_dimension, compression_params.polynomial_size, + num_radix_blocks, true); + } + } + void release(cudaStream_t *streams, uint32_t *gpu_indexes, + uint32_t gpu_count) { + cuda_drop_async(tmp_lwe, streams[0], gpu_indexes[0]); + cuda_drop_async(tmp_glwe_array_out, streams[0], gpu_indexes[0]); + cleanup_fp_keyswitch_lwe_list_to_glwe(streams[0], gpu_indexes[0], + &fp_ks_buffer); + } +}; + +template struct int_decompression { + int_radix_params encryption_params; + int_radix_params compression_params; + + uint32_t storage_log_modulus; + + uint32_t body_count; + + Torus *tmp_extracted_glwe; + Torus *tmp_extracted_lwe; + + int_radix_lut *carry_extract_lut; + + int_decompression(cudaStream_t *streams, uint32_t *gpu_indexes, + uint32_t gpu_count, int_radix_params encryption_params, + int_radix_params compression_params, + uint32_t num_radix_blocks, uint32_t lwe_per_glwe, + uint32_t storage_log_modulus, bool allocate_gpu_memory) { + this->encryption_params = encryption_params; + this->compression_params = compression_params; + this->storage_log_modulus = storage_log_modulus; + this->body_count = num_radix_blocks; + + if (allocate_gpu_memory) { + Torus glwe_accumulator_size = (compression_params.glwe_dimension + 1) * + compression_params.polynomial_size; + + carry_extract_lut = new int_radix_lut( + streams, gpu_indexes, gpu_count, encryption_params, 1, + num_radix_blocks, allocate_gpu_memory); + + tmp_extracted_glwe = (Torus *)cuda_malloc_async( + glwe_accumulator_size * sizeof(Torus), streams[0], gpu_indexes[0]); + tmp_extracted_lwe = (Torus *)cuda_malloc_async( + num_radix_blocks * + (compression_params.glwe_dimension * + compression_params.polynomial_size + + 1) * + sizeof(Torus), + streams[0], gpu_indexes[0]); + // Decompression + // Carry extract LUT + auto carry_extract_f = [encryption_params](Torus x) -> Torus { + return x / encryption_params.message_modulus; + }; + + generate_device_accumulator( + streams[0], gpu_indexes[0], + carry_extract_lut->get_lut(gpu_indexes[0], 0), + encryption_params.glwe_dimension, encryption_params.polynomial_size, + encryption_params.message_modulus, encryption_params.carry_modulus, + carry_extract_f); + + carry_extract_lut->broadcast_lut(streams, gpu_indexes, gpu_indexes[0]); + } + } + void release(cudaStream_t *streams, uint32_t *gpu_indexes, + uint32_t gpu_count) { + cuda_drop_async(tmp_extracted_glwe, streams[0], gpu_indexes[0]); + cuda_drop_async(tmp_extracted_lwe, streams[0], gpu_indexes[0]); + + carry_extract_lut->release(streams, gpu_indexes, gpu_count); + delete (carry_extract_lut); + } +}; +#endif \ No newline at end of file diff --git a/backends/tfhe-cuda-backend/cuda/include/integer.h b/backends/tfhe-cuda-backend/cuda/include/integer.h index 871829abeb..99860fd1c7 100644 --- a/backends/tfhe-cuda-backend/cuda/include/integer.h +++ b/backends/tfhe-cuda-backend/cuda/include/integer.h @@ -1,6 +1,7 @@ #ifndef CUDA_INTEGER_H #define CUDA_INTEGER_H +#include "keyswitch.h" #include "pbs/programmable_bootstrap.cuh" #include "programmable_bootstrap.h" #include "programmable_bootstrap_multibit.h" @@ -15,7 +16,6 @@ enum SHIFT_OR_ROTATE_TYPE { LEFT_ROTATE = 2, RIGHT_ROTATE = 3 }; -enum LUT_TYPE { OPERATOR = 0, MAXVALUE = 1, ISNONZERO = 2, BLOCKSLEN = 3 }; enum BITOP_TYPE { BITAND = 0, BITOR = 1, @@ -475,7 +475,8 @@ struct int_radix_params { message_modulus(message_modulus), carry_modulus(carry_modulus){}; void print() { - printf("pbs_type: %u, glwe_dimension: %u, polynomial_size: %u, " + printf("pbs_type: %u, glwe_dimension: %u, " + "polynomial_size: %u, " "big_lwe_dimension: %u, " "small_lwe_dimension: %u, ks_level: %u, ks_base_log: %u, pbs_level: " "%u, pbs_base_log: " @@ -812,7 +813,6 @@ template struct int_radix_lut { } } }; - template struct int_bit_extract_luts_buffer { int_radix_params params; int_radix_lut *lut; diff --git a/backends/tfhe-cuda-backend/cuda/include/keyswitch.h b/backends/tfhe-cuda-backend/cuda/include/keyswitch.h index 924ec131f9..a16d307cb3 100644 --- a/backends/tfhe-cuda-backend/cuda/include/keyswitch.h +++ b/backends/tfhe-cuda-backend/cuda/include/keyswitch.h @@ -16,6 +16,22 @@ void cuda_keyswitch_lwe_ciphertext_vector_64( void *lwe_output_indexes, void *lwe_array_in, void *lwe_input_indexes, void *ksk, uint32_t lwe_dimension_in, uint32_t lwe_dimension_out, uint32_t base_log, uint32_t level_count, uint32_t num_samples); + +void scratch_fp_keyswitch_lwe_list_to_glwe_64(void *stream, uint32_t gpu_index, + int8_t **fp_ks_buffer, + uint32_t glwe_dimension, + uint32_t polynomial_size, + uint32_t num_lwes, + bool allocate_gpu_memory); + +void cuda_fp_keyswitch_lwe_list_to_glwe_64( + void *stream, uint32_t gpu_index, void *glwe_array_out, void *lwe_array_in, + void *fp_ksk_array, int8_t *fp_ks_buffer, uint32_t input_lwe_dimension, + uint32_t output_glwe_dimension, uint32_t output_polynomial_size, + uint32_t base_log, uint32_t level_count, uint32_t num_lwes); + +void cleanup_fp_keyswitch_lwe_list_to_glwe(void *stream, uint32_t gpu_index, + int8_t **fp_ks_buffer); } #endif // CNCRT_KS_H_ diff --git a/backends/tfhe-cuda-backend/cuda/src/CMakeLists.txt b/backends/tfhe-cuda-backend/cuda/src/CMakeLists.txt index 3190501fc5..5fec699fbc 100644 --- a/backends/tfhe-cuda-backend/cuda/src/CMakeLists.txt +++ b/backends/tfhe-cuda-backend/cuda/src/CMakeLists.txt @@ -1,17 +1,3 @@ -set(SOURCES - ${CMAKE_SOURCE_DIR}/${INCLUDE_DIR}/bit_extraction.h - ${CMAKE_SOURCE_DIR}/${INCLUDE_DIR}/bitwise_ops.h - ${CMAKE_SOURCE_DIR}/${INCLUDE_DIR}/bootstrap.h - ${CMAKE_SOURCE_DIR}/${INCLUDE_DIR}/bootstrap_multibit.h - ${CMAKE_SOURCE_DIR}/${INCLUDE_DIR}/ciphertext.h - ${CMAKE_SOURCE_DIR}/${INCLUDE_DIR}/circuit_bootstrap.h - ${CMAKE_SOURCE_DIR}/${INCLUDE_DIR}/device.h - ${CMAKE_SOURCE_DIR}/${INCLUDE_DIR}/integer.h - ${CMAKE_SOURCE_DIR}/${INCLUDE_DIR}/keyswitch.h - ${CMAKE_SOURCE_DIR}/${INCLUDE_DIR}/linear_algebra.h - ${CMAKE_SOURCE_DIR}/${INCLUDE_DIR}/shifts.h - ${CMAKE_SOURCE_DIR}/${INCLUDE_DIR}/vertical_packing.h - ${CMAKE_SOURCE_DIR}/${INCLUDE_DIR}/helper_multi_gpu.h) file(GLOB_RECURSE SOURCES "*.cu") add_library(tfhe_cuda_backend STATIC ${SOURCES}) set_target_properties(tfhe_cuda_backend PROPERTIES CUDA_SEPARABLE_COMPILATION ON CUDA_RESOLVE_DEVICE_SYMBOLS ON) diff --git a/backends/tfhe-cuda-backend/cuda/src/crypto/ciphertext.cuh b/backends/tfhe-cuda-backend/cuda/src/crypto/ciphertext.cuh index b45353a616..9971b5c000 100644 --- a/backends/tfhe-cuda-backend/cuda/src/crypto/ciphertext.cuh +++ b/backends/tfhe-cuda-backend/cuda/src/crypto/ciphertext.cuh @@ -38,8 +38,8 @@ __global__ void sample_extract(Torus *lwe_array_out, Torus *glwe_array_in, auto lwe_out = lwe_array_out + input_id * lwe_output_size; // We assume each GLWE will store the first polynomial_size inputs - uint32_t nth_per_glwe = params::degree; - auto glwe_in = glwe_array_in + (input_id / nth_per_glwe) * glwe_input_size; + uint32_t lwe_per_glwe = params::degree; + auto glwe_in = glwe_array_in + (input_id / lwe_per_glwe) * glwe_input_size; auto nth = nth_array[input_id]; @@ -50,11 +50,11 @@ __global__ void sample_extract(Torus *lwe_array_out, Torus *glwe_array_in, template __host__ void host_sample_extract(cudaStream_t stream, uint32_t gpu_index, Torus *lwe_array_out, Torus *glwe_array_in, - uint32_t *nth_array, uint32_t num_glwes, + uint32_t *nth_array, uint32_t num_nths, uint32_t glwe_dimension) { cudaSetDevice(gpu_index); - dim3 grid(num_glwes); + dim3 grid(num_nths); dim3 thds(params::degree / params::opt); sample_extract<<>>( lwe_array_out, glwe_array_in, nth_array, glwe_dimension); diff --git a/backends/tfhe-cuda-backend/cuda/src/crypto/keyswitch.cu b/backends/tfhe-cuda-backend/cuda/src/crypto/keyswitch.cu index 524a1fa45e..f267ca71bf 100644 --- a/backends/tfhe-cuda-backend/cuda/src/crypto/keyswitch.cu +++ b/backends/tfhe-cuda-backend/cuda/src/crypto/keyswitch.cu @@ -10,7 +10,7 @@ void cuda_keyswitch_lwe_ciphertext_vector_32( void *lwe_output_indexes, void *lwe_array_in, void *lwe_input_indexes, void *ksk, uint32_t lwe_dimension_in, uint32_t lwe_dimension_out, uint32_t base_log, uint32_t level_count, uint32_t num_samples) { - cuda_keyswitch_lwe_ciphertext_vector( + host_keyswitch_lwe_ciphertext_vector( static_cast(stream), gpu_index, static_cast(lwe_array_out), static_cast(lwe_output_indexes), @@ -40,7 +40,7 @@ void cuda_keyswitch_lwe_ciphertext_vector_64( void *lwe_output_indexes, void *lwe_array_in, void *lwe_input_indexes, void *ksk, uint32_t lwe_dimension_in, uint32_t lwe_dimension_out, uint32_t base_log, uint32_t level_count, uint32_t num_samples) { - cuda_keyswitch_lwe_ciphertext_vector( + host_keyswitch_lwe_ciphertext_vector( static_cast(stream), gpu_index, static_cast(lwe_array_out), static_cast(lwe_output_indexes), @@ -48,3 +48,36 @@ void cuda_keyswitch_lwe_ciphertext_vector_64( static_cast(lwe_input_indexes), static_cast(ksk), lwe_dimension_in, lwe_dimension_out, base_log, level_count, num_samples); } + +void scratch_fp_keyswitch_lwe_list_to_glwe_64(void *stream, uint32_t gpu_index, + int8_t **fp_ks_buffer, + uint32_t glwe_dimension, + uint32_t polynomial_size, + uint32_t num_lwes, + bool allocate_gpu_memory) { + scratch_fp_keyswitch_lwe_list_to_glwe( + static_cast(stream), gpu_index, fp_ks_buffer, + glwe_dimension, polynomial_size, num_lwes, allocate_gpu_memory); +} +/* Perform functional packing keyswitch on a batch of 64 bits input LWE + * ciphertexts. + */ +void cuda_fp_keyswitch_lwe_list_to_glwe_64( + void *stream, uint32_t gpu_index, void *glwe_array_out, void *lwe_array_in, + void *fp_ksk_array, int8_t *fp_ks_buffer, uint32_t input_lwe_dimension, + uint32_t output_glwe_dimension, uint32_t output_polynomial_size, + uint32_t base_log, uint32_t level_count, uint32_t num_lwes) { + + host_fp_keyswitch_lwe_list_to_glwe( + static_cast(stream), gpu_index, + static_cast(glwe_array_out), + static_cast(lwe_array_in), + static_cast(fp_ksk_array), fp_ks_buffer, input_lwe_dimension, + output_glwe_dimension, output_polynomial_size, base_log, level_count, + num_lwes); +} + +void cleanup_fp_keyswitch_lwe_list_to_glwe(void *stream, uint32_t gpu_index, + int8_t **fp_ks_buffer) { + cuda_drop_async(*fp_ks_buffer, static_cast(stream), gpu_index); +} diff --git a/backends/tfhe-cuda-backend/cuda/src/crypto/keyswitch.cuh b/backends/tfhe-cuda-backend/cuda/src/crypto/keyswitch.cuh index db78104a98..91f714893e 100644 --- a/backends/tfhe-cuda-backend/cuda/src/crypto/keyswitch.cuh +++ b/backends/tfhe-cuda-backend/cuda/src/crypto/keyswitch.cuh @@ -7,6 +7,7 @@ #include "polynomial/functions.cuh" #include "polynomial/polynomial_math.cuh" #include "torus.cuh" +#include "utils/helper.cuh" #include "utils/kernel_dimensions.cuh" #include #include @@ -98,7 +99,7 @@ keyswitch(Torus *lwe_array_out, const Torus *__restrict__ lwe_output_indexes, } template -__host__ void cuda_keyswitch_lwe_ciphertext_vector( +__host__ void host_keyswitch_lwe_ciphertext_vector( cudaStream_t stream, uint32_t gpu_index, Torus *lwe_array_out, Torus *lwe_output_indexes, Torus *lwe_array_in, Torus *lwe_input_indexes, Torus *ksk, uint32_t lwe_dimension_in, uint32_t lwe_dimension_out, @@ -146,7 +147,7 @@ void execute_keyswitch_async(cudaStream_t *streams, uint32_t *gpu_indexes, GET_VARIANT_ELEMENT(lwe_input_indexes, i); // Compute Keyswitch - cuda_keyswitch_lwe_ciphertext_vector( + host_keyswitch_lwe_ciphertext_vector( streams[i], gpu_indexes[i], current_lwe_array_out, current_lwe_output_indexes, current_lwe_array_in, current_lwe_input_indexes, ksks[i], lwe_dimension_in, lwe_dimension_out, @@ -154,4 +155,170 @@ void execute_keyswitch_async(cudaStream_t *streams, uint32_t *gpu_indexes, } } +template +__host__ void scratch_fp_keyswitch_lwe_list_to_glwe( + cudaStream_t stream, uint32_t gpu_index, int8_t **fp_ks_buffer, + uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t num_lwes, + bool allocate_gpu_memory) { + cudaSetDevice(gpu_index); + + int glwe_accumulator_size = (glwe_dimension + 1) * polynomial_size; + + if (allocate_gpu_memory) + *fp_ks_buffer = (int8_t *)cuda_malloc_async( + 2 * num_lwes * glwe_accumulator_size * sizeof(Torus), stream, + gpu_index); +} + +// public functional packing keyswitch for a single LWE ciphertext +// +// Assumes there are (glwe_dimension+1) * polynomial_size threads split through +// different thread blocks at the x-axis to work on that input. +template +__device__ void fp_keyswitch_lwe_ciphertext_into_glwe_ciphertext( + Torus *glwe_out, Torus *lwe_in, Torus *fp_ksk, uint32_t lwe_dimension_in, + uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log, + uint32_t level_count) { + + extern __shared__ int8_t sharedmem[]; + + // result accumulator, shared memory is used because of frequent access + Torus *local_glwe_chunk = (Torus *)sharedmem; + + const int tid = threadIdx.x + blockIdx.x * blockDim.x; + size_t glwe_size = (glwe_dimension + 1); + + if (tid < glwe_size * polynomial_size) { + const int shmem_index = threadIdx.x; + // the output_glwe is split in polynomials and each x-block takes one of + // them + size_t poly_id = blockIdx.x; + size_t coef_per_block = blockDim.x; + + // number of coefficients inside fp-ksk block for each lwe_input coefficient + size_t ksk_block_size = glwe_size * polynomial_size * level_count; + + // initialize accumulator to 0 + local_glwe_chunk[shmem_index] = SEL( + 0, lwe_in[lwe_dimension_in], tid == glwe_dimension * polynomial_size); + + // Iterate through all lwe elements + for (int i = 0; i < lwe_dimension_in; i++) { + // Round and prepare decomposition + Torus a_i = round_to_closest_multiple(lwe_in[i], base_log, level_count); + + Torus state = a_i >> (sizeof(Torus) * 8 - base_log * level_count); + Torus mod_b_mask = (1ll << base_log) - 1ll; + + // block of key for current lwe coefficient (cur_input_lwe[i]) + auto ksk_block = &fp_ksk[i * ksk_block_size]; + for (int j = 0; j < level_count; j++) { + auto ksk_glwe = &ksk_block[j * glwe_size * polynomial_size]; + // Iterate through each level and multiply by the ksk piece + auto ksk_glwe_chunk = &ksk_glwe[poly_id * coef_per_block]; + Torus decomposed = decompose_one(state, mod_b_mask, base_log); + local_glwe_chunk[shmem_index] -= + decomposed * ksk_glwe_chunk[shmem_index]; + } + } + + // Persist + glwe_out[tid] = local_glwe_chunk[shmem_index]; + } +} + +// public functional packing keyswitch for a batch of LWE ciphertexts +// +// Selects the input each thread is working on using the y-block index. +// +// Assumes there are (glwe_dimension+1) * polynomial_size threads split through +// different thread blocks at the x-axis to work on that input. +template +__global__ void +fp_keyswitch_lwe_list_to_glwe(Torus *glwe_array_out, Torus *lwe_array_in, + Torus *fp_ksk, uint32_t lwe_dimension_in, + uint32_t glwe_dimension, uint32_t polynomial_size, + uint32_t base_log, uint32_t level_count, + Torus *d_mem) { + const int tid = threadIdx.x + blockIdx.x * blockDim.x; + + const int glwe_accumulator_size = (glwe_dimension + 1) * polynomial_size; + const int lwe_size = (lwe_dimension_in + 1); + + const int input_id = blockIdx.y; + const int degree = input_id; + + // Select an input + auto lwe_in = lwe_array_in + input_id * lwe_size; + auto ks_glwe_out = d_mem + input_id * glwe_accumulator_size; + auto glwe_out = glwe_array_out + input_id * glwe_accumulator_size; + // KS LWE to GLWE + fp_keyswitch_lwe_ciphertext_into_glwe_ciphertext( + ks_glwe_out, lwe_in, fp_ksk, lwe_dimension_in, glwe_dimension, + polynomial_size, base_log, level_count); + + // P * x ^degree + auto in_poly = ks_glwe_out + (tid / polynomial_size) * polynomial_size; + auto out_result = glwe_out + (tid / polynomial_size) * polynomial_size; + polynomial_accumulate_monic_monomial_mul(out_result, in_poly, degree, + tid % polynomial_size, + polynomial_size, 1, true); +} + +/// To-do: Rewrite this kernel for efficiency +template +__global__ void accumulate_glwes(Torus *glwe_out, Torus *glwe_array_in, + uint32_t glwe_dimension, + uint32_t polynomial_size, uint32_t num_lwes) { + const int tid = threadIdx.x + blockIdx.x * blockDim.x; + if (tid < (glwe_dimension + 1) * polynomial_size) { + extern __shared__ int8_t sharedmem[]; + const int shmem_index = threadIdx.x; + + Torus *glwe_acc = (Torus *)sharedmem; + glwe_acc[shmem_index] = glwe_array_in[tid]; + + // Accumulate + for (int i = 1; i < num_lwes; i++) { + auto glwe_in = glwe_array_in + i * (glwe_dimension + 1) * polynomial_size; + glwe_acc[shmem_index] += glwe_in[tid]; + } + + glwe_out[tid] = glwe_acc[shmem_index]; + } +} + +template +__host__ void host_fp_keyswitch_lwe_list_to_glwe( + cudaStream_t stream, uint32_t gpu_index, Torus *glwe_out, + Torus *lwe_array_in, Torus *fp_ksk_array, int8_t *fp_ks_buffer, + uint32_t lwe_dimension_in, uint32_t glwe_dimension, + uint32_t polynomial_size, uint32_t base_log, uint32_t level_count, + uint32_t num_lwes) { + cudaSetDevice(gpu_index); + int glwe_accumulator_size = (glwe_dimension + 1) * polynomial_size; + + int num_blocks = 0, num_threads = 0; + getNumBlocksAndThreads(glwe_accumulator_size, 512, num_blocks, num_threads); + + auto shared_mem = sizeof(Torus) * num_threads; + dim3 grid(num_blocks, num_lwes); + dim3 threads(num_threads); + + auto d_mem = (Torus *)fp_ks_buffer; + auto d_tmp_glwe_array_out = d_mem + num_lwes * glwe_accumulator_size; + + // individually keyswitch each lwe + fp_keyswitch_lwe_list_to_glwe<<>>( + d_tmp_glwe_array_out, lwe_array_in, fp_ksk_array, lwe_dimension_in, + glwe_dimension, polynomial_size, base_log, level_count, d_mem); + check_cuda_error(cudaGetLastError()); + + // accumulate to a single glwe + accumulate_glwes<<>>( + glwe_out, d_tmp_glwe_array_out, glwe_dimension, polynomial_size, + num_lwes); + check_cuda_error(cudaGetLastError()); +} + #endif diff --git a/backends/tfhe-cuda-backend/cuda/src/crypto/torus.cuh b/backends/tfhe-cuda-backend/cuda/src/crypto/torus.cuh index 177892e5a9..4a5cd8fb66 100644 --- a/backends/tfhe-cuda-backend/cuda/src/crypto/torus.cuh +++ b/backends/tfhe-cuda-backend/cuda/src/crypto/torus.cuh @@ -2,6 +2,7 @@ #define CNCRT_TORUS_CUH #include "types/int128.cuh" +#include "utils/kernel_dimensions.cuh" #include template @@ -29,20 +30,18 @@ __device__ inline void typecast_double_to_torus(double x, template __device__ inline T round_to_closest_multiple(T x, uint32_t base_log, uint32_t level_count) { - T shift = sizeof(T) * 8 - level_count * base_log; - T mask = 1ll << (shift - 1); - T b = (x & mask) >> (shift - 1); + const T non_rep_bit_count = sizeof(T) * 8 - level_count * base_log; + const T shift = non_rep_bit_count - 1; T res = x >> shift; - res += b; - res <<= shift; - return res; + res += 1; + res &= (T)(-2); + return res << shift; } template __device__ __forceinline__ void modulus_switch(T input, T &output, uint32_t log_modulus) { constexpr uint32_t BITS = sizeof(T) * 8; - output = input + (((T)1) << (BITS - log_modulus - 1)); output >>= (BITS - log_modulus); } @@ -54,4 +53,27 @@ __device__ __forceinline__ T modulus_switch(T input, uint32_t log_modulus) { return output; } +template +__global__ void modulus_switch_inplace(Torus *array, int size, + uint32_t log_modulus) { + const int tid = threadIdx.x + blockIdx.x * blockDim.x; + if (tid < size) { + array[tid] = modulus_switch(array[tid], log_modulus); + } +} + +template +__host__ void host_modulus_switch_inplace(cudaStream_t stream, + uint32_t gpu_index, Torus *array, + int size, uint32_t log_modulus) { + cudaSetDevice(gpu_index); + + int num_threads = 0, num_blocks = 0; + getNumBlocksAndThreads(size, 1024, num_blocks, num_threads); + + modulus_switch_inplace<<>>(array, size, + log_modulus); + check_cuda_error(cudaGetLastError()); +} + #endif // CNCRT_TORUS_H diff --git a/backends/tfhe-cuda-backend/cuda/src/integer/compression/compression.cu b/backends/tfhe-cuda-backend/cuda/src/integer/compression/compression.cu new file mode 100644 index 0000000000..4792c3197f --- /dev/null +++ b/backends/tfhe-cuda-backend/cuda/src/integer/compression/compression.cu @@ -0,0 +1,89 @@ +#include "compression.cuh" + +void scratch_cuda_integer_compress_radix_ciphertext_64( + void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, int8_t **mem_ptr, + uint32_t compression_glwe_dimension, uint32_t compression_polynomial_size, + uint32_t lwe_dimension, uint32_t ks_level, uint32_t ks_base_log, + uint32_t num_lwes, uint32_t message_modulus, uint32_t carry_modulus, + PBS_TYPE pbs_type, uint32_t lwe_per_glwe, uint32_t storage_log_modulus, + bool allocate_gpu_memory) { + + int_radix_params compression_params( + pbs_type, compression_glwe_dimension, compression_polynomial_size, + (compression_glwe_dimension + 1) * compression_polynomial_size, + lwe_dimension, ks_level, ks_base_log, 0, 0, 0, message_modulus, + carry_modulus); + + scratch_cuda_compress_integer_radix_ciphertext_64( + (cudaStream_t *)(streams), gpu_indexes, gpu_count, + (int_compression **)mem_ptr, num_lwes, compression_params, + lwe_per_glwe, storage_log_modulus, allocate_gpu_memory); +} +void scratch_cuda_integer_decompress_radix_ciphertext_64( + void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, int8_t **mem_ptr, + uint32_t encryption_glwe_dimension, uint32_t encryption_polynomial_size, + uint32_t compression_glwe_dimension, uint32_t compression_polynomial_size, + uint32_t lwe_dimension, uint32_t pbs_level, uint32_t pbs_base_log, + uint32_t grouping_factor, uint32_t num_lwes, uint32_t message_modulus, + uint32_t carry_modulus, PBS_TYPE pbs_type, uint32_t lwe_per_glwe, + uint32_t storage_log_modulus, bool allocate_gpu_memory) { + + int_radix_params encryption_params( + pbs_type, encryption_glwe_dimension, encryption_polynomial_size, + (encryption_glwe_dimension + 1) * encryption_polynomial_size, + lwe_dimension, 0, 0, pbs_level, pbs_base_log, grouping_factor, + message_modulus, carry_modulus); + + int_radix_params compression_params( + pbs_type, compression_glwe_dimension, compression_polynomial_size, + (compression_glwe_dimension + 1) * compression_polynomial_size, + lwe_dimension, 0, 0, pbs_level, pbs_base_log, grouping_factor, + message_modulus, carry_modulus); + + scratch_cuda_integer_decompress_radix_ciphertext_64( + (cudaStream_t *)(streams), gpu_indexes, gpu_count, + (int_decompression **)mem_ptr, num_lwes, encryption_params, + compression_params, lwe_per_glwe, storage_log_modulus, + allocate_gpu_memory); +} +void cuda_integer_compress_radix_ciphertext_64( + void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, + void *glwe_array_out, void *lwe_array_in, void **fp_ksk, uint32_t num_nths, + int8_t *mem_ptr) { + + host_integer_compress( + (cudaStream_t *)(streams), gpu_indexes, gpu_count, + static_cast(glwe_array_out), + static_cast(lwe_array_in), (uint64_t **)(fp_ksk), num_nths, + (int_compression *)mem_ptr); +} +void cuda_integer_decompress_radix_ciphertext_64( + void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, + void *lwe_array_out, void *glwe_in, void *indexes_array, + uint32_t indexes_array_size, void **bsks, int8_t *mem_ptr) { + + host_integer_decompress( + (cudaStream_t *)(streams), gpu_indexes, gpu_count, + static_cast(lwe_array_out), static_cast(glwe_in), + static_cast(indexes_array), indexes_array_size, bsks, + (int_decompression *)mem_ptr); +} + +void cleanup_cuda_integer_compress_radix_ciphertext_64(void **streams, + uint32_t *gpu_indexes, + uint32_t gpu_count, + int8_t **mem_ptr_void) { + + int_compression *mem_ptr = + (int_compression *)(*mem_ptr_void); + mem_ptr->release((cudaStream_t *)(streams), gpu_indexes, gpu_count); +} + +void cleanup_cuda_integer_decompress_radix_ciphertext_64( + void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, + int8_t **mem_ptr_void) { + + int_decompression *mem_ptr = + (int_decompression *)(*mem_ptr_void); + mem_ptr->release((cudaStream_t *)(streams), gpu_indexes, gpu_count); +} diff --git a/backends/tfhe-cuda-backend/cuda/src/integer/compression/compression.cuh b/backends/tfhe-cuda-backend/cuda/src/integer/compression/compression.cuh new file mode 100644 index 0000000000..932539dd8e --- /dev/null +++ b/backends/tfhe-cuda-backend/cuda/src/integer/compression/compression.cuh @@ -0,0 +1,239 @@ +#ifndef CUDA_INTEGER_COMPRESSION_CUH +#define CUDA_INTEGER_COMPRESSION_CUH + +#include "ciphertext.h" +#include "compression.h" +#include "crypto/keyswitch.cuh" +#include "device.h" +#include "integer/integer.cuh" +#include "linearalgebra/multiplication.cuh" +#include "polynomial/functions.cuh" +#include "utils/kernel_dimensions.cuh" + +template +__global__ void pack(Torus *array_out, Torus *array_in, uint32_t log_modulus, + uint32_t in_len, uint32_t len) { + auto nbits = sizeof(Torus) * 8; + + auto i = threadIdx.x + blockIdx.x * blockDim.x; + if (i < len) { + auto k = nbits * i / log_modulus; + auto j = k; + + auto start_shift = i * nbits - j * log_modulus; + + auto value = array_in[j] >> start_shift; + j++; + + while (j * log_modulus < ((i + 1) * nbits) && j < in_len) { + auto shift = j * log_modulus - i * nbits; + value |= array_in[j] << shift; + j++; + } + + array_out[i] = value; + } +} + +template +__host__ void host_pack(cudaStream_t stream, uint32_t gpu_index, + Torus *array_out, Torus *array_in, uint32_t num_inputs, + uint32_t body_count, int_compression *mem_ptr) { + cudaSetDevice(gpu_index); + auto params = mem_ptr->compression_params; + + auto log_modulus = mem_ptr->storage_log_modulus; + auto in_len = params.glwe_dimension * params.polynomial_size + body_count; + auto number_bits_to_pack = in_len * log_modulus; + + auto nbits = sizeof(Torus) * 8; + // number_bits_to_pack.div_ceil(Scalar::BITS) + auto len = (number_bits_to_pack + nbits - 1) / nbits; + + int num_blocks = 0, num_threads = 0; + getNumBlocksAndThreads(len, 1024, num_blocks, num_threads); + + dim3 grid(num_blocks); + dim3 threads(num_threads); + pack<<>>(array_out, array_in, log_modulus, in_len, + len); +} + +template +__host__ void host_integer_compress(cudaStream_t *streams, + uint32_t *gpu_indexes, uint32_t gpu_count, + Torus *glwe_array_out, Torus *lwe_array_in, + Torus **fp_ksk, uint32_t num_lwes, + int_compression *mem_ptr) { + + auto compression_params = mem_ptr->compression_params; + auto input_lwe_dimension = compression_params.small_lwe_dimension; + + // Shift + auto lwe_shifted = mem_ptr->tmp_lwe; + host_cleartext_multiplication(streams[0], gpu_indexes[0], lwe_shifted, + lwe_array_in, + (uint64_t)compression_params.message_modulus, + input_lwe_dimension, num_lwes); + + uint32_t lwe_in_size = input_lwe_dimension + 1; + uint32_t glwe_out_size = (compression_params.glwe_dimension + 1) * + compression_params.polynomial_size; + uint32_t num_glwes = num_lwes / mem_ptr->lwe_per_glwe + 1; + + // Keyswitch LWEs to GLWE + auto tmp_glwe_array_out = mem_ptr->tmp_glwe_array_out; + auto fp_ks_buffer = mem_ptr->fp_ks_buffer; + for (int i = 0; i < num_glwes; i++) { + auto lwe_subset = lwe_shifted + i * lwe_in_size; + auto glwe_out = tmp_glwe_array_out + i * glwe_out_size; + + host_fp_keyswitch_lwe_list_to_glwe( + streams[0], gpu_indexes[0], glwe_out, lwe_subset, fp_ksk[0], + fp_ks_buffer, input_lwe_dimension, compression_params.glwe_dimension, + compression_params.polynomial_size, compression_params.ks_base_log, + compression_params.ks_level, min(num_lwes, mem_ptr->lwe_per_glwe)); + } + + auto body_count = min(num_lwes, mem_ptr->lwe_per_glwe); + + // Modulus switch + host_modulus_switch_inplace(streams[0], gpu_indexes[0], tmp_glwe_array_out, + num_glwes * + (compression_params.glwe_dimension * + compression_params.polynomial_size + + body_count), + mem_ptr->storage_log_modulus); + check_cuda_error(cudaGetLastError()); + + host_pack(streams[0], gpu_indexes[0], glwe_array_out, tmp_glwe_array_out, + num_glwes, body_count, mem_ptr); +} + +template +__global__ void extract(Torus *glwe_array_out, Torus *array_in, uint32_t index, + uint32_t log_modulus, uint32_t initial_out_len) { + auto nbits = sizeof(Torus) * 8; + + auto i = threadIdx.x + blockIdx.x * blockDim.x; + + if (i < initial_out_len) { + // Unpack + Torus mask = ((Torus)1 << log_modulus) - 1; + auto start = i * log_modulus; + auto end = (i + 1) * log_modulus; + + auto start_block = start / nbits; + auto start_remainder = start % nbits; + + auto end_block_inclusive = (end - 1) / nbits; + + Torus unpacked_i; + if (start_block == end_block_inclusive) { + auto single_part = array_in[start_block] >> start_remainder; + unpacked_i = single_part & mask; + } else { + auto first_part = array_in[start_block] >> start_remainder; + auto second_part = array_in[start_block + 1] << (nbits - start_remainder); + + unpacked_i = (first_part | second_part) & mask; + } + + // Extract + glwe_array_out[i] = unpacked_i << (nbits - log_modulus); + } +} + +template +__host__ void host_extract(cudaStream_t stream, uint32_t gpu_index, + Torus *glwe_array_out, Torus *array_in, + uint32_t glwe_index, + int_decompression *mem_ptr) { + cudaSetDevice(gpu_index); + + auto params = mem_ptr->compression_params; + + auto log_modulus = mem_ptr->storage_log_modulus; + + uint32_t body_count = mem_ptr->body_count; + auto initial_out_len = + params.glwe_dimension * params.polynomial_size + body_count * body_count; + + // We assure the tail of the glwe is zeroed + auto zeroed_slice = + glwe_array_out + params.glwe_dimension * params.polynomial_size; + cuda_memset_async(zeroed_slice, 0, params.polynomial_size * sizeof(Torus), + stream, gpu_index); + + int num_blocks = 0, num_threads = 0; + getNumBlocksAndThreads(initial_out_len, 512, num_blocks, num_threads); + dim3 grid(num_blocks); + dim3 threads(num_threads); + extract<<>>(glwe_array_out, array_in, glwe_index, + log_modulus, initial_out_len); + check_cuda_error(cudaGetLastError()); +} + +template +__host__ void +host_integer_decompress(cudaStream_t *streams, uint32_t *gpu_indexes, + uint32_t gpu_count, Torus *lwe_array_out, + Torus *packed_glwe_in, uint32_t *indexes_array, + uint32_t indexes_array_size, void **bsks, + int_decompression *mem_ptr) { + + auto extracted_glwe = mem_ptr->tmp_extracted_glwe; + auto compression_params = mem_ptr->compression_params; + host_extract(streams[0], gpu_indexes[0], extracted_glwe, packed_glwe_in, 0, + mem_ptr); + + auto num_lwes = mem_ptr->body_count; + + // Sample extract + auto extracted_lwe = mem_ptr->tmp_extracted_lwe; + cuda_glwe_sample_extract_64(streams[0], gpu_indexes[0], extracted_lwe, + extracted_glwe, indexes_array, indexes_array_size, + compression_params.glwe_dimension, + compression_params.polynomial_size); + + /// Apply PBS to apply a LUT, reduce the noise and go from a small LWE + /// dimension to a big LWE dimension + auto encryption_params = mem_ptr->encryption_params; + auto carry_extract_lut = mem_ptr->carry_extract_lut; + execute_pbs_async( + streams, gpu_indexes, gpu_count, lwe_array_out, + carry_extract_lut->lwe_indexes_out, carry_extract_lut->lut_vec, + carry_extract_lut->lut_indexes_vec, extracted_lwe, + carry_extract_lut->lwe_indexes_in, bsks, carry_extract_lut->buffer, + encryption_params.glwe_dimension, + compression_params.glwe_dimension * compression_params.polynomial_size, + encryption_params.polynomial_size, encryption_params.pbs_base_log, + encryption_params.pbs_level, encryption_params.grouping_factor, num_lwes, + encryption_params.pbs_type); +} + +template +__host__ void scratch_cuda_compress_integer_radix_ciphertext_64( + cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count, + int_compression **mem_ptr, uint32_t num_lwes, + int_radix_params compression_params, uint32_t lwe_per_glwe, + uint32_t storage_log_modulus, bool allocate_gpu_memory) { + + *mem_ptr = new int_compression( + streams, gpu_indexes, gpu_count, compression_params, num_lwes, + lwe_per_glwe, storage_log_modulus, allocate_gpu_memory); +} + +template +__host__ void scratch_cuda_integer_decompress_radix_ciphertext_64( + cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count, + int_decompression **mem_ptr, uint32_t num_lwes, + int_radix_params encryption_params, int_radix_params compression_params, + uint32_t lwe_per_glwe, uint32_t storage_log_modulus, + bool allocate_gpu_memory) { + + *mem_ptr = new int_decompression( + streams, gpu_indexes, gpu_count, encryption_params, compression_params, + num_lwes, lwe_per_glwe, storage_log_modulus, allocate_gpu_memory); +} +#endif diff --git a/backends/tfhe-cuda-backend/cuda/src/integer/integer.cuh b/backends/tfhe-cuda-backend/cuda/src/integer/integer.cuh index e935d8768d..cd7640d00b 100644 --- a/backends/tfhe-cuda-backend/cuda/src/integer/integer.cuh +++ b/backends/tfhe-cuda-backend/cuda/src/integer/integer.cuh @@ -765,7 +765,7 @@ __global__ void device_pack_blocks(Torus *lwe_array_out, Torus *lwe_array_in, } if (num_radix_blocks % 2 == 1) { - // We couldn't pack the last block, so we just copy it + // We couldn't host_pack the last block, so we just copy it Torus *lsb_block = lwe_array_in + (num_radix_blocks - 1) * (lwe_dimension + 1); Torus *last_block = diff --git a/backends/tfhe-cuda-backend/cuda/src/integer/shift_and_rotate.cuh b/backends/tfhe-cuda-backend/cuda/src/integer/shift_and_rotate.cuh index e595781256..801c0f131a 100644 --- a/backends/tfhe-cuda-backend/cuda/src/integer/shift_and_rotate.cuh +++ b/backends/tfhe-cuda-backend/cuda/src/integer/shift_and_rotate.cuh @@ -133,7 +133,7 @@ __host__ void host_integer_radix_shift_and_rotate_kb_inplace( PANIC("Unknown operation") } - // pack bits into one block so that we have + // host_pack bits into one block so that we have // control_bit|b|a cuda_memset_async(mux_inputs, 0, total_nb_bits * big_lwe_size_bytes, streams[0], gpu_indexes[0]); // Do we need this? diff --git a/backends/tfhe-cuda-backend/cuda/src/linearalgebra/multiplication.cu b/backends/tfhe-cuda-backend/cuda/src/linearalgebra/multiplication.cu index 2a936d7cdf..a64c15378d 100644 --- a/backends/tfhe-cuda-backend/cuda/src/linearalgebra/multiplication.cu +++ b/backends/tfhe-cuda-backend/cuda/src/linearalgebra/multiplication.cu @@ -9,12 +9,12 @@ void cuda_mult_lwe_ciphertext_vector_cleartext_vector_32( void *cleartext_array_in, uint32_t input_lwe_dimension, uint32_t input_lwe_ciphertext_count) { - host_cleartext_multiplication(static_cast(stream), gpu_index, - static_cast(lwe_array_out), - static_cast(lwe_array_in), - static_cast(cleartext_array_in), - input_lwe_dimension, - input_lwe_ciphertext_count); + host_cleartext_vec_multiplication( + static_cast(stream), gpu_index, + static_cast(lwe_array_out), + static_cast(lwe_array_in), + static_cast(cleartext_array_in), input_lwe_dimension, + input_lwe_ciphertext_count); } /* * Perform the multiplication of a u64 input LWE ciphertext vector with a u64 @@ -49,10 +49,10 @@ void cuda_mult_lwe_ciphertext_vector_cleartext_vector_64( void *cleartext_array_in, uint32_t input_lwe_dimension, uint32_t input_lwe_ciphertext_count) { - host_cleartext_multiplication(static_cast(stream), gpu_index, - static_cast(lwe_array_out), - static_cast(lwe_array_in), - static_cast(cleartext_array_in), - input_lwe_dimension, - input_lwe_ciphertext_count); + host_cleartext_vec_multiplication( + static_cast(stream), gpu_index, + static_cast(lwe_array_out), + static_cast(lwe_array_in), + static_cast(cleartext_array_in), input_lwe_dimension, + input_lwe_ciphertext_count); } diff --git a/backends/tfhe-cuda-backend/cuda/src/linearalgebra/multiplication.cuh b/backends/tfhe-cuda-backend/cuda/src/linearalgebra/multiplication.cuh index 9fd9be9419..b9864973a0 100644 --- a/backends/tfhe-cuda-backend/cuda/src/linearalgebra/multiplication.cuh +++ b/backends/tfhe-cuda-backend/cuda/src/linearalgebra/multiplication.cuh @@ -14,9 +14,10 @@ #include template -__global__ void -cleartext_multiplication(T *output, T *lwe_input, T *cleartext_input, - uint32_t input_lwe_dimension, uint32_t num_entries) { +__global__ void cleartext_vec_multiplication(T *output, T *lwe_input, + T *cleartext_input, + uint32_t input_lwe_dimension, + uint32_t num_entries) { int tid = threadIdx.x; int index = blockIdx.x * blockDim.x + tid; @@ -27,10 +28,46 @@ cleartext_multiplication(T *output, T *lwe_input, T *cleartext_input, } } +template +__host__ void +host_cleartext_vec_multiplication(cudaStream_t stream, uint32_t gpu_index, + T *output, T *lwe_input, T *cleartext_input, + uint32_t input_lwe_dimension, + uint32_t input_lwe_ciphertext_count) { + + cudaSetDevice(gpu_index); + // lwe_size includes the presence of the body + // whereas lwe_dimension is the number of elements in the mask + int lwe_size = input_lwe_dimension + 1; + // Create a 1-dimensional grid of threads + int num_blocks = 0, num_threads = 0; + int num_entries = input_lwe_ciphertext_count * lwe_size; + getNumBlocksAndThreads(num_entries, 512, num_blocks, num_threads); + dim3 grid(num_blocks, 1, 1); + dim3 thds(num_threads, 1, 1); + + cleartext_vec_multiplication<<>>( + output, lwe_input, cleartext_input, input_lwe_dimension, num_entries); + check_cuda_error(cudaGetLastError()); +} + +template +__global__ void +cleartext_multiplication(T *output, T *lwe_input, T cleartext_input, + uint32_t input_lwe_dimension, uint32_t num_entries) { + + int tid = threadIdx.x; + int index = blockIdx.x * blockDim.x + tid; + if (index < num_entries) { + // Here we take advantage of the wrapping behaviour of uint + output[index] = lwe_input[index] * cleartext_input; + } +} + template __host__ void host_cleartext_multiplication(cudaStream_t stream, uint32_t gpu_index, - T *output, T *lwe_input, T *cleartext_input, + T *output, T *lwe_input, T cleartext_input, uint32_t input_lwe_dimension, uint32_t input_lwe_ciphertext_count) { diff --git a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_multibit.cuh b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_multibit.cuh index 776880a550..b5f5e43f1e 100644 --- a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_multibit.cuh +++ b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_multibit.cuh @@ -102,8 +102,9 @@ __global__ void device_multi_bit_programmable_bootstrap_keybundle( synchronize_threads_in_block(); // Multiply by the bsk element - polynomial_product_accumulate_by_monomial( - accumulator, bsk_poly, monomial_degree, false); + polynomial_accumulate_monic_monomial_mul( + accumulator, bsk_poly, monomial_degree, threadIdx.x, params::degree, + params::opt, false); } synchronize_threads_in_block(); diff --git a/backends/tfhe-cuda-backend/cuda/src/polynomial/polynomial_math.cuh b/backends/tfhe-cuda-backend/cuda/src/polynomial/polynomial_math.cuh index a6d7901f20..37a1135346 100644 --- a/backends/tfhe-cuda-backend/cuda/src/polynomial/polynomial_math.cuh +++ b/backends/tfhe-cuda-backend/cuda/src/polynomial/polynomial_math.cuh @@ -55,21 +55,22 @@ __device__ void polynomial_product_accumulate_in_fourier_domain( } } -// If init_accumulator is set, assumes that result was not initialized and does -// that with the outcome of first * second -template -__device__ void -polynomial_product_accumulate_by_monomial(T *result, const T *__restrict__ poly, - uint64_t monomial_degree, - bool init_accumulator = false) { - // monomial_degree \in [0, 2 * params::degree) - int full_cycles_count = monomial_degree / params::degree; - int remainder_degrees = monomial_degree % params::degree; +// This method expects to work with polynomial_size / compression_params::opt +// threads in the x-block If init_accumulator is set, assumes that result was +// not initialized and does that with the outcome of first * second +template +__device__ void polynomial_accumulate_monic_monomial_mul( + T *result, const T *__restrict__ poly, uint64_t monomial_degree, + uint32_t tid, uint32_t polynomial_size, int coeff_per_thread, + bool init_accumulator = false) { + // monomial_degree \in [0, 2 * compression_params::degree) + int full_cycles_count = monomial_degree / polynomial_size; + int remainder_degrees = monomial_degree % polynomial_size; - int pos = threadIdx.x; - for (int i = 0; i < params::opt; i++) { + int pos = tid; + for (int i = 0; i < coeff_per_thread; i++) { T element = poly[pos]; - int new_pos = (pos + monomial_degree) % params::degree; + int new_pos = (pos + monomial_degree) % polynomial_size; T x = SEL(element, -element, full_cycles_count % 2); // monomial coefficient x = SEL(-x, x, new_pos >= remainder_degrees); @@ -78,7 +79,7 @@ polynomial_product_accumulate_by_monomial(T *result, const T *__restrict__ poly, result[new_pos] = x; else result[new_pos] += x; - pos += params::degree / params::opt; + pos += polynomial_size / coeff_per_thread; } } diff --git a/backends/tfhe-cuda-backend/src/cuda_bind.rs b/backends/tfhe-cuda-backend/src/cuda_bind.rs index 11f7c30632..33454af75c 100644 --- a/backends/tfhe-cuda-backend/src/cuda_bind.rs +++ b/backends/tfhe-cuda-backend/src/cuda_bind.rs @@ -311,6 +311,40 @@ extern "C" { num_samples: u32, ); + /// This scratch function allocates the necessary amount of data on the GPU for + /// the public function packing keyswitch implementation on 64-bit + pub fn scratch_fp_keyswitch_lwe_list_to_glwe_64( + stream: *mut c_void, + gpu_index: u32, + fp_ks_buffer: *mut *mut i8, + glwe_dimension: u32, + polynomial_size: u32, + input_lwe_ciphertext_count: u32, + allocate_gpu_memory: bool, + ); + + /// Perform public functional packing keyswitch on a vector of 64-bit LWE ciphertexts + pub fn cuda_fp_keyswitch_lwe_list_to_glwe_64( + stream: *mut c_void, + gpu_index: u32, + glwe_array_out: *mut c_void, + lwe_array_in: *const c_void, + fp_ksk_array: *const c_void, + fp_ks_buffer: *mut i8, + input_lwe_dimension: u32, + output_glwe_dimension: u32, + polynomial_size: u32, + base_log: u32, + level_count: u32, + num_lwes: u32, + ); + + pub fn cleanup_fp_keyswitch_lwe_list_to_glwe( + stream: *mut c_void, + gpu_index: u32, + fp_ks_buffer: *mut *mut i8, + ); + /// Perform the negation of a u64 input LWE ciphertext vector. /// - `v_stream` is a void pointer to the Cuda stream to be used in the kernel launch /// - `gpu_index` is the index of the GPU to be used in the kernel launch @@ -484,6 +518,82 @@ extern "C" { mem_ptr: *mut *mut i8, ); + pub fn scratch_cuda_integer_compress_radix_ciphertext_64( + streams: *const *mut c_void, + gpu_indexes: *const u32, + gpu_count: u32, + mem_ptr: *mut *mut i8, + compression_glwe_dimension: u32, + compression_polynomial_size: u32, + lwe_dimension: u32, + ks_level: u32, + ks_base_log: u32, + num_lwes: u32, + message_modulus: u32, + carry_modulus: u32, + pbs_type: u32, + lwe_per_glwe: u32, + storage_log_modulus: u32, + allocate_gpu_memory: bool, + ); + pub fn scratch_cuda_integer_decompress_radix_ciphertext_64( + streams: *const *mut c_void, + gpu_indexes: *const u32, + gpu_count: u32, + mem_ptr: *mut *mut i8, + encryption_glwe_dimension: u32, + encryption_polynomial_size: u32, + compression_glwe_dimension: u32, + compression_polynomial_size: u32, + lwe_dimension: u32, + pbs_level: u32, + pbs_base_log: u32, + grouping_factor: u32, + num_lwes: u32, + message_modulus: u32, + carry_modulus: u32, + pbs_type: u32, + lwe_per_glwe: u32, + storage_log_modulus: u32, + allocate_gpu_memory: bool, + ); + + pub fn cuda_integer_compress_radix_ciphertext_64( + streams: *const *mut c_void, + gpu_indexes: *const u32, + gpu_count: u32, + glwe_array_out: *mut c_void, + lwe_array_in: *const c_void, + fp_ksk: *const *mut c_void, + num_lwes: u32, + mem_ptr: *mut i8, + ); + + pub fn cuda_integer_decompress_radix_ciphertext_64( + streams: *const *mut c_void, + gpu_indexes: *const u32, + gpu_count: u32, + lwe_out: *mut c_void, + glwe_array_in: *const c_void, + indexes_array: *const c_void, + indexes_array_size: u32, + bsks: *const *mut c_void, + mem_ptr: *mut i8, + ); + + pub fn cleanup_cuda_integer_compress_radix_ciphertext_64( + streams: *const *mut c_void, + gpu_indexes: *const u32, + gpu_count: u32, + mem_ptr: *mut *mut i8, + ); + pub fn cleanup_cuda_integer_decompress_radix_ciphertext_64( + streams: *const *mut c_void, + gpu_indexes: *const u32, + gpu_count: u32, + mem_ptr: *mut *mut i8, + ); + pub fn cuda_scalar_addition_integer_radix_ciphertext_64_inplace( streams: *const *mut c_void, gpu_indexes: *const u32, diff --git a/tfhe/benches/integer/bench.rs b/tfhe/benches/integer/bench.rs index 6b8b7c1799..749a4a43eb 100644 --- a/tfhe/benches/integer/bench.rs +++ b/tfhe/benches/integer/bench.rs @@ -1126,11 +1126,15 @@ define_server_key_bench_default_fn!( #[cfg(feature = "gpu")] mod cuda { use super::*; - use criterion::criterion_group; + use criterion::{black_box, criterion_group}; use tfhe::core_crypto::gpu::CudaStreams; use tfhe::integer::gpu::ciphertext::boolean_value::CudaBooleanBlock; - use tfhe::integer::gpu::ciphertext::CudaUnsignedRadixCiphertext; + use tfhe::integer::gpu::ciphertext::compressed_ciphertext_list::CudaCompressedCiphertextListBuilder; + use tfhe::integer::gpu::ciphertext::{CudaRadixCiphertext, CudaUnsignedRadixCiphertext}; + use tfhe::integer::gpu::gen_keys_radix_gpu; use tfhe::integer::gpu::server_key::CudaServerKey; + use tfhe::shortint::parameters::list_compression::COMP_PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M64; + use tfhe::shortint::prelude::PARAM_MESSAGE_2_CARRY_2_KS_PBS; fn bench_cuda_server_key_unary_function_clean_inputs( c: &mut Criterion, @@ -1400,6 +1404,121 @@ mod cuda { bench_group.finish() } + fn cuda_compress(c: &mut Criterion) { + let bench_name = "integer::cuda::compression"; + let mut bench_group = c.benchmark_group(bench_name); + bench_group + .sample_size(15) + .measurement_time(std::time::Duration::from_secs(30)); + + let stream = CudaStreams::new_multi_gpu(); + + let param = PARAM_MESSAGE_2_CARRY_2_KS_PBS; + let comp_param = COMP_PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M64; + + let log_message_modulus = param.message_modulus.0.ilog2() as usize; + + for num_bits in [ + 8, + 16, + 32, + 64, + 128, + 256, + comp_param.lwe_per_glwe.0 * log_message_modulus, + ] { + assert_eq!(num_bits % log_message_modulus, 0); + let num_blocks = num_bits / log_message_modulus; + + // Generate private compression key + let (cks, _) = KEY_CACHE.get_from_params(param, IntegerKeyKind::Radix); + let private_compression_key = cks.new_compression_private_key(comp_param); + + // Generate and convert compression keys + let (radix_cks, _) = gen_keys_radix_gpu(param, num_blocks, &stream); + let (compressed_compression_key, _) = + radix_cks.new_compressed_compression_decompression_keys(&private_compression_key); + let cuda_compression_key = compressed_compression_key.decompress_to_cuda(&stream); + + // Encrypt + let ct = cks.encrypt_radix(0_u32, num_blocks); + let d_ct = CudaUnsignedRadixCiphertext::from_radix_ciphertext(&ct, &stream); + + // Benchmark + let mut builder = CudaCompressedCiphertextListBuilder::new(); + + builder.push(d_ct, &stream); + + bench_group.bench_function(format!("compress_u{num_bits}"), |b| { + b.iter(|| { + let compressed = builder.build(&cuda_compression_key, &stream); + + _ = black_box(compressed); + }) + }); + } + } + + fn cuda_decompress(c: &mut Criterion) { + let bench_name = "integer::cuda::compression"; + let mut bench_group = c.benchmark_group(bench_name); + bench_group + .sample_size(15) + .measurement_time(std::time::Duration::from_secs(30)); + + let stream = CudaStreams::new_multi_gpu(); + + let param = PARAM_MESSAGE_2_CARRY_2_KS_PBS; + let comp_param = COMP_PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M64; + + let log_message_modulus = param.message_modulus.0.ilog2() as usize; + + for num_bits in [ + 8, + 16, + 32, + 64, + 128, + 256, + comp_param.lwe_per_glwe.0 * log_message_modulus, + ] { + assert_eq!(num_bits % log_message_modulus, 0); + let num_blocks = num_bits / log_message_modulus; + + // Generate private compression key + let (cks, _) = KEY_CACHE.get_from_params(param, IntegerKeyKind::Radix); + let private_compression_key = cks.new_compression_private_key(comp_param); + + // Generate and convert compression keys + let (radix_cks, _) = gen_keys_radix_gpu(param, num_blocks, &stream); + let (compressed_compression_key, compressed_decompression_key) = + radix_cks.new_compressed_compression_decompression_keys(&private_compression_key); + let cuda_compression_key = compressed_compression_key.decompress_to_cuda(&stream); + let cuda_decompression_key = + compressed_decompression_key.decompress_to_cuda(radix_cks.parameters(), &stream); + + // Encrypt + let ct = cks.encrypt_radix(0_u32, num_blocks); + let d_ct = CudaUnsignedRadixCiphertext::from_radix_ciphertext(&ct, &stream); + + // Benchmark + let mut builder = CudaCompressedCiphertextListBuilder::new(); + + builder.push(d_ct, &stream); + + let compressed = builder.build(&cuda_compression_key, &stream); + + bench_group.bench_function(format!("decompress_u{num_bits}"), |b| { + b.iter(|| { + let unpacked: CudaRadixCiphertext = + compressed.get(0, &cuda_decompression_key, &stream); + + _ = black_box(unpacked); + }) + }); + } + } + macro_rules! define_cuda_server_key_bench_clean_input_unary_fn ( (method_name: $server_key_method:ident, display_name:$name:ident) => { ::paste::paste!{ @@ -2052,6 +2171,8 @@ mod cuda { cuda_unsigned_overflowing_scalar_add, ); + criterion_group!(cuda_compress_ops, cuda_compress, cuda_decompress); + fn cuda_bench_server_key_cast_function( c: &mut Criterion, bench_name: &str, @@ -2142,8 +2263,8 @@ mod cuda { #[cfg(feature = "gpu")] use cuda::{ - cuda_cast_ops, default_cuda_dedup_ops, default_cuda_ops, default_scalar_cuda_ops, - unchecked_cuda_ops, unchecked_scalar_cuda_ops, + cuda_cast_ops, cuda_compress_ops, default_cuda_dedup_ops, default_cuda_ops, + default_scalar_cuda_ops, unchecked_cuda_ops, unchecked_scalar_cuda_ops, }; criterion_group!( @@ -2493,6 +2614,7 @@ criterion_group!(oprf, oprf::unsigned_oprf); #[cfg(feature = "gpu")] fn go_through_gpu_bench_groups(val: &str) { + cuda_compress_ops(); match val.to_lowercase().as_str() { "default" => { default_cuda_ops(); diff --git a/tfhe/benches/integer/glwe_packing_compression.rs b/tfhe/benches/integer/glwe_packing_compression.rs index ee3ffec935..16673ef1c7 100644 --- a/tfhe/benches/integer/glwe_packing_compression.rs +++ b/tfhe/benches/integer/glwe_packing_compression.rs @@ -4,7 +4,7 @@ use tfhe::integer::{ClientKey, RadixCiphertext}; use tfhe::shortint::parameters::list_compression::COMP_PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M64; use tfhe::shortint::parameters::PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M64; -fn glwe_packing(c: &mut Criterion) { +fn cpu_glwe_packing(c: &mut Criterion) { let param = PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M64; let comp_param = COMP_PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M64; @@ -29,7 +29,6 @@ fn glwe_packing(c: &mut Criterion) { 64, 128, 256, - 256, comp_param.lwe_per_glwe.0 * log_message_modulus, ] { assert_eq!(num_bits % log_message_modulus, 0); @@ -73,9 +72,10 @@ fn glwe_packing(c: &mut Criterion) { } } -criterion_group!(glwe_packing2, glwe_packing); +criterion_group!(cpu_glwe_packing2, cpu_glwe_packing); fn main() { - glwe_packing2(); + cpu_glwe_packing2(); + Criterion::default().configure_from_args().final_summary(); } diff --git a/tfhe/src/core_crypto/gpu/algorithms/lwe_packing_keyswitch.rs b/tfhe/src/core_crypto/gpu/algorithms/lwe_packing_keyswitch.rs new file mode 100644 index 0000000000..29cc3d68b1 --- /dev/null +++ b/tfhe/src/core_crypto/gpu/algorithms/lwe_packing_keyswitch.rs @@ -0,0 +1,36 @@ +use crate::core_crypto::gpu::glwe_ciphertext_list::CudaGlweCiphertextList; +use crate::core_crypto::gpu::lwe_ciphertext_list::CudaLweCiphertextList; +use crate::core_crypto::gpu::lwe_packing_keyswitch_key::CudaLwePackingKeyswitchKey; +use crate::core_crypto::gpu::{packing_keyswitch_list_async, CudaStreams}; +use crate::core_crypto::prelude::{CastInto, UnsignedTorus}; + +/// # Safety +/// +/// - `stream` __must__ be synchronized to guarantee computation has finished, and inputs must not +/// be dropped until stream is synchronised +pub unsafe fn cuda_keyswitch_lwe_ciphertext_list_into_glwe_ciphertext_async( + lwe_pksk: &CudaLwePackingKeyswitchKey, + input_lwe_ciphertext_list: &CudaLweCiphertextList, + output_glwe_ciphertext: &mut CudaGlweCiphertextList, + streams: &CudaStreams, +) where + // CastInto required for PBS modulus switch which returns a usize + Scalar: UnsignedTorus + CastInto, +{ + let input_lwe_dimension = input_lwe_ciphertext_list.lwe_dimension(); + let output_glwe_dimension = output_glwe_ciphertext.glwe_dimension(); + let output_polynomial_size = output_glwe_ciphertext.polynomial_size(); + + packing_keyswitch_list_async( + streams, + &mut output_glwe_ciphertext.0.d_vec, + &input_lwe_ciphertext_list.0.d_vec, + input_lwe_dimension, + output_glwe_dimension, + output_polynomial_size, + &lwe_pksk.d_vec, + lwe_pksk.decomposition_base_log(), + lwe_pksk.decomposition_level_count(), + input_lwe_ciphertext_list.lwe_ciphertext_count(), + ); +} diff --git a/tfhe/src/core_crypto/gpu/algorithms/mod.rs b/tfhe/src/core_crypto/gpu/algorithms/mod.rs index ee6ddd2b1d..489bbb1565 100644 --- a/tfhe/src/core_crypto/gpu/algorithms/mod.rs +++ b/tfhe/src/core_crypto/gpu/algorithms/mod.rs @@ -1,13 +1,15 @@ +pub mod glwe_sample_extraction; +pub mod lwe_keyswitch; pub mod lwe_linear_algebra; pub mod lwe_multi_bit_programmable_bootstrapping; +pub mod lwe_packing_keyswitch; pub mod lwe_programmable_bootstrapping; -pub mod glwe_sample_extraction; -mod lwe_keyswitch; #[cfg(test)] mod test; pub use lwe_keyswitch::*; pub use lwe_linear_algebra::*; pub use lwe_multi_bit_programmable_bootstrapping::*; +pub use lwe_packing_keyswitch::*; pub use lwe_programmable_bootstrapping::*; diff --git a/tfhe/src/core_crypto/gpu/algorithms/test/lwe_packing_keyswitch.rs b/tfhe/src/core_crypto/gpu/algorithms/test/lwe_packing_keyswitch.rs new file mode 100644 index 0000000000..1ac6735de6 --- /dev/null +++ b/tfhe/src/core_crypto/gpu/algorithms/test/lwe_packing_keyswitch.rs @@ -0,0 +1,230 @@ +use super::*; +use crate::core_crypto::gpu::algorithms::lwe_packing_keyswitch::cuda_keyswitch_lwe_ciphertext_list_into_glwe_ciphertext_async; +use crate::core_crypto::gpu::glwe_ciphertext_list::CudaGlweCiphertextList; +use crate::core_crypto::gpu::lwe_ciphertext_list::CudaLweCiphertextList; +use crate::core_crypto::gpu::CudaStreams; +use serde::de::DeserializeOwned; +use serde::Serialize; + +const NB_TESTS: usize = 10; +fn generate_keys( + params: PackingKeySwitchTestParams, + streams: &CudaStreams, + rsc: &mut TestResources, +) -> CudaPackingKeySwitchKeys { + let lwe_sk = allocate_and_generate_new_binary_lwe_secret_key( + params.lwe_dimension, + &mut rsc.secret_random_generator, + ); + + let glwe_sk = allocate_and_generate_new_binary_glwe_secret_key( + params.glwe_dimension, + params.polynomial_size, + &mut rsc.secret_random_generator, + ); + + let pksk = allocate_and_generate_new_lwe_packing_keyswitch_key( + &lwe_sk, + &glwe_sk, + params.pbs_base_log, + params.pbs_level, + params.glwe_noise_distribution, + params.ciphertext_modulus, + &mut rsc.encryption_random_generator, + ); + + assert!(check_encrypted_content_respects_mod( + &pksk, + params.ciphertext_modulus + )); + + let cuda_pksk = CudaLwePackingKeyswitchKey::from_lwe_packing_keyswitch_key(&pksk, streams); + + CudaPackingKeySwitchKeys { + lwe_sk, + glwe_sk, + pksk: cuda_pksk, + } +} + +fn lwe_encrypt_pks_to_glwe_decrypt_custom_mod(params: P) +where + Scalar: UnsignedTorus + CastInto + Serialize + DeserializeOwned, + P: Into>, + PackingKeySwitchTestParams: KeyCacheAccess>, +{ + let params = params.into(); + + let lwe_noise_distribution = params.lwe_noise_distribution; + let ciphertext_modulus = params.ciphertext_modulus; + let message_modulus_log = params.message_modulus_log; + let encoding_with_padding = get_encoding_with_padding(ciphertext_modulus); + + let mut rsc = TestResources::new(); + + let msg_modulus = Scalar::ONE.shl(message_modulus_log.0); + let mut msg = msg_modulus; + let delta: Scalar = encoding_with_padding / msg_modulus; + + let gpu_index = 0; + let stream = CudaStreams::new_single_gpu(gpu_index); + + while msg != Scalar::ZERO { + msg = msg.wrapping_sub(Scalar::ONE); + for _ in 0..NB_TESTS { + let keys = generate_keys(params, &stream, &mut rsc); + let (pksk, lwe_sk, glwe_sk) = (keys.pksk, keys.lwe_sk, keys.glwe_sk); + + let plaintext = Plaintext(msg * delta); + + let input_lwe = allocate_and_encrypt_new_lwe_ciphertext( + &lwe_sk, + plaintext, + lwe_noise_distribution, + ciphertext_modulus, + &mut rsc.encryption_random_generator, + ); + + let d_input_lwe = CudaLweCiphertextList::from_lwe_ciphertext(&input_lwe, &stream); + + assert!(check_encrypted_content_respects_mod( + &input_lwe, + ciphertext_modulus + )); + + let mut d_output_glwe = CudaGlweCiphertextList::new( + glwe_sk.glwe_dimension(), + glwe_sk.polynomial_size(), + GlweCiphertextCount(1), + ciphertext_modulus, + &stream, + ); + + unsafe { + cuda_keyswitch_lwe_ciphertext_list_into_glwe_ciphertext_async( + &pksk, + &d_input_lwe, + &mut d_output_glwe, + &stream, + ); + } + let output_glwe_list = d_output_glwe.to_glwe_ciphertext_list(&stream); + let mut decrypted_plaintext_list = PlaintextList::new( + Scalar::ZERO, + PlaintextCount(output_glwe_list.polynomial_size().0), + ); + + decrypt_glwe_ciphertext_list( + &glwe_sk, + &output_glwe_list, + &mut decrypted_plaintext_list, + ); + let decoded = round_decode(*decrypted_plaintext_list.get(0).0, delta) % msg_modulus; + + assert_eq!(msg, decoded); + } + + // In coverage, we break after one while loop iteration, changing message values does not + // yield higher coverage + #[cfg(tarpaulin)] + break; + } +} + +fn lwe_list_encrypt_pks_to_glwe_decrypt_custom_mod(params: P) +where + Scalar: UnsignedTorus + CastInto + Serialize + DeserializeOwned, + P: Into>, + PackingKeySwitchTestParams: KeyCacheAccess>, +{ + let params = params.into(); + + let lwe_noise_distribution = params.lwe_noise_distribution; + let ciphertext_modulus = params.ciphertext_modulus; + let message_modulus_log = params.message_modulus_log; + let encoding_with_padding = get_encoding_with_padding(ciphertext_modulus); + + let mut rsc = TestResources::new(); + + let msg_modulus = Scalar::ONE.shl(message_modulus_log.0); + let mut msg = msg_modulus; + let delta: Scalar = encoding_with_padding / msg_modulus; + + let gpu_index = 0; + let stream = CudaStreams::new_single_gpu(gpu_index); + + // while msg != Scalar::ZERO { + msg = msg.wrapping_sub(Scalar::ONE); + for _ in 0..NB_TESTS { + let keys = generate_keys(params, &stream, &mut rsc); + let (pksk, lwe_sk, glwe_sk) = (keys.pksk, keys.lwe_sk, keys.glwe_sk); + + let mut input_lwe_list = LweCiphertextList::new( + Scalar::ZERO, + lwe_sk.lwe_dimension().to_lwe_size(), + LweCiphertextCount(glwe_sk.polynomial_size().0), + ciphertext_modulus, + ); + + let mut input_plaintext_list = + PlaintextList::new(msg * delta, PlaintextCount(glwe_sk.polynomial_size().0)); + + encrypt_lwe_ciphertext_list( + &lwe_sk, + &mut input_lwe_list, + &input_plaintext_list, + lwe_noise_distribution, + &mut rsc.encryption_random_generator, + ); + + let d_input_lwe_list = + CudaLweCiphertextList::from_lwe_ciphertext_list(&input_lwe_list, &stream); + + assert!(check_encrypted_content_respects_mod( + &input_lwe_list, + ciphertext_modulus + )); + + let mut d_output_glwe = CudaGlweCiphertextList::new( + glwe_sk.glwe_dimension(), + glwe_sk.polynomial_size(), + GlweCiphertextCount(1), + ciphertext_modulus, + &stream, + ); + + unsafe { + cuda_keyswitch_lwe_ciphertext_list_into_glwe_ciphertext_async( + &pksk, + &d_input_lwe_list, + &mut d_output_glwe, + &stream, + ); + } + + let output_glwe_list = d_output_glwe.to_glwe_ciphertext_list(&stream); + + let mut decrypted_plaintext_list = PlaintextList::new( + Scalar::ZERO, + PlaintextCount(output_glwe_list.polynomial_size().0), + ); + + decrypt_glwe_ciphertext_list(&glwe_sk, &output_glwe_list, &mut decrypted_plaintext_list); + + decrypted_plaintext_list + .iter_mut() + .for_each(|x| *x.0 = round_decode(*x.0, delta) % msg_modulus); + input_plaintext_list.iter_mut().for_each(|x| *x.0 /= delta); + + assert_eq!(decrypted_plaintext_list, input_plaintext_list); + } + + // In coverage, we break after one while loop iteration, changing message values does not + // yield higher coverage + #[cfg(tarpaulin)] + break; + // } +} + +create_gpu_parametrized_test!(lwe_encrypt_pks_to_glwe_decrypt_custom_mod); +create_gpu_parametrized_test!(lwe_list_encrypt_pks_to_glwe_decrypt_custom_mod); diff --git a/tfhe/src/core_crypto/gpu/algorithms/test/mod.rs b/tfhe/src/core_crypto/gpu/algorithms/test/mod.rs index 59ba20af18..779a419702 100644 --- a/tfhe/src/core_crypto/gpu/algorithms/test/mod.rs +++ b/tfhe/src/core_crypto/gpu/algorithms/test/mod.rs @@ -4,8 +4,15 @@ mod glwe_sample_extraction; mod lwe_keyswitch; mod lwe_linear_algebra; mod lwe_multi_bit_programmable_bootstrapping; +mod lwe_packing_keyswitch; mod lwe_programmable_bootstrapping; +pub struct CudaPackingKeySwitchKeys { + pub lwe_sk: LweSecretKey>, + pub glwe_sk: GlweSecretKey>, + pub pksk: CudaLwePackingKeyswitchKey, +} + // Macro to generate tests for all parameter sets macro_rules! create_gpu_parametrized_test{ ($name:ident { $($param:ident),* }) => { @@ -47,4 +54,5 @@ macro_rules! create_gpu_multi_bit_parametrized_test{ }; } +use crate::core_crypto::gpu::lwe_packing_keyswitch_key::CudaLwePackingKeyswitchKey; use {create_gpu_multi_bit_parametrized_test, create_gpu_parametrized_test}; diff --git a/tfhe/src/core_crypto/gpu/entities/lwe_ciphertext_list.rs b/tfhe/src/core_crypto/gpu/entities/lwe_ciphertext_list.rs index 9defa579f6..536ab06a98 100644 --- a/tfhe/src/core_crypto/gpu/entities/lwe_ciphertext_list.rs +++ b/tfhe/src/core_crypto/gpu/entities/lwe_ciphertext_list.rs @@ -199,51 +199,6 @@ impl CudaLweCiphertextList { LweCiphertext::from_container(container, self.ciphertext_modulus()) } - /// ```rust - /// use tfhe::core_crypto::gpu::lwe_ciphertext_list::CudaLweCiphertextList; - /// use tfhe::core_crypto::gpu::CudaStreams; - /// use tfhe::core_crypto::prelude::{ - /// CiphertextModulus, LweCiphertextCount, LweCiphertextList, LweSize, - /// }; - /// - /// let mut streams = CudaStreams::new_single_gpu(0); - /// - /// let lwe_size = LweSize(743); - /// let ciphertext_modulus = CiphertextModulus::new_native(); - /// let lwe_ciphertext_count = LweCiphertextCount(2); - /// - /// // Create a new LweCiphertextList - /// let lwe_list = LweCiphertextList::new(0u64, lwe_size, lwe_ciphertext_count, ciphertext_modulus); - /// - /// // Copy to GPU - /// let d_lwe_list = CudaLweCiphertextList::from_lwe_ciphertext_list(&lwe_list, &mut streams); - /// let d_lwe_list_copied = d_lwe_list.duplicate(&mut streams); - /// - /// let lwe_list_copied = d_lwe_list_copied.to_lwe_ciphertext_list(&mut streams); - /// - /// assert_eq!(lwe_list, lwe_list_copied); - /// ``` - pub fn duplicate(&self, streams: &CudaStreams) -> Self { - let lwe_dimension = self.lwe_dimension(); - let lwe_ciphertext_count = self.lwe_ciphertext_count(); - let ciphertext_modulus = self.ciphertext_modulus(); - - // Copy to the GPU - let mut d_vec = CudaVec::new(self.0.d_vec.len(), streams, 0); - unsafe { - d_vec.copy_from_gpu_async(&self.0.d_vec, streams, 0); - } - streams.synchronize(); - - let cuda_lwe_list = CudaLweList { - d_vec, - lwe_ciphertext_count, - lwe_dimension, - ciphertext_modulus, - }; - Self(cuda_lwe_list) - } - pub(crate) fn lwe_dimension(&self) -> LweDimension { self.0.lwe_dimension } diff --git a/tfhe/src/core_crypto/gpu/entities/lwe_packing_keyswitch_key.rs b/tfhe/src/core_crypto/gpu/entities/lwe_packing_keyswitch_key.rs new file mode 100644 index 0000000000..0c6664aa59 --- /dev/null +++ b/tfhe/src/core_crypto/gpu/entities/lwe_packing_keyswitch_key.rs @@ -0,0 +1,84 @@ +use crate::core_crypto::gpu::vec::CudaVec; +use crate::core_crypto::gpu::{convert_lwe_keyswitch_key_async, CudaStreams}; +use crate::core_crypto::prelude::{ + lwe_packing_keyswitch_key_input_key_element_encrypted_size, CiphertextModulus, + DecompositionBaseLog, DecompositionLevelCount, GlweSize, LweDimension, + LwePackingKeyswitchKeyOwned, PolynomialSize, UnsignedInteger, +}; + +#[derive(Debug)] +pub struct CudaLwePackingKeyswitchKey { + pub(crate) d_vec: CudaVec, + decomp_base_log: DecompositionBaseLog, + decomp_level_count: DecompositionLevelCount, + output_glwe_size: GlweSize, + output_polynomial_size: PolynomialSize, + ciphertext_modulus: CiphertextModulus, +} + +impl CudaLwePackingKeyswitchKey { + pub fn from_lwe_packing_keyswitch_key( + h_ksk: &LwePackingKeyswitchKeyOwned, + streams: &CudaStreams, + ) -> Self { + let decomp_base_log = h_ksk.decomposition_base_log(); + let decomp_level_count = h_ksk.decomposition_level_count(); + let input_lwe_size = h_ksk.input_key_lwe_dimension().to_lwe_size(); + let output_glwe_size = h_ksk.output_key_glwe_dimension().to_glwe_size(); + let output_polynomial_size = h_ksk.output_polynomial_size(); + let ciphertext_modulus = h_ksk.ciphertext_modulus(); + + // Allocate memory + let mut d_vec = CudaVec::::new_multi_gpu( + input_lwe_size.to_lwe_dimension().0 + * lwe_packing_keyswitch_key_input_key_element_encrypted_size( + decomp_level_count, + output_glwe_size, + output_polynomial_size, + ), + streams, + ); + + unsafe { + convert_lwe_keyswitch_key_async(streams, &mut d_vec, h_ksk.as_ref()); + } + + streams.synchronize(); + + Self { + d_vec, + decomp_base_log, + decomp_level_count, + output_glwe_size, + output_polynomial_size, + ciphertext_modulus, + } + } + + pub(crate) fn decomposition_base_log(&self) -> DecompositionBaseLog { + self.decomp_base_log + } + pub(crate) fn decomposition_level_count(&self) -> DecompositionLevelCount { + self.decomp_level_count + } + + pub(crate) fn output_glwe_size(&self) -> GlweSize { + self.output_glwe_size + } + pub(crate) fn ciphertext_modulus(&self) -> CiphertextModulus { + self.ciphertext_modulus + } + pub(crate) fn output_polynomial_size(&self) -> PolynomialSize { + self.output_polynomial_size + } + pub fn input_key_lwe_dimension(&self) -> LweDimension { + LweDimension( + self.d_vec.len + / lwe_packing_keyswitch_key_input_key_element_encrypted_size( + self.decomp_level_count, + self.output_glwe_size, + self.output_polynomial_size, + ), + ) + } +} diff --git a/tfhe/src/core_crypto/gpu/entities/mod.rs b/tfhe/src/core_crypto/gpu/entities/mod.rs index 34dcb5f03e..623bf42497 100644 --- a/tfhe/src/core_crypto/gpu/entities/mod.rs +++ b/tfhe/src/core_crypto/gpu/entities/mod.rs @@ -3,3 +3,4 @@ pub mod lwe_bootstrap_key; pub mod lwe_ciphertext_list; pub mod lwe_keyswitch_key; pub mod lwe_multi_bit_bootstrap_key; +pub mod lwe_packing_keyswitch_key; diff --git a/tfhe/src/core_crypto/gpu/mod.rs b/tfhe/src/core_crypto/gpu/mod.rs index 3341ba6e5e..a44d68b0df 100644 --- a/tfhe/src/core_crypto/gpu/mod.rs +++ b/tfhe/src/core_crypto/gpu/mod.rs @@ -13,7 +13,6 @@ pub use algorithms::*; pub use entities::*; use std::ffi::c_void; pub(crate) use tfhe_cuda_backend::cuda_bind::*; - #[derive(Debug)] pub struct CudaStreams { pub ptr: Vec<*mut c_void>, @@ -260,6 +259,56 @@ pub unsafe fn convert_lwe_keyswitch_key_async( dest.copy_from_cpu_multi_gpu_async(src, streams); } +/// Discarding packing keyswitch on a vector of LWE ciphertexts +/// +/// # Safety +/// +/// [CudaStreams::synchronize] __must__ be called as soon as synchronization is +/// required +#[allow(clippy::too_many_arguments)] +pub unsafe fn packing_keyswitch_list_async( + streams: &CudaStreams, + glwe_array_out: &mut CudaVec, + lwe_array_in: &CudaVec, + input_lwe_dimension: LweDimension, + output_glwe_dimension: GlweDimension, + output_polynomial_size: PolynomialSize, + fp_keyswitch_key: &CudaVec, + base_log: DecompositionBaseLog, + l_gadget: DecompositionLevelCount, + num_lwes: LweCiphertextCount, +) { + let mut fp_ks_buffer: *mut i8 = std::ptr::null_mut(); + scratch_fp_keyswitch_lwe_list_to_glwe_64( + streams.ptr[0], + streams.gpu_indexes[0], + std::ptr::addr_of_mut!(fp_ks_buffer), + output_glwe_dimension.0 as u32, + output_polynomial_size.0 as u32, + num_lwes.0 as u32, + true, + ); + cuda_fp_keyswitch_lwe_list_to_glwe_64( + streams.ptr[0], + streams.gpu_indexes[0], + glwe_array_out.as_mut_c_ptr(0), + lwe_array_in.as_c_ptr(0), + fp_keyswitch_key.as_c_ptr(0), + fp_ks_buffer, + input_lwe_dimension.0 as u32, + output_glwe_dimension.0 as u32, + output_polynomial_size.0 as u32, + base_log.0 as u32, + l_gadget.0 as u32, + num_lwes.0 as u32, + ); + cleanup_fp_keyswitch_lwe_list_to_glwe( + streams.ptr[0], + streams.gpu_indexes[0], + std::ptr::addr_of_mut!(fp_ks_buffer), + ); +} + /// Convert programmable bootstrap key /// /// # Safety diff --git a/tfhe/src/integer/client_key/radix.rs b/tfhe/src/integer/client_key/radix.rs index 36333ddbc2..c68f175a38 100644 --- a/tfhe/src/integer/client_key/radix.rs +++ b/tfhe/src/integer/client_key/radix.rs @@ -6,6 +6,9 @@ use crate::integer::backward_compatibility::client_key::RadixClientKeyVersions; use crate::integer::block_decomposition::{DecomposableInto, RecomposableFrom}; use crate::integer::ciphertext::{RadixCiphertext, SignedRadixCiphertext}; use crate::integer::BooleanBlock; +use crate::shortint::list_compression::{ + CompressedCompressionKey, CompressedDecompressionKey, CompressionPrivateKeys, +}; use crate::shortint::{Ciphertext as ShortintCiphertext, PBSParameters as ShortintParameters}; use serde::{Deserialize, Serialize}; use tfhe_versionable::Versionize; @@ -131,6 +134,16 @@ impl RadixClientKey { pub fn num_blocks(&self) -> usize { self.num_blocks } + + //Todo Can we remove this? + pub fn new_compressed_compression_decompression_keys( + &self, + private_compression_key: &CompressionPrivateKeys, + ) -> (CompressedCompressionKey, CompressedDecompressionKey) { + self.key + .key + .new_compressed_compression_decompression_keys(private_compression_key) + } } impl From<(ClientKey, usize)> for RadixClientKey { diff --git a/tfhe/src/integer/gpu/ciphertext/compressed_ciphertext_list.rs b/tfhe/src/integer/gpu/ciphertext/compressed_ciphertext_list.rs new file mode 100644 index 0000000000..11edc22c8e --- /dev/null +++ b/tfhe/src/integer/gpu/ciphertext/compressed_ciphertext_list.rs @@ -0,0 +1,208 @@ +use crate::core_crypto::gpu::glwe_ciphertext_list::CudaGlweCiphertextList; +use crate::core_crypto::gpu::CudaStreams; +use crate::core_crypto::prelude::CiphertextModulusLog; +use crate::integer::ciphertext::DataKind; +use crate::integer::gpu::ciphertext::info::CudaBlockInfo; +use crate::integer::gpu::ciphertext::{ + CudaRadixCiphertext, CudaSignedRadixCiphertext, CudaUnsignedRadixCiphertext, +}; +use crate::integer::gpu::list_compression::server_keys::{ + CudaCompressionKey, CudaDecompressionKey, +}; + +pub struct CudaCompressedCiphertextList { + pub(crate) packed_list: ( + CudaGlweCiphertextList, + Vec, + CiphertextModulusLog, + ), + info: Vec, +} +impl CudaCompressedCiphertextList { + pub fn len(&self) -> usize { + self.info.len() + } + + pub fn is_empty(&self) -> bool { + self.info.len() == 0 + } + + pub fn get( + &self, + index: usize, + decomp_key: &CudaDecompressionKey, + streams: &CudaStreams, + ) -> CudaRadixCiphertext +where { + let preceding_infos = self.info.get(..index).unwrap(); + let current_info = self.info.get(index).copied().unwrap(); + + let start_block_index: usize = preceding_infos + .iter() + .copied() + .map(DataKind::num_blocks) + .sum(); + + let end_block_index = start_block_index + current_info.num_blocks() - 1; + + decomp_key.unpack( + &self.packed_list, + start_block_index, + end_block_index, + streams, + ) + } +} + +pub trait CudaCompressible { + fn compress_into( + self, + messages: &mut Vec, + streams: &CudaStreams, + ) -> DataKind; +} + +impl CudaCompressible for CudaSignedRadixCiphertext { + fn compress_into( + self, + messages: &mut Vec, + streams: &CudaStreams, + ) -> DataKind { + let x = self.ciphertext.duplicate(streams); + + let copy = x.duplicate(streams); + messages.push(copy); + + let num_blocks = x.d_blocks.lwe_ciphertext_count().0; + DataKind::Signed(num_blocks) + } +} +impl CudaCompressible for CudaUnsignedRadixCiphertext { + fn compress_into( + self, + messages: &mut Vec, + streams: &CudaStreams, + ) -> DataKind { + let x = self.ciphertext.duplicate(streams); + + let copy = x.duplicate(streams); + messages.push(copy); + + let num_blocks = x.d_blocks.lwe_ciphertext_count().0; + + DataKind::Unsigned(num_blocks) + } +} + +pub struct CudaCompressedCiphertextListBuilder { + pub(crate) ciphertexts: Vec, + pub(crate) info: Vec, +} + +impl CudaCompressedCiphertextListBuilder { + #[allow(clippy::new_without_default)] + pub fn new() -> Self { + Self { + ciphertexts: vec![], + info: vec![], + } + } + + pub fn push(&mut self, data: T, streams: &CudaStreams) -> &mut Self { + let kind = data.compress_into(&mut self.ciphertexts, streams); + + if kind.num_blocks() != 0 { + self.info.push(kind); + } + + self + } + + pub fn build( + &self, + comp_key: &CudaCompressionKey, + streams: &CudaStreams, + ) -> CudaCompressedCiphertextList { + let packed_list = comp_key.compress_ciphertexts_into_list(&self.ciphertexts, streams); + + CudaCompressedCiphertextList { + packed_list, + info: self.info.clone(), + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::integer::gpu::gen_keys_radix_gpu; + use crate::integer::ClientKey; + use crate::shortint::parameters::list_compression + ::COMP_PARAM_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M64; + use crate::shortint::parameters::PARAM_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M64; + + #[test] + fn test_gpu_ciphertext_compression() { + let cks = ClientKey::new(PARAM_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M64); + + let private_compression_key = + cks.new_compression_private_key(COMP_PARAM_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M64); + + let gpu_index = 0; + let streams = CudaStreams::new_single_gpu(gpu_index); + + let num_blocks = 4; + let (radix_cks, _) = gen_keys_radix_gpu( + PARAM_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M64, + num_blocks, + &streams, + ); + + let (compressed_compression_key, compressed_decompression_key) = + radix_cks.new_compressed_compression_decompression_keys(&private_compression_key); + + let cuda_compression_key = compressed_compression_key.decompress_to_cuda(&streams); + let cuda_decompression_key = + compressed_decompression_key.decompress_to_cuda(radix_cks.parameters(), &streams); + + let ct1 = radix_cks.encrypt(3_u32); + let ct2 = radix_cks.encrypt(2_u32); + let ct3 = radix_cks.encrypt_signed(-2); + + // Copy to GPU + let d_ct1 = CudaUnsignedRadixCiphertext::from_radix_ciphertext(&ct1, &streams); + let d_ct2 = CudaUnsignedRadixCiphertext::from_radix_ciphertext(&ct2, &streams); + let d_ct3 = CudaSignedRadixCiphertext::from_signed_radix_ciphertext(&ct3, &streams); + + let cuda_compressed = CudaCompressedCiphertextListBuilder::new() + .push(d_ct1, &streams) + .push(d_ct2, &streams) + .push(d_ct3, &streams) + .build(&cuda_compression_key, &streams); + + let d_decompressed1 = CudaUnsignedRadixCiphertext { + ciphertext: cuda_compressed.get(0, &cuda_decompression_key, &streams), + }; + + let decompressed1 = d_decompressed1.to_radix_ciphertext(&streams); + let decrypted: u32 = radix_cks.decrypt(&decompressed1); + + assert_eq!(decrypted, 3_u32); + let d_decompressed2 = CudaUnsignedRadixCiphertext { + ciphertext: cuda_compressed.get(1, &cuda_decompression_key, &streams), + }; + + let decompressed2 = d_decompressed2.to_radix_ciphertext(&streams); + let decrypted: u32 = radix_cks.decrypt(&decompressed2); + + assert_eq!(decrypted, 2_u32); + let d_decompressed3 = CudaSignedRadixCiphertext { + ciphertext: cuda_compressed.get(2, &cuda_decompression_key, &streams), + }; + + let decompressed3 = d_decompressed3.to_signed_radix_ciphertext(&streams); + let decrypted: i32 = radix_cks.decrypt_signed(&decompressed3); + + assert_eq!(decrypted, -2); + } +} diff --git a/tfhe/src/integer/gpu/ciphertext/mod.rs b/tfhe/src/integer/gpu/ciphertext/mod.rs index 7f0408eddb..51e702d0c8 100644 --- a/tfhe/src/integer/gpu/ciphertext/mod.rs +++ b/tfhe/src/integer/gpu/ciphertext/mod.rs @@ -1,4 +1,5 @@ pub mod boolean_value; +pub mod compressed_ciphertext_list; pub mod info; use crate::core_crypto::gpu::lwe_ciphertext_list::CudaLweCiphertextList; diff --git a/tfhe/src/integer/gpu/list_compression/compressed_server_keys.rs b/tfhe/src/integer/gpu/list_compression/compressed_server_keys.rs new file mode 100644 index 0000000000..0a89c6efba --- /dev/null +++ b/tfhe/src/integer/gpu/list_compression/compressed_server_keys.rs @@ -0,0 +1,51 @@ +use crate::core_crypto::gpu::lwe_bootstrap_key::CudaLweBootstrapKey; +use crate::core_crypto::gpu::CudaStreams; +use crate::integer::gpu::list_compression::server_keys::{ + CudaCompressionKey, CudaDecompressionKey, +}; +use crate::integer::gpu::server_key::CudaBootstrappingKey; +use crate::shortint::list_compression::{ + CompressedCompressionKey, CompressedDecompressionKey, CompressionKey, +}; +use crate::shortint::PBSParameters; + +impl CompressedDecompressionKey { + pub fn decompress_to_cuda( + &self, + parameters: PBSParameters, + streams: &CudaStreams, + ) -> CudaDecompressionKey { + let h_bootstrap_key = self + .blind_rotate_key + .as_view() + .par_decompress_into_lwe_bootstrap_key(); + + let d_bootstrap_key = + CudaLweBootstrapKey::from_lwe_bootstrap_key(&h_bootstrap_key, streams); + + let blind_rotate_key = CudaBootstrappingKey::Classic(d_bootstrap_key); + + CudaDecompressionKey { + blind_rotate_key, + lwe_per_glwe: self.lwe_per_glwe, + parameters, + } + } +} + +impl CompressedCompressionKey { + pub fn decompress_to_cuda(&self, streams: &CudaStreams) -> CudaCompressionKey { + let packing_key_switching_key = self + .packing_key_switching_key + .as_view() + .decompress_into_lwe_packing_keyswitch_key(); + + let glwe_compression_key = CompressionKey { + packing_key_switching_key, + lwe_per_glwe: self.lwe_per_glwe, + storage_log_modulus: self.storage_log_modulus, + }; + + CudaCompressionKey::from_compression_key(&glwe_compression_key, streams) + } +} diff --git a/tfhe/src/integer/gpu/list_compression/mod.rs b/tfhe/src/integer/gpu/list_compression/mod.rs new file mode 100644 index 0000000000..8cb3b2e38c --- /dev/null +++ b/tfhe/src/integer/gpu/list_compression/mod.rs @@ -0,0 +1,2 @@ +pub mod compressed_server_keys; +pub mod server_keys; diff --git a/tfhe/src/integer/gpu/list_compression/server_keys.rs b/tfhe/src/integer/gpu/list_compression/server_keys.rs new file mode 100644 index 0000000000..bfa1d99575 --- /dev/null +++ b/tfhe/src/integer/gpu/list_compression/server_keys.rs @@ -0,0 +1,239 @@ +use crate::core_crypto::gpu::entities::lwe_packing_keyswitch_key::CudaLwePackingKeyswitchKey; +use crate::core_crypto::gpu::glwe_ciphertext_list::CudaGlweCiphertextList; +use crate::core_crypto::gpu::lwe_ciphertext_list::CudaLweCiphertextList; +use crate::core_crypto::gpu::vec::CudaVec; +use crate::core_crypto::gpu::CudaStreams; +use crate::core_crypto::prelude::{CiphertextModulusLog, GlweCiphertextCount, LweCiphertextCount}; +use crate::integer::gpu::ciphertext::info::{CudaBlockInfo, CudaRadixCiphertextInfo}; +use crate::integer::gpu::ciphertext::CudaRadixCiphertext; +use crate::integer::gpu::server_key::CudaBootstrappingKey; +use crate::integer::gpu::{ + compress_integer_radix_async, cuda_memcpy_async_gpu_to_gpu, decompress_integer_radix_async, +}; +use crate::shortint::list_compression::CompressionKey; +use crate::shortint::PBSParameters; +use itertools::Itertools; + +#[derive(Debug)] +pub struct CudaCompressionKey { + pub packing_key_switching_key: CudaLwePackingKeyswitchKey, + pub lwe_per_glwe: LweCiphertextCount, + pub storage_log_modulus: CiphertextModulusLog, +} + +pub struct CudaDecompressionKey { + pub blind_rotate_key: CudaBootstrappingKey, + pub lwe_per_glwe: LweCiphertextCount, + pub parameters: PBSParameters, +} + +impl CudaCompressionKey { + pub fn from_compression_key(compression_key: &CompressionKey, streams: &CudaStreams) -> Self { + Self { + packing_key_switching_key: CudaLwePackingKeyswitchKey::from_lwe_packing_keyswitch_key( + &compression_key.packing_key_switching_key, + streams, + ), + lwe_per_glwe: compression_key.lwe_per_glwe, + storage_log_modulus: compression_key.storage_log_modulus, + } + } + + unsafe fn flatten_async( + vec_ciphertexts: &Vec, + streams: &CudaStreams, + ) -> (CudaLweCiphertextList, Vec) { + let first_ct = &vec_ciphertexts.first().unwrap().d_blocks; + + // We assume all ciphertexts will have the same lwe dimension + let lwe_dimension = first_ct.lwe_dimension(); + let ciphertext_modulus = first_ct.ciphertext_modulus(); + + // Compute total number of lwe ciphertexts we will be handling + let total_num_blocks: usize = vec_ciphertexts + .iter() + .map(|x| x.d_blocks.lwe_ciphertext_count().0) + .sum(); + + let lwe_ciphertext_count = LweCiphertextCount(total_num_blocks); + + let gpu_index = streams.gpu_indexes[0]; + let d_vec = unsafe { + let mut d_vec = CudaVec::new_async( + lwe_dimension.to_lwe_size().0 * lwe_ciphertext_count.0, + streams, + gpu_index, + ); + let mut offset: usize = 0; + for ciphertext in vec_ciphertexts { + // Todo: We might use copy_self_range_gpu_to_gpu_async here + let dest_ptr = d_vec + .as_mut_c_ptr(gpu_index) + .add(offset * std::mem::size_of::()); + let size = ciphertext.d_blocks.0.d_vec.len * std::mem::size_of::(); + cuda_memcpy_async_gpu_to_gpu( + dest_ptr, + ciphertext.d_blocks.0.d_vec.as_c_ptr(gpu_index), + size as u64, + streams.ptr[gpu_index as usize], + streams.gpu_indexes[gpu_index as usize], + ); + + offset += ciphertext.d_blocks.0.d_vec.len; + } + + streams.synchronize(); + d_vec + }; + + let flattened_ciphertexts = + CudaLweCiphertextList::from_cuda_vec(d_vec, lwe_ciphertext_count, ciphertext_modulus); + + let info = vec_ciphertexts + .iter() + .flat_map(|x| x.info.blocks.clone()) + .collect_vec(); + + (flattened_ciphertexts, info) + } + + pub fn compress_ciphertexts_into_list( + &self, + ciphertexts: &Vec, + streams: &CudaStreams, + ) -> ( + CudaGlweCiphertextList, + Vec, + CiphertextModulusLog, + ) { + let lwe_pksk = &self.packing_key_switching_key; + + let ciphertext_modulus = lwe_pksk.ciphertext_modulus(); + let compress_polynomial_size = lwe_pksk.output_polynomial_size(); + let compress_glwe_size = lwe_pksk.output_glwe_size(); + + let first_ct = ciphertexts.first().unwrap(); + let first_ct_info = first_ct.info.blocks.first().unwrap(); + let message_modulus = first_ct_info.message_modulus; + let carry_modulus = first_ct_info.carry_modulus; + + let lwe_dimension = first_ct.d_blocks.lwe_dimension(); + + let num_lwes: usize = ciphertexts + .iter() + .map(|x| x.d_blocks.lwe_ciphertext_count().0) + .sum(); + + let mut output_glwe = CudaGlweCiphertextList::new( + compress_glwe_size.to_glwe_dimension(), + compress_polynomial_size, + GlweCiphertextCount(ciphertexts.len()), + ciphertext_modulus, + streams, + ); + + let info = unsafe { + let (input_lwes, info) = Self::flatten_async(ciphertexts, streams); + + compress_integer_radix_async( + streams, + &mut output_glwe.0.d_vec, + &input_lwes.0.d_vec, + &self.packing_key_switching_key.d_vec, + message_modulus, + carry_modulus, + compress_glwe_size.to_glwe_dimension(), + compress_polynomial_size, + lwe_dimension, + lwe_pksk.decomposition_base_log(), + lwe_pksk.decomposition_level_count(), + self.lwe_per_glwe.0 as u32, + self.storage_log_modulus.0 as u32, + num_lwes as u32, + ); + + info + }; + + (output_glwe, info, self.storage_log_modulus) + } +} + +impl CudaDecompressionKey { + pub fn unpack( + &self, + packed_list: &( + CudaGlweCiphertextList, + Vec, + CiphertextModulusLog, + ), + start_block_index: usize, + end_block_index: usize, + streams: &CudaStreams, + ) -> CudaRadixCiphertext { + let indexes_array = (start_block_index..=end_block_index) + .map(|x| x as u32) + .collect_vec(); + + let encryption_glwe_dimension = self.parameters.glwe_dimension(); + let encryption_polynomial_size = self.parameters.polynomial_size(); + let compression_glwe_dimension = packed_list.0.glwe_dimension(); + let compression_polynomial_size = packed_list.0.polynomial_size(); + let lwe_ciphertext_count = LweCiphertextCount(end_block_index - start_block_index + 1); + let message_modulus = self.parameters.message_modulus(); + let carry_modulus = self.parameters.carry_modulus(); + let ciphertext_modulus = self.parameters.ciphertext_modulus(); + let storage_log_modulus = packed_list.2; + + match &self.blind_rotate_key { + CudaBootstrappingKey::Classic(bsk) => { + let lwe_dimension = bsk.output_lwe_dimension(); + + let mut output_lwe = CudaLweCiphertextList::new( + lwe_dimension, + lwe_ciphertext_count, + ciphertext_modulus, + streams, + ); + + let gpu_index = streams.gpu_indexes[0]; + unsafe { + let d_indexes_array = + CudaVec::from_cpu_async(indexes_array.as_slice(), streams, gpu_index); + + decompress_integer_radix_async( + streams, + &mut output_lwe.0.d_vec, + &packed_list.0 .0.d_vec, + &bsk.d_vec, + message_modulus, + carry_modulus, + encryption_glwe_dimension, + encryption_polynomial_size, + compression_glwe_dimension, + compression_polynomial_size, + lwe_dimension, + bsk.decomp_base_log(), + bsk.decomp_level_count(), + self.lwe_per_glwe.0 as u32, + storage_log_modulus.0 as u32, + &d_indexes_array, + lwe_ciphertext_count.0 as u32, + ); + } + + streams.synchronize(); + + CudaRadixCiphertext { + d_blocks: output_lwe, + info: CudaRadixCiphertextInfo { + blocks: packed_list.1.clone(), + }, + } + } + CudaBootstrappingKey::MultiBit(_) => { + panic! {"Unsupported PBS"} + } + } + } +} diff --git a/tfhe/src/integer/gpu/mod.rs b/tfhe/src/integer/gpu/mod.rs index e4545da735..01c25d0910 100644 --- a/tfhe/src/integer/gpu/mod.rs +++ b/tfhe/src/integer/gpu/mod.rs @@ -1,6 +1,6 @@ pub mod ciphertext; +pub mod list_compression; pub mod server_key; - use crate::core_crypto::gpu::slice::{CudaSlice, CudaSliceMut}; use crate::core_crypto::gpu::vec::CudaVec; use crate::core_crypto::gpu::CudaStreams; @@ -12,6 +12,7 @@ use crate::integer::{ClientKey, RadixClientKey}; use crate::shortint::{CarryModulus, MessageModulus}; pub use server_key::CudaServerKey; use std::cmp::min; + use tfhe_cuda_backend::cuda_bind::*; #[repr(u32)] @@ -273,6 +274,153 @@ pub unsafe fn unchecked_scalar_mul_integer_radix_kb_async( + streams: &CudaStreams, + glwe_array_out: &mut CudaVec, + lwe_array_in: &CudaVec, + fp_keyswitch_key: &CudaVec, + message_modulus: MessageModulus, + carry_modulus: CarryModulus, + compression_glwe_dimension: GlweDimension, + compression_polynomial_size: PolynomialSize, + lwe_dimension: LweDimension, + ks_base_log: DecompositionBaseLog, + ks_level: DecompositionLevelCount, + lwe_per_glwe: u32, + storage_log_modulus: u32, + num_blocks: u32, +) { + assert_eq!( + streams.gpu_indexes[0], + lwe_array_in.gpu_index(0), + "GPU error: all data should reside on the same GPU." + ); + assert_eq!( + streams.gpu_indexes[0], + fp_keyswitch_key.gpu_index(0), + "GPU error: all data should reside on the same GPU." + ); + let mut mem_ptr: *mut i8 = std::ptr::null_mut(); + scratch_cuda_integer_compress_radix_ciphertext_64( + streams.ptr.as_ptr(), + streams.gpu_indexes.as_ptr(), + streams.len() as u32, + std::ptr::addr_of_mut!(mem_ptr), + compression_glwe_dimension.0 as u32, + compression_polynomial_size.0 as u32, + lwe_dimension.0 as u32, + ks_level.0 as u32, + ks_base_log.0 as u32, + num_blocks, + message_modulus.0 as u32, + carry_modulus.0 as u32, + PBSType::Classical as u32, + lwe_per_glwe, + storage_log_modulus, + true, + ); + + cuda_integer_compress_radix_ciphertext_64( + streams.ptr.as_ptr(), + streams.gpu_indexes.as_ptr(), + streams.len() as u32, + glwe_array_out.as_mut_c_ptr(0), + lwe_array_in.as_c_ptr(0), + fp_keyswitch_key.ptr.as_ptr(), + num_blocks, + mem_ptr, + ); + + cleanup_cuda_integer_compress_radix_ciphertext_64( + streams.ptr.as_ptr(), + streams.gpu_indexes.as_ptr(), + streams.len() as u32, + std::ptr::addr_of_mut!(mem_ptr), + ); +} + +#[allow(clippy::too_many_arguments)] +/// # Safety +/// +/// - [CudaStreams::synchronize] __must__ be called after this function as soon as synchronization +/// is required +pub unsafe fn decompress_integer_radix_async( + streams: &CudaStreams, + lwe_array_out: &mut CudaVec, + glwe_in: &CudaVec, + bootstrapping_key: &CudaVec, + message_modulus: MessageModulus, + carry_modulus: CarryModulus, + encryption_glwe_dimension: GlweDimension, + encryption_polynomial_size: PolynomialSize, + compression_glwe_dimension: GlweDimension, + compression_polynomial_size: PolynomialSize, + lwe_dimension: LweDimension, + pbs_base_log: DecompositionBaseLog, + pbs_level: DecompositionLevelCount, + lwe_per_glwe: u32, + storage_log_modulus: u32, + vec_indexes: &CudaVec, + num_blocks: u32, +) { + assert_eq!( + streams.gpu_indexes[0], + glwe_in.gpu_index(0), + "GPU error: all data should reside on the same GPU." + ); + assert_eq!( + streams.gpu_indexes[0], + bootstrapping_key.gpu_index(0), + "GPU error: all data should reside on the same GPU." + ); + let mut mem_ptr: *mut i8 = std::ptr::null_mut(); + scratch_cuda_integer_decompress_radix_ciphertext_64( + streams.ptr.as_ptr(), + streams.gpu_indexes.as_ptr(), + streams.len() as u32, + std::ptr::addr_of_mut!(mem_ptr), + encryption_glwe_dimension.0 as u32, + encryption_polynomial_size.0 as u32, + compression_glwe_dimension.0 as u32, + compression_polynomial_size.0 as u32, + lwe_dimension.0 as u32, + pbs_level.0 as u32, + pbs_base_log.0 as u32, + 0, + num_blocks, + message_modulus.0 as u32, + carry_modulus.0 as u32, + PBSType::Classical as u32, + lwe_per_glwe, + storage_log_modulus, + true, + ); + + cuda_integer_decompress_radix_ciphertext_64( + streams.ptr.as_ptr(), + streams.gpu_indexes.as_ptr(), + streams.len() as u32, + lwe_array_out.as_mut_c_ptr(0), + glwe_in.as_c_ptr(0), + vec_indexes.as_c_ptr(0), + vec_indexes.len as u32, + bootstrapping_key.ptr.as_ptr(), + mem_ptr, + ); + + cleanup_cuda_integer_decompress_radix_ciphertext_64( + streams.ptr.as_ptr(), + streams.gpu_indexes.as_ptr(), + streams.len() as u32, + std::ptr::addr_of_mut!(mem_ptr), + ); +} + #[allow(clippy::too_many_arguments)] /// # Safety ///