From 01a1ff1cb92b730bcb70a184f1b21115d9fb269a Mon Sep 17 00:00:00 2001 From: Agnes Leroy Date: Thu, 16 Jan 2025 18:17:10 +0100 Subject: [PATCH] chore(gpu): start using a struct to pass data across rust/c++ --- .../tfhe-cuda-backend/cuda/include/device.h | 3 + .../cuda/include/integer/integer.h | 21 +- .../cuda/include/integer/integer_utilities.h | 47 +-- .../cuda/include/integer/radix_ciphertext.h | 31 ++ .../cuda/include/linear_algebra.h | 16 +- backends/tfhe-cuda-backend/cuda/src/device.cu | 7 + .../tfhe-cuda-backend/cuda/src/integer/abs.cu | 9 +- .../cuda/src/integer/abs.cuh | 60 +++- .../cuda/src/integer/cmux.cu | 19 +- .../cuda/src/integer/cmux.cuh | 91 +++++- .../cuda/src/integer/comparison.cuh | 2 +- .../cuda/src/integer/div_rem.cuh | 36 +-- .../cuda/src/integer/integer.cuh | 153 ++++------ .../cuda/src/integer/multiplication.cuh | 12 +- .../cuda/src/integer/radix_ciphertext.cu | 87 ++++++ .../cuda/src/integer/scalar_comparison.cuh | 10 +- .../cuda/src/integer/shift_and_rotate.cuh | 12 +- .../cuda/src/linearalgebra/addition.cu | 29 +- .../cuda/src/linearalgebra/addition.cuh | 39 ++- backends/tfhe-cuda-backend/src/bindings.rs | 277 ++++++++++-------- tfhe/src/core_crypto/gpu/mod.rs | 67 ++++- tfhe/src/integer/gpu/ciphertext/info.rs | 17 -- tfhe/src/integer/gpu/mod.rs | 128 ++++++-- tfhe/src/integer/gpu/server_key/radix/abs.rs | 122 ++++---- tfhe/src/integer/gpu/server_key/radix/add.rs | 55 ++-- .../gpu/server_key/radix/bitwise_op.rs | 26 +- tfhe/src/integer/gpu/server_key/radix/cmux.rs | 168 +++++------ .../gpu/server_key/radix/comparison.rs | 64 ++-- .../integer/gpu/server_key/radix/div_mod.rs | 40 ++- .../src/integer/gpu/server_key/radix/ilog2.rs | 12 +- tfhe/src/integer/gpu/server_key/radix/mod.rs | 97 +++--- tfhe/src/integer/gpu/server_key/radix/mul.rs | 8 +- tfhe/src/integer/gpu/server_key/radix/neg.rs | 2 +- .../integer/gpu/server_key/radix/rotate.rs | 32 +- .../gpu/server_key/radix/scalar_add.rs | 28 +- .../gpu/server_key/radix/scalar_bitwise_op.rs | 6 +- .../gpu/server_key/radix/scalar_comparison.rs | 16 +- .../gpu/server_key/radix/scalar_div_mod.rs | 12 +- .../gpu/server_key/radix/scalar_mul.rs | 2 +- .../gpu/server_key/radix/scalar_rotate.rs | 12 +- .../gpu/server_key/radix/scalar_shift.rs | 20 +- .../gpu/server_key/radix/scalar_sub.rs | 4 +- .../src/integer/gpu/server_key/radix/shift.rs | 32 +- tfhe/src/integer/gpu/server_key/radix/sub.rs | 26 +- .../server_key/radix/vector_comparisons.rs | 24 +- .../gpu/server_key/radix/vector_find.rs | 90 ++---- .../radix_parallel/tests_long_run/mod.rs | 2 +- tfhe/src/shortint/ciphertext/common.rs | 4 +- 48 files changed, 1152 insertions(+), 925 deletions(-) create mode 100644 backends/tfhe-cuda-backend/cuda/include/integer/radix_ciphertext.h create mode 100644 backends/tfhe-cuda-backend/cuda/src/integer/radix_ciphertext.cu diff --git a/backends/tfhe-cuda-backend/cuda/include/device.h b/backends/tfhe-cuda-backend/cuda/include/device.h index 431b725360..6af6472693 100644 --- a/backends/tfhe-cuda-backend/cuda/include/device.h +++ b/backends/tfhe-cuda-backend/cuda/include/device.h @@ -42,6 +42,9 @@ void cuda_destroy_stream(cudaStream_t stream, uint32_t gpu_index); void cuda_synchronize_stream(cudaStream_t stream, uint32_t gpu_index); +void synchronize_streams(cudaStream_t const *streams, + uint32_t const *gpu_indexes, uint32_t gpu_count); + uint32_t cuda_is_available(); void *cuda_malloc(uint64_t size, uint32_t gpu_index); diff --git a/backends/tfhe-cuda-backend/cuda/include/integer/integer.h 
b/backends/tfhe-cuda-backend/cuda/include/integer/integer.h index 68d6cb02dc..f101c381f8 100644 --- a/backends/tfhe-cuda-backend/cuda/include/integer/integer.h +++ b/backends/tfhe-cuda-backend/cuda/include/integer/integer.h @@ -38,6 +38,15 @@ enum SIGNED_OPERATION { ADDITION = 1, SUBTRACTION = -1 }; enum outputFlag { FLAG_NONE = 0, FLAG_OVERFLOW = 1, FLAG_CARRY = 2 }; extern "C" { + +typedef struct { + void *ptr; + uint64_t *degrees; + uint64_t *noise_levels; + uint32_t num_radix_blocks; + uint32_t lwe_dimension; +} CudaRadixCiphertextFFI; + void scratch_cuda_apply_univariate_lut_kb_64( void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count, int8_t **mem_ptr, void const *input_lut, uint32_t lwe_dimension, @@ -258,9 +267,11 @@ void scratch_cuda_integer_radix_cmux_kb_64( void cuda_cmux_integer_radix_ciphertext_kb_64( void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count, - void *lwe_array_out, void const *lwe_condition, void const *lwe_array_true, - void const *lwe_array_false, int8_t *mem_ptr, void *const *bsks, - void *const *ksks, uint32_t lwe_ciphertext_count); + CudaRadixCiphertextFFI *lwe_array_out, + CudaRadixCiphertextFFI const *lwe_condition, + CudaRadixCiphertextFFI const *lwe_array_true, + CudaRadixCiphertextFFI const *lwe_array_false, int8_t *mem_ptr, + void *const *bsks, void *const *ksks); void cleanup_cuda_integer_radix_cmux(void *const *streams, uint32_t const *gpu_indexes, @@ -439,8 +450,8 @@ void scratch_cuda_integer_abs_inplace_radix_ciphertext_kb_64( void cuda_integer_abs_inplace_radix_ciphertext_kb_64( void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count, - void *ct, int8_t *mem_ptr, bool is_signed, void *const *bsks, - void *const *ksks, uint32_t num_blocks); + CudaRadixCiphertextFFI *ct, int8_t *mem_ptr, bool is_signed, + void *const *bsks, void *const *ksks); void cleanup_cuda_integer_abs_inplace(void *const *streams, uint32_t const *gpu_indexes,
diff --git a/backends/tfhe-cuda-backend/cuda/include/integer/integer_utilities.h b/backends/tfhe-cuda-backend/cuda/include/integer/integer_utilities.h index 186b8c7ce2..b20c8fc49e 100644 --- a/backends/tfhe-cuda-backend/cuda/include/integer/integer_utilities.h +++ b/backends/tfhe-cuda-backend/cuda/include/integer/integer_utilities.h @@ -4,6 +4,7 @@ #include "integer.h" #include "keyswitch.h" #include "pbs/programmable_bootstrap.cuh" +#include "radix_ciphertext.h" #include #include #include @@ -2963,9 +2964,9 @@ template struct int_cmux_buffer { int_radix_lut *predicate_lut; int_radix_lut *message_extract_lut; - Torus *buffer_in; - Torus *buffer_out; - Torus *condition_array; + CudaRadixCiphertextFFI *buffer_in = new CudaRadixCiphertextFFI; + CudaRadixCiphertextFFI *buffer_out = new CudaRadixCiphertextFFI; + CudaRadixCiphertextFFI *condition_array = new CudaRadixCiphertextFFI; + int_radix_params params; @@ -2978,15 +2979,18 @@ template struct int_cmux_buffer { this->params = params; if (allocate_gpu_memory) { - Torus big_size = - (params.big_lwe_dimension + 1) * num_radix_blocks * sizeof(Torus); - - buffer_in = - (Torus *)cuda_malloc_async(2 * big_size, streams[0], gpu_indexes[0]); - buffer_out = - (Torus *)cuda_malloc_async(2 * big_size, streams[0], gpu_indexes[0]); - condition_array = - (Torus *)cuda_malloc_async(2 * big_size, streams[0], gpu_indexes[0]); + create_trivial_radix_ciphertext_async( + streams[0], gpu_indexes[0], buffer_in, 2 * num_radix_blocks, + params.big_lwe_dimension); + create_trivial_radix_ciphertext_async( + streams[0], gpu_indexes[0], buffer_out, 2 * num_radix_blocks, + params.big_lwe_dimension); + create_trivial_radix_ciphertext_async( + streams[0], gpu_indexes[0], condition_array, 2 * num_radix_blocks, + params.big_lwe_dimension); auto lut_f = [predicate_lut_f](Torus block, Torus condition) -> Torus { return predicate_lut_f(condition) ? 0 : block; @@ -3047,9 +3051,12 @@ template struct int_cmux_buffer { message_extract_lut->release(streams, gpu_indexes, gpu_count); delete message_extract_lut; - cuda_drop_async(buffer_in, streams[0], gpu_indexes[0]); - cuda_drop_async(buffer_out, streams[0], gpu_indexes[0]); - cuda_drop_async(condition_array, streams[0], gpu_indexes[0]); + release_radix_ciphertext_data(streams[0], gpu_indexes[0], buffer_in); + delete buffer_in; + release_radix_ciphertext_data(streams[0], gpu_indexes[0], buffer_out); + delete buffer_out; + release_radix_ciphertext_data(streams[0], gpu_indexes[0], condition_array); + delete condition_array; } }; @@ -4351,7 +4358,7 @@ template struct int_abs_buffer { int_sc_prop_memory *scp_mem; int_bitop_buffer *bitxor_mem; - Torus *mask; + CudaRadixCiphertextFFI *mask = new CudaRadixCiphertextFFI; int_abs_buffer(cudaStream_t const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count, int_radix_params params, uint32_t num_radix_blocks, bool allocate_gpu_memory) { @@ -4372,11 +4379,9 @@ template struct int_abs_buffer { streams, gpu_indexes, gpu_count, BITOP_TYPE::BITXOR, params, num_radix_blocks, allocate_gpu_memory); - uint32_t lwe_size = params.big_lwe_dimension + 1; - uint32_t lwe_size_bytes = lwe_size * sizeof(Torus); - - mask = (Torus *)cuda_malloc_async(num_radix_blocks * lwe_size_bytes, - streams[0], gpu_indexes[0]); + create_trivial_radix_ciphertext_async(streams[0], gpu_indexes[0], + mask, num_radix_blocks, + params.big_lwe_dimension); } }
diff --git a/backends/tfhe-cuda-backend/cuda/include/integer/radix_ciphertext.h b/backends/tfhe-cuda-backend/cuda/include/integer/radix_ciphertext.h new file mode 100644 index 0000000000..f42a06b350 --- /dev/null +++ b/backends/tfhe-cuda-backend/cuda/include/integer/radix_ciphertext.h @@ -0,0 +1,31 @@ +#ifndef CUDA_RADIX_CIPHERTEXT_H +#define CUDA_RADIX_CIPHERTEXT_H + +#include "device.h" +#include "integer.h" + +template +void create_trivial_radix_ciphertext_async(cudaStream_t const stream, + uint32_t const gpu_index, + CudaRadixCiphertextFFI *output_radix, + uint32_t num_radix_blocks, + uint32_t lwe_dimension); + +// end_lwe_index is inclusive +template +void as_radix_ciphertext_slice(CudaRadixCiphertextFFI *output_radix, + const CudaRadixCiphertextFFI *input_radix, + uint32_t start_lwe_index, + uint32_t end_lwe_index); + +template +void copy_radix_ciphertext_to_larger_output_slice_async( + cudaStream_t const stream, uint32_t const gpu_index, + CudaRadixCiphertextFFI *output_radix, + const CudaRadixCiphertextFFI *input_radix, uint32_t output_start_lwe_index); + +void release_radix_ciphertext_data(cudaStream_t const stream, + uint32_t const gpu_index, + CudaRadixCiphertextFFI *data); + +#endif // CUDA_RADIX_CIPHERTEXT_H diff --git a/backends/tfhe-cuda-backend/cuda/include/linear_algebra.h b/backends/tfhe-cuda-backend/cuda/include/linear_algebra.h index 886a9befd4..7f9ff3164d 100644 --- a/backends/tfhe-cuda-backend/cuda/include/linear_algebra.h +++ b/backends/tfhe-cuda-backend/cuda/include/linear_algebra.h @@ -1,6 +1,7 @@ #ifndef CUDA_LINALG_H_ #define CUDA_LINALG_H_ +#include "integer/integer.h" #include extern "C" { @@ -14,16 +15,13 @@ void cuda_negate_lwe_ciphertext_vector_64( void const *lwe_array_in, const
uint32_t input_lwe_dimension, const uint32_t input_lwe_ciphertext_count); void cuda_add_lwe_ciphertext_vector_32( - void *stream, uint32_t gpu_index, void *lwe_array_out, - void const *lwe_array_in_1, void const *lwe_array_in_2, - const uint32_t input_lwe_dimension, - const uint32_t input_lwe_ciphertext_count); + void *stream, uint32_t gpu_index, CudaRadixCiphertextFFI *lwe_array_out, + CudaRadixCiphertextFFI const *lwe_array_in_1, + CudaRadixCiphertextFFI const *lwe_array_in_2); void cuda_add_lwe_ciphertext_vector_64( - void *stream, uint32_t gpu_index, void *lwe_array_out, - void const *lwe_array_in_1, void const *lwe_array_in_2, - const uint32_t input_lwe_dimension, - const uint32_t input_lwe_ciphertext_count); - + void *stream, uint32_t gpu_index, CudaRadixCiphertextFFI *lwe_array_out, + CudaRadixCiphertextFFI const *lwe_array_in_1, + CudaRadixCiphertextFFI const *lwe_array_in_2); void cuda_add_lwe_ciphertext_vector_plaintext_vector_32( void *stream, uint32_t gpu_index, void *lwe_array_out, void const *lwe_array_in, void const *plaintext_array_in, diff --git a/backends/tfhe-cuda-backend/cuda/src/device.cu b/backends/tfhe-cuda-backend/cuda/src/device.cu index 041e228b3d..78ce8fa96c 100644 --- a/backends/tfhe-cuda-backend/cuda/src/device.cu +++ b/backends/tfhe-cuda-backend/cuda/src/device.cu @@ -45,6 +45,13 @@ void cuda_synchronize_stream(cudaStream_t stream, uint32_t gpu_index) { check_cuda_error(cudaStreamSynchronize(stream)); } +void synchronize_streams(cudaStream_t const *streams, + uint32_t const *gpu_indexes, uint32_t gpu_count) { + for (uint i = 0; i < gpu_count; i++) { + cuda_synchronize_stream(streams[i], gpu_indexes[i]); + } +} + // Determine if a CUDA device is available at runtime uint32_t cuda_is_available() { return cudaSetDevice(0) == cudaSuccess; } diff --git a/backends/tfhe-cuda-backend/cuda/src/integer/abs.cu b/backends/tfhe-cuda-backend/cuda/src/integer/abs.cu index 1f462753ee..821015f251 100644 --- a/backends/tfhe-cuda-backend/cuda/src/integer/abs.cu +++ b/backends/tfhe-cuda-backend/cuda/src/integer/abs.cu @@ -22,15 +22,14 @@ void scratch_cuda_integer_abs_inplace_radix_ciphertext_kb_64( void cuda_integer_abs_inplace_radix_ciphertext_kb_64( void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count, - void *ct, int8_t *mem_ptr, bool is_signed, void *const *bsks, - void *const *ksks, uint32_t num_blocks) { + CudaRadixCiphertextFFI *ct, int8_t *mem_ptr, bool is_signed, + void *const *bsks, void *const *ksks) { auto mem = (int_abs_buffer *)mem_ptr; host_integer_abs_kb((cudaStream_t *)(streams), gpu_indexes, - gpu_count, static_cast(ct), bsks, - (uint64_t **)(ksks), mem, is_signed, - num_blocks); + gpu_count, ct, bsks, (uint64_t **)(ksks), mem, + is_signed); } void cleanup_cuda_integer_abs_inplace(void *const *streams, diff --git a/backends/tfhe-cuda-backend/cuda/src/integer/abs.cuh b/backends/tfhe-cuda-backend/cuda/src/integer/abs.cuh index d9053bbfbd..a60183b69d 100644 --- a/backends/tfhe-cuda-backend/cuda/src/integer/abs.cuh +++ b/backends/tfhe-cuda-backend/cuda/src/integer/abs.cuh @@ -8,6 +8,7 @@ #include "integer/integer.cuh" #include "integer/integer_utilities.h" #include "integer/negation.cuh" +#include "integer/radix_ciphertext.h" #include "integer/scalar_shifts.cuh" #include "linear_algebra.h" #include "pbs/programmable_bootstrap.h" @@ -32,16 +33,15 @@ __host__ void scratch_cuda_integer_abs_kb( } template -__host__ void -host_integer_abs_kb(cudaStream_t const *streams, uint32_t const *gpu_indexes, - uint32_t gpu_count, Torus *ct, void *const *bsks, - 
uint64_t *const *ksks, int_abs_buffer *mem_ptr, - bool is_signed, uint32_t num_blocks) { +__host__ void legacy_host_integer_abs_kb_async( + cudaStream_t const *streams, uint32_t const *gpu_indexes, + uint32_t gpu_count, Torus *ct, void *const *bsks, uint64_t *const *ksks, + int_abs_buffer *mem_ptr, bool is_signed, uint32_t num_blocks) { if (!is_signed) return; auto radix_params = mem_ptr->params; - auto mask = mem_ptr->mask; + auto mask = (Torus *)(mem_ptr->mask->ptr); auto big_lwe_dimension = radix_params.big_lwe_dimension; auto big_lwe_size = big_lwe_dimension + 1; @@ -52,11 +52,11 @@ host_integer_abs_kb(cudaStream_t const *streams, uint32_t const *gpu_indexes, cuda_memcpy_async_gpu_to_gpu(mask, ct, num_blocks * big_lwe_size_bytes, streams[0], gpu_indexes[0]); - host_integer_radix_arithmetic_scalar_shift_kb_inplace( + host_integer_radix_arithmetic_scalar_shift_kb_inplace( streams, gpu_indexes, gpu_count, mask, num_bits_in_ciphertext - 1, mem_ptr->arithmetic_scalar_shift_mem, bsks, ksks, num_blocks); - host_addition(streams[0], gpu_indexes[0], ct, mask, ct, - radix_params.big_lwe_dimension, num_blocks); + legacy_host_addition(streams[0], gpu_indexes[0], ct, mask, ct, + radix_params.big_lwe_dimension, num_blocks); uint32_t requested_flag = outputFlag::FLAG_NONE; uint32_t uses_carry = 0; @@ -64,8 +64,46 @@ host_integer_abs_kb(cudaStream_t const *streams, uint32_t const *gpu_indexes, streams, gpu_indexes, gpu_count, ct, nullptr, nullptr, mem_ptr->scp_mem, bsks, ksks, num_blocks, requested_flag, uses_carry); - host_integer_radix_bitop_kb(streams, gpu_indexes, gpu_count, ct, mask, ct, - mem_ptr->bitxor_mem, bsks, ksks, num_blocks); + host_integer_radix_bitop_kb(streams, gpu_indexes, gpu_count, ct, mask, + ct, mem_ptr->bitxor_mem, bsks, ksks, + num_blocks); +} + +template +__host__ void +host_integer_abs_kb(cudaStream_t const *streams, uint32_t const *gpu_indexes, + uint32_t gpu_count, CudaRadixCiphertextFFI *ct, + void *const *bsks, uint64_t *const *ksks, + int_abs_buffer *mem_ptr, bool is_signed) { + if (!is_signed) + return; + + auto mask = mem_ptr->mask; + + uint32_t num_bits_in_ciphertext = + (31 - __builtin_clz(mem_ptr->params.message_modulus)) * + ct->num_radix_blocks; + + copy_radix_ciphertext_to_larger_output_slice_async( + streams[0], gpu_indexes[0], mask, ct, 0); + + host_integer_radix_arithmetic_scalar_shift_kb_inplace( + streams, gpu_indexes, gpu_count, (Torus *)(mask->ptr), + num_bits_in_ciphertext - 1, mem_ptr->arithmetic_scalar_shift_mem, bsks, + ksks, ct->num_radix_blocks); + host_addition(streams[0], gpu_indexes[0], ct, mask, ct); + + uint32_t requested_flag = outputFlag::FLAG_NONE; + uint32_t uses_carry = 0; + host_propagate_single_carry( + streams, gpu_indexes, gpu_count, (Torus *)(ct->ptr), nullptr, nullptr, + mem_ptr->scp_mem, bsks, ksks, ct->num_radix_blocks, requested_flag, + uses_carry); + + host_integer_radix_bitop_kb(streams, gpu_indexes, gpu_count, + (Torus *)(ct->ptr), (Torus *)(mask->ptr), + (Torus *)(ct->ptr), mem_ptr->bitxor_mem, + bsks, ksks, ct->num_radix_blocks); } #endif // TFHE_RS_ABS_CUH diff --git a/backends/tfhe-cuda-backend/cuda/src/integer/cmux.cu b/backends/tfhe-cuda-backend/cuda/src/integer/cmux.cu index bf2e7eeafe..e51eed1170 100644 --- a/backends/tfhe-cuda-backend/cuda/src/integer/cmux.cu +++ b/backends/tfhe-cuda-backend/cuda/src/integer/cmux.cu @@ -25,19 +25,16 @@ void scratch_cuda_integer_radix_cmux_kb_64( void cuda_cmux_integer_radix_ciphertext_kb_64( void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count, - void *lwe_array_out, 
void const *lwe_condition, void const *lwe_array_true, - void const *lwe_array_false, int8_t *mem_ptr, void *const *bsks, - void *const *ksks, uint32_t lwe_ciphertext_count) { + CudaRadixCiphertextFFI *lwe_array_out, + CudaRadixCiphertextFFI const *lwe_condition, + CudaRadixCiphertextFFI const *lwe_array_true, + CudaRadixCiphertextFFI const *lwe_array_false, int8_t *mem_ptr, + void *const *bsks, void *const *ksks) { host_integer_radix_cmux_kb( - (cudaStream_t *)(streams), gpu_indexes, gpu_count, - static_cast(lwe_array_out), - static_cast(lwe_condition), - static_cast(lwe_array_true), - static_cast(lwe_array_false), - (int_cmux_buffer *)mem_ptr, bsks, (uint64_t **)(ksks), - - lwe_ciphertext_count); + (cudaStream_t *)(streams), gpu_indexes, gpu_count, lwe_array_out, + lwe_condition, lwe_array_true, lwe_array_false, + (int_cmux_buffer *)mem_ptr, bsks, (uint64_t **)(ksks)); } void cleanup_cuda_integer_radix_cmux(void *const *streams, diff --git a/backends/tfhe-cuda-backend/cuda/src/integer/cmux.cuh b/backends/tfhe-cuda-backend/cuda/src/integer/cmux.cuh index b74bdfbe91..cc827d2499 100644 --- a/backends/tfhe-cuda-backend/cuda/src/integer/cmux.cuh +++ b/backends/tfhe-cuda-backend/cuda/src/integer/cmux.cuh @@ -1,6 +1,7 @@ #ifndef CUDA_INTEGER_CMUX_CUH #define CUDA_INTEGER_CMUX_CUH +#include "../include/integer/radix_ciphertext.h" #include "integer.cuh" template @@ -29,7 +30,7 @@ __host__ void zero_out_if(cudaStream_t const *streams, } template -__host__ void host_integer_radix_cmux_kb( +__host__ void legacy_host_integer_radix_cmux_kb( cudaStream_t const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count, Torus *lwe_array_out, Torus const *lwe_condition, Torus const *lwe_array_true, Torus const *lwe_array_false, @@ -39,36 +40,96 @@ __host__ void host_integer_radix_cmux_kb( auto params = mem_ptr->params; Torus lwe_size = params.big_lwe_dimension + 1; Torus radix_lwe_size = lwe_size * num_radix_blocks; - cuda_memcpy_async_gpu_to_gpu(mem_ptr->buffer_in, lwe_array_true, + cuda_memcpy_async_gpu_to_gpu(mem_ptr->buffer_in->ptr, lwe_array_true, radix_lwe_size * sizeof(Torus), streams[0], gpu_indexes[0]); - cuda_memcpy_async_gpu_to_gpu(mem_ptr->buffer_in + radix_lwe_size, - lwe_array_false, radix_lwe_size * sizeof(Torus), - streams[0], gpu_indexes[0]); + cuda_memcpy_async_gpu_to_gpu( + (Torus *)(mem_ptr->buffer_in->ptr) + radix_lwe_size, lwe_array_false, + radix_lwe_size * sizeof(Torus), streams[0], gpu_indexes[0]); for (uint i = 0; i < 2 * num_radix_blocks; i++) { - cuda_memcpy_async_gpu_to_gpu(mem_ptr->condition_array + i * lwe_size, - lwe_condition, lwe_size * sizeof(Torus), - streams[0], gpu_indexes[0]); + cuda_memcpy_async_gpu_to_gpu( + (Torus *)(mem_ptr->condition_array->ptr) + i * lwe_size, lwe_condition, + lwe_size * sizeof(Torus), streams[0], gpu_indexes[0]); } integer_radix_apply_bivariate_lookup_table_kb( - streams, gpu_indexes, gpu_count, mem_ptr->buffer_out, mem_ptr->buffer_in, - mem_ptr->condition_array, bsks, ksks, 2 * num_radix_blocks, - mem_ptr->predicate_lut, params.message_modulus); + streams, gpu_indexes, gpu_count, (Torus *)(mem_ptr->buffer_out->ptr), + (Torus *)(mem_ptr->buffer_in->ptr), + (Torus *)(mem_ptr->condition_array->ptr), bsks, ksks, + 2 * num_radix_blocks, mem_ptr->predicate_lut, params.message_modulus); // If the condition was true, true_ct will have kept its value and false_ct // will be 0 If the condition was false, true_ct will be 0 and false_ct will // have kept its value - auto mem_true = mem_ptr->buffer_out; - auto mem_false = 
&mem_ptr->buffer_out[radix_lwe_size]; + auto mem_true = (Torus *)(mem_ptr->buffer_out->ptr); + auto ptr = (Torus *)mem_ptr->buffer_out->ptr; + auto mem_false = &ptr[radix_lwe_size]; auto added_cts = mem_true; - host_addition(streams[0], gpu_indexes[0], added_cts, mem_true, - mem_false, params.big_lwe_dimension, num_radix_blocks); + legacy_host_addition(streams[0], gpu_indexes[0], added_cts, mem_true, + mem_false, params.big_lwe_dimension, + num_radix_blocks); integer_radix_apply_univariate_lookup_table_kb( streams, gpu_indexes, gpu_count, lwe_array_out, added_cts, bsks, ksks, num_radix_blocks, mem_ptr->message_extract_lut); } +template +__host__ void host_integer_radix_cmux_kb( + cudaStream_t const *streams, uint32_t const *gpu_indexes, + uint32_t gpu_count, CudaRadixCiphertextFFI *lwe_array_out, + CudaRadixCiphertextFFI const *lwe_condition, + CudaRadixCiphertextFFI const *lwe_array_true, + CudaRadixCiphertextFFI const *lwe_array_false, + int_cmux_buffer *mem_ptr, void *const *bsks, Torus *const *ksks) { + + if (lwe_array_out->num_radix_blocks != lwe_array_true->num_radix_blocks) + PANIC("Cuda error: input and output num radix blocks must be the same") + if (lwe_array_out->num_radix_blocks != lwe_array_false->num_radix_blocks) + PANIC("Cuda error: input and output num radix blocks must be the same") + + auto num_radix_blocks = lwe_array_out->num_radix_blocks; + auto params = mem_ptr->params; + Torus lwe_size = params.big_lwe_dimension + 1; + Torus radix_lwe_size = lwe_size * num_radix_blocks; + copy_radix_ciphertext_to_larger_output_slice_async( + streams[0], gpu_indexes[0], mem_ptr->buffer_in, lwe_array_true, 0); + copy_radix_ciphertext_to_larger_output_slice_async( + streams[0], gpu_indexes[0], mem_ptr->buffer_in, lwe_array_false, + num_radix_blocks); + for (uint i = 0; i < 2 * num_radix_blocks; i++) { + cuda_memcpy_async_gpu_to_gpu( + (Torus *)(mem_ptr->condition_array->ptr) + i * lwe_size, + (Torus *)(lwe_condition->ptr), lwe_size * sizeof(Torus), streams[0], + gpu_indexes[0]); + } + integer_radix_apply_bivariate_lookup_table_kb( + streams, gpu_indexes, gpu_count, (Torus *)(mem_ptr->buffer_out->ptr), + (Torus *)(mem_ptr->buffer_in->ptr), + (Torus *)(mem_ptr->condition_array->ptr), bsks, ksks, + 2 * num_radix_blocks, mem_ptr->predicate_lut, params.message_modulus); + + // If the condition was true, true_ct will have kept its value and false_ct + // will be 0 If the condition was false, true_ct will be 0 and false_ct will + // have kept its value + CudaRadixCiphertextFFI *mem_true = new CudaRadixCiphertextFFI; + CudaRadixCiphertextFFI *mem_false = new CudaRadixCiphertextFFI; + as_radix_ciphertext_slice(mem_true, mem_ptr->buffer_out, 0, + num_radix_blocks - 1); + as_radix_ciphertext_slice(mem_false, mem_ptr->buffer_out, + num_radix_blocks, 2 * num_radix_blocks - 1); + + auto added_cts = mem_true; + host_addition(streams[0], gpu_indexes[0], added_cts, mem_true, + mem_false); + + integer_radix_apply_univariate_lookup_table_kb( + streams, gpu_indexes, gpu_count, (Torus *)(lwe_array_out->ptr), + (Torus *)(added_cts->ptr), bsks, ksks, num_radix_blocks, + mem_ptr->message_extract_lut); + delete mem_true; + delete mem_false; +} + template __host__ void scratch_cuda_integer_radix_cmux_kb( cudaStream_t const *streams, uint32_t const *gpu_indexes, diff --git a/backends/tfhe-cuda-backend/cuda/src/integer/comparison.cuh b/backends/tfhe-cuda-backend/cuda/src/integer/comparison.cuh index 3f82be99d6..ed311f6389 100644 --- a/backends/tfhe-cuda-backend/cuda/src/integer/comparison.cuh +++ 
b/backends/tfhe-cuda-backend/cuda/src/integer/comparison.cuh @@ -620,7 +620,7 @@ __host__ void host_integer_radix_maxmin_kb( ksks, total_num_radix_blocks); // Selector - host_integer_radix_cmux_kb( + legacy_host_integer_radix_cmux_kb( streams, gpu_indexes, gpu_count, lwe_array_out, mem_ptr->tmp_lwe_array_out, lwe_array_left, lwe_array_right, mem_ptr->cmux_buffer, bsks, ksks, total_num_radix_blocks); diff --git a/backends/tfhe-cuda-backend/cuda/src/integer/div_rem.cuh b/backends/tfhe-cuda-backend/cuda/src/integer/div_rem.cuh index ab574705f6..b080c5edf7 100644 --- a/backends/tfhe-cuda-backend/cuda/src/integer/div_rem.cuh +++ b/backends/tfhe-cuda-backend/cuda/src/integer/div_rem.cuh @@ -402,7 +402,7 @@ __host__ void host_unsigned_integer_div_rem_kb( // but in that position, interesting_remainder2 always has a 0 auto &merged_interesting_remainder = interesting_remainder1; - host_addition( + legacy_host_addition( streams[0], gpu_indexes[0], merged_interesting_remainder.data, merged_interesting_remainder.data, interesting_remainder2.data, radix_params.big_lwe_dimension, merged_interesting_remainder.len); @@ -507,10 +507,10 @@ __host__ void host_unsigned_integer_div_rem_kb( cuda_synchronize_stream(mem_ptr->sub_streams_3[j], gpu_indexes[j]); } - host_addition(streams[0], gpu_indexes[0], overflow_sum.data, - subtraction_overflowed.data, - at_least_one_upper_block_is_non_zero.data, - radix_params.big_lwe_dimension, 1); + legacy_host_addition(streams[0], gpu_indexes[0], overflow_sum.data, + subtraction_overflowed.data, + at_least_one_upper_block_is_non_zero.data, + radix_params.big_lwe_dimension, 1); int factor = (i) ? 3 : 2; int factor_lut_id = factor - 2; @@ -552,7 +552,7 @@ __host__ void host_unsigned_integer_div_rem_kb( mem_ptr->merge_overflow_flags_luts[pos_in_block] ->params.message_modulus); - host_addition( + legacy_host_addition( streams[0], gpu_indexes[0], "ient[block_of_bit * big_lwe_size], "ient[block_of_bit * big_lwe_size], did_not_overflow.data, radix_params.big_lwe_dimension, 1); @@ -588,9 +588,9 @@ __host__ void host_unsigned_integer_div_rem_kb( // Clean the quotient and remainder // as even though they have no carries, they are not at nominal noise level - host_addition(streams[0], gpu_indexes[0], remainder, remainder1.data, - remainder2.data, radix_params.big_lwe_dimension, - remainder1.len); + legacy_host_addition(streams[0], gpu_indexes[0], remainder, + remainder1.data, remainder2.data, + radix_params.big_lwe_dimension, remainder1.len); for (uint j = 0; j < gpu_count; j++) { cuda_synchronize_stream(streams[j], gpu_indexes[j]); @@ -636,12 +636,14 @@ __host__ void host_integer_div_rem_kb(cudaStream_t const *streams, cuda_synchronize_stream(streams[j], gpu_indexes[j]); } - host_integer_abs_kb(int_mem_ptr->sub_streams_1, gpu_indexes, - gpu_count, positive_numerator.data, bsks, ksks, - int_mem_ptr->abs_mem_1, true, num_blocks); - host_integer_abs_kb(int_mem_ptr->sub_streams_2, gpu_indexes, - gpu_count, positive_divisor.data, bsks, ksks, - int_mem_ptr->abs_mem_2, true, num_blocks); + legacy_host_integer_abs_kb_async( + int_mem_ptr->sub_streams_1, gpu_indexes, gpu_count, + positive_numerator.data, bsks, ksks, int_mem_ptr->abs_mem_1, true, + num_blocks); + legacy_host_integer_abs_kb_async( + int_mem_ptr->sub_streams_2, gpu_indexes, gpu_count, + positive_divisor.data, bsks, ksks, int_mem_ptr->abs_mem_2, true, + num_blocks); for (uint j = 0; j < int_mem_ptr->active_gpu_count; j++) { cuda_synchronize_stream(int_mem_ptr->sub_streams_1[j], gpu_indexes[j]); 
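// Illustrative aside (not part of this change, added for clarity): the
// per-GPU synchronization loops in this function follow the same pattern that
// the new synchronize_streams helper declared in device.h factors out. Under
// the assumption that int_mem_ptr->active_gpu_count is the intended stream
// count, each loop could be collapsed to a single call, e.g.:
//   synchronize_streams(int_mem_ptr->sub_streams_1, gpu_indexes,
//                       int_mem_ptr->active_gpu_count);
//   synchronize_streams(int_mem_ptr->sub_streams_2, gpu_indexes,
//                       int_mem_ptr->active_gpu_count);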
cuda_synchronize_stream(int_mem_ptr->sub_streams_2[j], gpu_indexes[j]); @@ -689,12 +691,12 @@ __host__ void host_integer_div_rem_kb(cudaStream_t const *streams, int_mem_ptr->scp_mem_2, bsks, ksks, num_blocks, requested_flag, uses_carry); - host_integer_radix_cmux_kb( + legacy_host_integer_radix_cmux_kb( int_mem_ptr->sub_streams_1, gpu_indexes, gpu_count, quotient, int_mem_ptr->sign_bits_are_different, int_mem_ptr->negated_quotient, quotient, int_mem_ptr->cmux_quotient_mem, bsks, ksks, num_blocks); - host_integer_radix_cmux_kb( + legacy_host_integer_radix_cmux_kb( int_mem_ptr->sub_streams_2, gpu_indexes, gpu_count, remainder, &numerator[big_lwe_size * (num_blocks - 1)], int_mem_ptr->negated_remainder, remainder, diff --git a/backends/tfhe-cuda-backend/cuda/src/integer/integer.cuh b/backends/tfhe-cuda-backend/cuda/src/integer/integer.cuh index f6e53b5050..e85877211b 100644 --- a/backends/tfhe-cuda-backend/cuda/src/integer/integer.cuh +++ b/backends/tfhe-cuda-backend/cuda/src/integer/integer.cuh @@ -1163,64 +1163,6 @@ void host_compute_shifted_blocks_and_borrow_states( big_lwe_size_bytes * num_radix_blocks, streams[0], gpu_indexes[0]); } -template -void host_legacy_propagate_single_carry(cudaStream_t const *streams, - uint32_t const *gpu_indexes, - uint32_t gpu_count, Torus *lwe_array, - Torus *carry_out, Torus *input_carries, - int_legacy_sc_prop_memory *mem, - void *const *bsks, Torus *const *ksks, - uint32_t num_blocks) { - auto params = mem->params; - if (params.message_modulus == 2) - PANIC("Cuda error: single carry propagation is not supported for 1 bit " - "messages") - auto glwe_dimension = params.glwe_dimension; - auto polynomial_size = params.polynomial_size; - auto big_lwe_size = glwe_dimension * polynomial_size + 1; - auto big_lwe_size_bytes = big_lwe_size * sizeof(Torus); - - auto generates_or_propagates = mem->generates_or_propagates; - auto step_output = mem->step_output; - - auto luts_array = mem->luts_array; - auto luts_carry_propagation_sum = mem->luts_carry_propagation_sum; - auto message_acc = mem->message_acc; - - integer_radix_apply_univariate_lookup_table_kb( - streams, gpu_indexes, gpu_count, generates_or_propagates, lwe_array, bsks, - ksks, num_blocks, luts_array); - - // compute prefix sum with hillis&steele - host_compute_prefix_sum_hillis_steele( - streams, gpu_indexes, gpu_count, step_output, generates_or_propagates, - params, luts_carry_propagation_sum, bsks, ksks, num_blocks); - - host_radix_blocks_rotate_right(streams, gpu_indexes, gpu_count, - step_output, generates_or_propagates, 1, - num_blocks, big_lwe_size); - if (carry_out != nullptr) { - cuda_memcpy_async_gpu_to_gpu(carry_out, step_output, big_lwe_size_bytes, - streams[0], gpu_indexes[0]); - } - cuda_memset_async(step_output, 0, big_lwe_size_bytes, streams[0], - gpu_indexes[0]); - - if (input_carries != nullptr) { - cuda_memcpy_async_gpu_to_gpu((void *)input_carries, step_output, - big_lwe_size_bytes * num_blocks, streams[0], - gpu_indexes[0]); - } - - host_addition(streams[0], gpu_indexes[0], lwe_array, lwe_array, - step_output, glwe_dimension * polynomial_size, - num_blocks); - - integer_radix_apply_univariate_lookup_table_kb( - streams, gpu_indexes, gpu_count, lwe_array, lwe_array, bsks, ksks, - num_blocks, message_acc); -} - template void host_generate_last_block_inner_propagation( cudaStream_t const *streams, uint32_t const *gpu_indexes, @@ -1336,10 +1278,10 @@ void host_full_propagate_inplace(cudaStream_t const *streams, if (i < num_blocks - 1) { auto next_input_block = &input_blocks[(i + 1) * 
big_lwe_size]; - host_addition(streams[0], gpu_indexes[0], next_input_block, - (Torus const *)next_input_block, - &mem_ptr->tmp_big_lwe_vector[big_lwe_size], - params.big_lwe_dimension, 1); + legacy_host_addition(streams[0], gpu_indexes[0], next_input_block, + (Torus const *)next_input_block, + &mem_ptr->tmp_big_lwe_vector[big_lwe_size], + params.big_lwe_dimension, 1); } } } @@ -1703,8 +1645,8 @@ void host_propagate_single_carry(cudaStream_t const *streams, PANIC("Cuda error: single carry propagation is not supported for overflow, " "try using add_and_propagate_single_carry"); if (uses_carry == 1) { - host_addition(streams[0], gpu_indexes[0], lwe_array, lwe_array, - input_carries, big_lwe_dimension, 1); + legacy_host_addition(streams[0], gpu_indexes[0], lwe_array, + lwe_array, input_carries, big_lwe_dimension, 1); } // Step 1 host_compute_shifted_blocks_and_states( @@ -1728,17 +1670,18 @@ void host_propagate_single_carry(cudaStream_t const *streams, auto prepared_blocks = mem->prop_simu_group_carries_mem->prepared_blocks; auto shifted_blocks = mem->shifted_blocks_state_mem->shifted_blocks; - host_addition(streams[0], gpu_indexes[0], prepared_blocks, - shifted_blocks, - mem->prop_simu_group_carries_mem->simulators, - big_lwe_dimension, num_radix_blocks); + legacy_host_addition(streams[0], gpu_indexes[0], prepared_blocks, + shifted_blocks, + mem->prop_simu_group_carries_mem->simulators, + big_lwe_dimension, num_radix_blocks); if (requested_flag == outputFlag::FLAG_OVERFLOW || requested_flag == outputFlag::FLAG_CARRY) { - host_addition(streams[0], gpu_indexes[0], output_flag, output_flag, - mem->prop_simu_group_carries_mem->simulators + - (num_radix_blocks - 1) * big_lwe_size, - big_lwe_dimension, 1); + legacy_host_addition(streams[0], gpu_indexes[0], output_flag, + output_flag, + mem->prop_simu_group_carries_mem->simulators + + (num_radix_blocks - 1) * big_lwe_size, + big_lwe_dimension, 1); } host_radix_sum_in_groups( @@ -1746,10 +1689,11 @@ void host_propagate_single_carry(cudaStream_t const *streams, mem->prop_simu_group_carries_mem->resolved_carries, num_radix_blocks, big_lwe_size, group_size); if (requested_flag == outputFlag::FLAG_CARRY) { - host_addition(streams[0], gpu_indexes[0], output_flag, output_flag, - mem->prop_simu_group_carries_mem->resolved_carries + - (mem->num_groups - 1) * big_lwe_size, - big_lwe_dimension, 1); + legacy_host_addition( + streams[0], gpu_indexes[0], output_flag, output_flag, + mem->prop_simu_group_carries_mem->resolved_carries + + (mem->num_groups - 1) * big_lwe_size, + big_lwe_dimension, 1); cuda_memcpy_async_gpu_to_gpu( prepared_blocks + num_radix_blocks * big_lwe_size, output_flag, @@ -1800,12 +1744,12 @@ void host_add_and_propagate_single_carry( big_lwe_size_bytes, streams[0], gpu_indexes[0]); } - host_addition(streams[0], gpu_indexes[0], lhs_array, lhs_array, - rhs_array, big_lwe_dimension, num_radix_blocks); + legacy_host_addition(streams[0], gpu_indexes[0], lhs_array, lhs_array, + rhs_array, big_lwe_dimension, num_radix_blocks); if (uses_carry == 1) { - host_addition(streams[0], gpu_indexes[0], lhs_array, lhs_array, - input_carries, big_lwe_dimension, 1); + legacy_host_addition(streams[0], gpu_indexes[0], lhs_array, + lhs_array, input_carries, big_lwe_dimension, 1); } // Step 1 host_compute_shifted_blocks_and_states( @@ -1835,17 +1779,18 @@ void host_add_and_propagate_single_carry( auto prepared_blocks = mem->prop_simu_group_carries_mem->prepared_blocks; auto shifted_blocks = mem->shifted_blocks_state_mem->shifted_blocks; - host_addition(streams[0], 
gpu_indexes[0], prepared_blocks, - shifted_blocks, - mem->prop_simu_group_carries_mem->simulators, - big_lwe_dimension, num_radix_blocks); + legacy_host_addition(streams[0], gpu_indexes[0], prepared_blocks, + shifted_blocks, + mem->prop_simu_group_carries_mem->simulators, + big_lwe_dimension, num_radix_blocks); if (requested_flag == outputFlag::FLAG_OVERFLOW || requested_flag == outputFlag::FLAG_CARRY) { - host_addition(streams[0], gpu_indexes[0], output_flag, output_flag, - mem->prop_simu_group_carries_mem->simulators + - (num_radix_blocks - 1) * big_lwe_size, - big_lwe_dimension, 1); + legacy_host_addition(streams[0], gpu_indexes[0], output_flag, + output_flag, + mem->prop_simu_group_carries_mem->simulators + + (num_radix_blocks - 1) * big_lwe_size, + big_lwe_dimension, 1); } // Step 3 @@ -1859,15 +1804,17 @@ void host_add_and_propagate_single_carry( requested_flag == outputFlag::FLAG_CARRY) { if (num_radix_blocks == 1 && requested_flag == outputFlag::FLAG_OVERFLOW && uses_carry == 1) { - host_addition(streams[0], gpu_indexes[0], output_flag, output_flag, - input_carries, big_lwe_dimension, 1); + legacy_host_addition(streams[0], gpu_indexes[0], output_flag, + output_flag, input_carries, big_lwe_dimension, + 1); } else { - host_addition(streams[0], gpu_indexes[0], output_flag, output_flag, - mem->prop_simu_group_carries_mem->resolved_carries + - (mem->num_groups - 1) * big_lwe_size, - big_lwe_dimension, 1); + legacy_host_addition( + streams[0], gpu_indexes[0], output_flag, output_flag, + mem->prop_simu_group_carries_mem->resolved_carries + + (mem->num_groups - 1) * big_lwe_size, + big_lwe_dimension, 1); } cuda_memcpy_async_gpu_to_gpu( prepared_blocks + num_radix_blocks * big_lwe_size, output_flag, @@ -1960,11 +1907,11 @@ void host_single_borrow_propagate( num_radix_blocks, message_modulus, carry_modulus); if (compute_overflow == outputFlag::FLAG_OVERFLOW) { - host_addition(streams[0], gpu_indexes[0], mem->overflow_block, - mem->overflow_block, - mem->prop_simu_group_carries_mem->simulators + - (num_radix_blocks - 1) * big_lwe_size, - big_lwe_dimension, 1); + legacy_host_addition(streams[0], gpu_indexes[0], mem->overflow_block, + mem->overflow_block, + mem->prop_simu_group_carries_mem->simulators + + (num_radix_blocks - 1) * big_lwe_size, + big_lwe_dimension, 1); } auto resolved_borrows = mem->prop_simu_group_carries_mem->resolved_carries; @@ -1972,10 +1919,10 @@ void host_single_borrow_propagate( // This needs to be done before because in next step we modify the resolved // borrows if (compute_overflow == outputFlag::FLAG_OVERFLOW) { - host_addition(streams[0], gpu_indexes[0], mem->overflow_block, - mem->overflow_block, - resolved_borrows + (num_groups - 1) * big_lwe_size, - big_lwe_dimension, 1); + legacy_host_addition( + streams[0], gpu_indexes[0], mem->overflow_block, mem->overflow_block, + resolved_borrows + (num_groups - 1) * big_lwe_size, big_lwe_dimension, + 1); } cuda_event_record(mem->incoming_events[0], streams[0], gpu_indexes[0]); diff --git a/backends/tfhe-cuda-backend/cuda/src/integer/multiplication.cuh b/backends/tfhe-cuda-backend/cuda/src/integer/multiplication.cuh index 590b41e101..a828f70b62 100644 --- a/backends/tfhe-cuda-backend/cuda/src/integer/multiplication.cuh +++ b/backends/tfhe-cuda-backend/cuda/src/integer/multiplication.cuh @@ -228,9 +228,9 @@ __host__ void host_integer_partial_sum_ciphertexts_vec_kb( streams[0], gpu_indexes[0]); } if (num_radix_in_vec == 2) { - host_addition(streams[0], gpu_indexes[0], radix_lwe_out, old_blocks, - &old_blocks[num_blocks * 
big_lwe_size], - big_lwe_dimension, num_blocks); + legacy_host_addition( + streams[0], gpu_indexes[0], radix_lwe_out, old_blocks, + &old_blocks[num_blocks * big_lwe_size], big_lwe_dimension, num_blocks); return; } @@ -445,9 +445,9 @@ __host__ void host_integer_partial_sum_ciphertexts_vec_kb( luts_message_carry->release(streams, gpu_indexes, gpu_count); delete (luts_message_carry); - host_addition(streams[0], gpu_indexes[0], radix_lwe_out, old_blocks, - &old_blocks[num_blocks * big_lwe_size], - big_lwe_dimension, num_blocks); + legacy_host_addition( + streams[0], gpu_indexes[0], radix_lwe_out, old_blocks, + &old_blocks[num_blocks * big_lwe_size], big_lwe_dimension, num_blocks); } template
diff --git a/backends/tfhe-cuda-backend/cuda/src/integer/radix_ciphertext.cu b/backends/tfhe-cuda-backend/cuda/src/integer/radix_ciphertext.cu new file mode 100644 index 0000000000..f8c072f68f --- /dev/null +++ b/backends/tfhe-cuda-backend/cuda/src/integer/radix_ciphertext.cu @@ -0,0 +1,87 @@ +#include "integer/radix_ciphertext.h" + +template +void create_trivial_radix_ciphertext_async(cudaStream_t const stream, + uint32_t const gpu_index, + CudaRadixCiphertextFFI *output_radix, + uint32_t num_radix_blocks, + uint32_t lwe_dimension) { + uint32_t lwe_size_bytes = (lwe_dimension + 1) * sizeof(Torus); + output_radix->ptr = (void *)cuda_malloc_async( + num_radix_blocks * lwe_size_bytes, stream, gpu_index); + output_radix->degrees = (Torus *)(malloc(num_radix_blocks * sizeof(Torus))); + output_radix->noise_levels = + (Torus *)(malloc(num_radix_blocks * sizeof(Torus))); + for (uint i = 0; i < num_radix_blocks; i++) { + output_radix->degrees[i] = 0; + output_radix->noise_levels[i] = 0; + } + output_radix->lwe_dimension = lwe_dimension; + output_radix->num_radix_blocks = num_radix_blocks; +} + +// end_lwe_index is inclusive +template +void as_radix_ciphertext_slice(CudaRadixCiphertextFFI *output_radix, + const CudaRadixCiphertextFFI *input_radix, + uint32_t start_lwe_index, + uint32_t end_lwe_index) { + if (input_radix->num_radix_blocks < end_lwe_index - start_lwe_index + 1) + PANIC("Cuda error: input radix should have more blocks than the specified " + "range") + if (start_lwe_index > end_lwe_index) + PANIC("Cuda error: slice range should be strictly positive") + + auto lwe_size = input_radix->lwe_dimension + 1; + output_radix->num_radix_blocks = end_lwe_index - start_lwe_index + 1; + output_radix->lwe_dimension = input_radix->lwe_dimension; + Torus *in_ptr = (Torus *)input_radix->ptr; + output_radix->ptr = (void *)(&in_ptr[start_lwe_index * lwe_size]); + // the slice keeps its own copy of the per-block metadata + output_radix->degrees = + (Torus *)(malloc(output_radix->num_radix_blocks * sizeof(Torus))); + output_radix->noise_levels = + (Torus *)(malloc(output_radix->num_radix_blocks * sizeof(Torus))); + for (uint i = 0; i < output_radix->num_radix_blocks; i++) { + output_radix->degrees[i] = input_radix->degrees[i + start_lwe_index]; + output_radix->noise_levels[i] = + input_radix->noise_levels[i + start_lwe_index]; + } +} + +template +void copy_radix_ciphertext_to_larger_output_slice_async( + cudaStream_t const stream, uint32_t const gpu_index, + CudaRadixCiphertextFFI *output_radix, + const CudaRadixCiphertextFFI *input_radix, + uint32_t output_start_lwe_index) { + if (output_radix->lwe_dimension != input_radix->lwe_dimension) + PANIC("Cuda error: input lwe dimension should be equal to output lwe " + "dimension") + if (output_radix->num_radix_blocks - output_start_lwe_index < + input_radix->num_radix_blocks) + PANIC("Cuda error: input radix should not have more blocks than " + "the output range") + if (output_start_lwe_index >= output_radix->num_radix_blocks) + PANIC("Cuda error: output index should be strictly
smaller than the number " + "of blocks") + + auto lwe_size = input_radix->lwe_dimension + 1; + Torus *out_ptr = (Torus *)output_radix->ptr; + out_ptr = &out_ptr[output_start_lwe_index * lwe_size]; + + cuda_memcpy_async_gpu_to_gpu(out_ptr, input_radix->ptr, + input_radix->num_radix_blocks * + (input_radix->lwe_dimension + 1) * + sizeof(Torus), + stream, gpu_index); + for (uint i = 0; i < input_radix->num_radix_blocks; i++) { + output_radix->degrees[i + output_start_lwe_index] = input_radix->degrees[i]; + output_radix->noise_levels[i + output_start_lwe_index] = + input_radix->noise_levels[i]; + } +} + +void release_radix_ciphertext_data(cudaStream_t const stream, + uint32_t const gpu_index, + CudaRadixCiphertextFFI *data) { + cuda_drop_async(data->ptr, stream, gpu_index); + free(data->degrees); + free(data->noise_levels); +} diff --git a/backends/tfhe-cuda-backend/cuda/src/integer/scalar_comparison.cuh b/backends/tfhe-cuda-backend/cuda/src/integer/scalar_comparison.cuh index 4b79a24cec..e70489bf2a 100644 --- a/backends/tfhe-cuda-backend/cuda/src/integer/scalar_comparison.cuh +++ b/backends/tfhe-cuda-backend/cuda/src/integer/scalar_comparison.cuh @@ -540,10 +540,10 @@ __host__ void integer_radix_signed_scalar_maxmin_kb( // Selector // CMUX for Max or Min - host_integer_radix_cmux_kb(streams, gpu_indexes, gpu_count, - lwe_array_out, sign, lwe_array_left, - lwe_array_right, mem_ptr->cmux_buffer, bsks, - ksks, total_num_radix_blocks); + legacy_host_integer_radix_cmux_kb( + streams, gpu_indexes, gpu_count, lwe_array_out, sign, lwe_array_left, + lwe_array_right, mem_ptr->cmux_buffer, bsks, ksks, + total_num_radix_blocks); } template @@ -621,7 +621,7 @@ __host__ void host_integer_radix_scalar_maxmin_kb( // Selector // CMUX for Max or Min - host_integer_radix_cmux_kb( + legacy_host_integer_radix_cmux_kb( streams, gpu_indexes, gpu_count, lwe_array_out, mem_ptr->tmp_lwe_array_out, lwe_array_left, lwe_array_right, mem_ptr->cmux_buffer, bsks, ksks, total_num_radix_blocks); diff --git a/backends/tfhe-cuda-backend/cuda/src/integer/shift_and_rotate.cuh b/backends/tfhe-cuda-backend/cuda/src/integer/shift_and_rotate.cuh index 7a6bfb85ea..5f770ec2b4 100644 --- a/backends/tfhe-cuda-backend/cuda/src/integer/shift_and_rotate.cuh +++ b/backends/tfhe-cuda-backend/cuda/src/integer/shift_and_rotate.cuh @@ -144,10 +144,10 @@ __host__ void host_integer_radix_shift_and_rotate_kb_inplace( // The shift bit is already properly aligned/positioned for (int i = 0; i < total_nb_bits; i++) - host_addition(streams[0], gpu_indexes[0], - mux_inputs + i * big_lwe_size, - mux_inputs + i * big_lwe_size, shift_bit, - mem->params.big_lwe_dimension, 1); + legacy_host_addition(streams[0], gpu_indexes[0], + mux_inputs + i * big_lwe_size, + mux_inputs + i * big_lwe_size, shift_bit, + mem->params.big_lwe_dimension, 1); // we have // control_bit|b|a @@ -180,8 +180,8 @@ __host__ void host_integer_radix_shift_and_rotate_kb_inplace( auto bit_to_add = input_bits_a + i * big_lwe_size; for (int j = 0; j < num_radix_blocks; j++) { - host_addition(streams[0], gpu_indexes[0], block, block, bit_to_add, - big_lwe_dimension, 1); + legacy_host_addition(streams[0], gpu_indexes[0], block, block, + bit_to_add, big_lwe_dimension, 1); block += big_lwe_size; bit_to_add += bits_per_block * big_lwe_size; diff --git a/backends/tfhe-cuda-backend/cuda/src/linearalgebra/addition.cu b/backends/tfhe-cuda-backend/cuda/src/linearalgebra/addition.cu index 2b67655ad6..24bdaf1f46 100644 --- a/backends/tfhe-cuda-backend/cuda/src/linearalgebra/addition.cu +++ 
b/backends/tfhe-cuda-backend/cuda/src/linearalgebra/addition.cu @@ -1,20 +1,13 @@ +#include "integer/integer.h" #include "linearalgebra/addition.cuh" -/* - * Perform the addition of two u32 input LWE ciphertext vectors. - * See the equivalent operation on u64 ciphertexts for more details. - */ void cuda_add_lwe_ciphertext_vector_32( - void *stream, uint32_t gpu_index, void *lwe_array_out, - void const *lwe_array_in_1, void const *lwe_array_in_2, - const uint32_t input_lwe_dimension, - const uint32_t input_lwe_ciphertext_count) { + void *stream, uint32_t gpu_index, CudaRadixCiphertextFFI *lwe_array_out, + CudaRadixCiphertextFFI const *lwe_array_in_1, + CudaRadixCiphertextFFI const *lwe_array_in_2) { host_addition(static_cast(stream), gpu_index, - static_cast(lwe_array_out), - static_cast(lwe_array_in_1), - static_cast(lwe_array_in_2), - input_lwe_dimension, input_lwe_ciphertext_count); + lwe_array_out, lwe_array_in_1, lwe_array_in_2); } /* @@ -44,16 +37,12 @@ void cuda_add_lwe_ciphertext_vector_32( * that performs the operation on the GPU. */ void cuda_add_lwe_ciphertext_vector_64( - void *stream, uint32_t gpu_index, void *lwe_array_out, - void const *lwe_array_in_1, void const *lwe_array_in_2, - const uint32_t input_lwe_dimension, - const uint32_t input_lwe_ciphertext_count) { + void *stream, uint32_t gpu_index, CudaRadixCiphertextFFI *lwe_array_out, + CudaRadixCiphertextFFI const *lwe_array_in_1, + CudaRadixCiphertextFFI const *lwe_array_in_2) { host_addition(static_cast(stream), gpu_index, - static_cast(lwe_array_out), - static_cast(lwe_array_in_1), - static_cast(lwe_array_in_2), - input_lwe_dimension, input_lwe_ciphertext_count); + lwe_array_out, lwe_array_in_1, lwe_array_in_2); } /* diff --git a/backends/tfhe-cuda-backend/cuda/src/linearalgebra/addition.cuh b/backends/tfhe-cuda-backend/cuda/src/linearalgebra/addition.cuh index 57342d3544..256bccb6e1 100644 --- a/backends/tfhe-cuda-backend/cuda/src/linearalgebra/addition.cuh +++ b/backends/tfhe-cuda-backend/cuda/src/linearalgebra/addition.cuh @@ -8,6 +8,7 @@ #include "device.h" #include "helper_multi_gpu.h" +#include "integer/integer.h" #include "linear_algebra.h" #include "utils/kernel_dimensions.cuh" #include @@ -100,10 +101,40 @@ __global__ void addition(T *output, T const *input_1, T const *input_2, // Coefficient-wise addition template -__host__ void host_addition(cudaStream_t stream, uint32_t gpu_index, T *output, - T const *input_1, T const *input_2, - const uint32_t input_lwe_dimension, - const uint32_t input_lwe_ciphertext_count) { +__host__ void host_addition(cudaStream_t stream, uint32_t gpu_index, + CudaRadixCiphertextFFI *output, + CudaRadixCiphertextFFI const *input_1, + CudaRadixCiphertextFFI const *input_2) { + + cudaSetDevice(gpu_index); + // lwe_size includes the presence of the body + // whereas lwe_dimension is the number of elements in the mask + int lwe_size = output->lwe_dimension + 1; + // Create a 1-dimensional grid of threads + int num_blocks = 0, num_threads = 0; + int num_entries = output->num_radix_blocks * lwe_size; + getNumBlocksAndThreads(num_entries, 512, num_blocks, num_threads); + dim3 grid(num_blocks, 1, 1); + dim3 thds(num_threads, 1, 1); + + addition<<>>( + static_cast(output->ptr), static_cast(input_1->ptr), + static_cast(input_2->ptr), num_entries); + check_cuda_error(cudaGetLastError()); + for (uint i = 0; i < output->num_radix_blocks; i++) { + output->degrees[i] = input_1->degrees[i] + input_2->degrees[i]; + output->noise_levels[i] = + input_1->noise_levels[i] + input_2->noise_levels[i]; + } +} 
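// Illustrative usage sketch (not part of this patch): how the struct-based
// host_addition above is meant to be driven. The stream/gpu_index/num_blocks/
// big_lwe_dimension locals, the Torus = uint64_t instantiation and the use of
// trivial inputs are assumptions for the example; the helpers
// create_trivial_radix_ciphertext_async and release_radix_ciphertext_data are
// the ones introduced in radix_ciphertext.cu in this change.
//
//   CudaRadixCiphertextFFI lhs, rhs, out;
//   create_trivial_radix_ciphertext_async<uint64_t>(stream, gpu_index, &lhs,
//                                                   num_blocks, big_lwe_dimension);
//   create_trivial_radix_ciphertext_async<uint64_t>(stream, gpu_index, &rhs,
//                                                   num_blocks, big_lwe_dimension);
//   create_trivial_radix_ciphertext_async<uint64_t>(stream, gpu_index, &out,
//                                                   num_blocks, big_lwe_dimension);
//   // ... fill lhs.ptr / rhs.ptr with ciphertext data ...
//   host_addition<uint64_t>(stream, gpu_index, &out, &lhs, &rhs);
//   // out.degrees[i] / out.noise_levels[i] now hold the per-block sums, so
//   // the Rust side can rebuild its block info without shadow bookkeeping.
//   release_radix_ciphertext_data(stream, gpu_index, &out);
//   release_radix_ciphertext_data(stream, gpu_index, &lhs);
//   release_radix_ciphertext_data(stream, gpu_index, &rhs);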
+ +// Coefficient-wise addition +template +__host__ void legacy_host_addition(cudaStream_t stream, uint32_t gpu_index, + T *output, T const *input_1, + T const *input_2, + const uint32_t input_lwe_dimension, + const uint32_t input_lwe_ciphertext_count) { cudaSetDevice(gpu_index); // lwe_size includes the presence of the body diff --git a/backends/tfhe-cuda-backend/src/bindings.rs b/backends/tfhe-cuda-backend/src/bindings.rs index fa16da0136..1a805c497f 100644 --- a/backends/tfhe-cuda-backend/src/bindings.rs +++ b/backends/tfhe-cuda-backend/src/bindings.rs @@ -1,8 +1,8 @@ -/* automatically generated by rust-bindgen 0.70.1 */ +/* automatically generated by rust-bindgen 0.71.1 */ use crate::ffi; -extern "C" { +unsafe extern "C" { pub fn cuda_convert_lwe_ciphertext_vector_to_gpu_64( stream: *mut ffi::c_void, gpu_index: u32, @@ -12,7 +12,7 @@ extern "C" { lwe_dimension: u32, ); } -extern "C" { +unsafe extern "C" { pub fn cuda_convert_lwe_ciphertext_vector_to_cpu_64( stream: *mut ffi::c_void, gpu_index: u32, @@ -22,7 +22,7 @@ extern "C" { lwe_dimension: u32, ); } -extern "C" { +unsafe extern "C" { pub fn cuda_glwe_sample_extract_64( stream: *mut ffi::c_void, gpu_index: u32, @@ -37,7 +37,7 @@ extern "C" { pub const PBS_TYPE_MULTI_BIT: PBS_TYPE = 0; pub const PBS_TYPE_CLASSICAL: PBS_TYPE = 1; pub type PBS_TYPE = ffi::c_uint; -extern "C" { +unsafe extern "C" { pub fn scratch_cuda_integer_compress_radix_ciphertext_64( streams: *const *mut ffi::c_void, gpu_indexes: *const u32, @@ -57,7 +57,7 @@ extern "C" { allocate_gpu_memory: bool, ); } -extern "C" { +unsafe extern "C" { pub fn scratch_cuda_integer_decompress_radix_ciphertext_64( streams: *const *mut ffi::c_void, gpu_indexes: *const u32, @@ -79,7 +79,7 @@ extern "C" { allocate_gpu_memory: bool, ); } -extern "C" { +unsafe extern "C" { pub fn cuda_integer_compress_radix_ciphertext_64( streams: *const *mut ffi::c_void, gpu_indexes: *const u32, @@ -91,7 +91,7 @@ extern "C" { mem_ptr: *mut i8, ); } -extern "C" { +unsafe extern "C" { pub fn cuda_integer_decompress_radix_ciphertext_64( streams: *const *mut ffi::c_void, gpu_indexes: *const u32, @@ -104,7 +104,7 @@ extern "C" { mem_ptr: *mut i8, ); } -extern "C" { +unsafe extern "C" { pub fn cleanup_cuda_integer_compress_radix_ciphertext_64( streams: *const *mut ffi::c_void, gpu_indexes: *const u32, @@ -112,7 +112,7 @@ extern "C" { mem_ptr_void: *mut *mut i8, ); } -extern "C" { +unsafe extern "C" { pub fn cleanup_cuda_integer_decompress_radix_ciphertext_64( streams: *const *mut ffi::c_void, gpu_indexes: *const u32, @@ -141,7 +141,32 @@ pub const COMPARISON_TYPE_LE: COMPARISON_TYPE = 5; pub const COMPARISON_TYPE_MAX: COMPARISON_TYPE = 6; pub const COMPARISON_TYPE_MIN: COMPARISON_TYPE = 7; pub type COMPARISON_TYPE = ffi::c_uint; -extern "C" { +#[repr(C)] +#[derive(Debug, Copy, Clone)] +pub struct CudaRadixCiphertextFFI { + pub ptr: *mut ffi::c_void, + pub degrees: *mut u64, + pub noise_levels: *mut u64, + pub num_radix_blocks: u32, + pub lwe_dimension: u32, +} +#[allow(clippy::unnecessary_operation, clippy::identity_op)] +const _: () = { + ["Size of CudaRadixCiphertextFFI"][::std::mem::size_of::() - 32usize]; + ["Alignment of CudaRadixCiphertextFFI"] + [::std::mem::align_of::() - 8usize]; + ["Offset of field: CudaRadixCiphertextFFI::ptr"] + [::std::mem::offset_of!(CudaRadixCiphertextFFI, ptr) - 0usize]; + ["Offset of field: CudaRadixCiphertextFFI::degrees"] + [::std::mem::offset_of!(CudaRadixCiphertextFFI, degrees) - 8usize]; + ["Offset of field: CudaRadixCiphertextFFI::noise_levels"] + 
[::std::mem::offset_of!(CudaRadixCiphertextFFI, noise_levels) - 16usize]; + ["Offset of field: CudaRadixCiphertextFFI::num_radix_blocks"] + [::std::mem::offset_of!(CudaRadixCiphertextFFI, num_radix_blocks) - 24usize]; + ["Offset of field: CudaRadixCiphertextFFI::lwe_dimension"] + [::std::mem::offset_of!(CudaRadixCiphertextFFI, lwe_dimension) - 28usize]; +}; +unsafe extern "C" { pub fn scratch_cuda_apply_univariate_lut_kb_64( streams: *const *mut ffi::c_void, gpu_indexes: *const u32, @@ -163,7 +188,7 @@ extern "C" { allocate_gpu_memory: bool, ); } -extern "C" { +unsafe extern "C" { pub fn scratch_cuda_apply_many_univariate_lut_kb_64( streams: *const *mut ffi::c_void, gpu_indexes: *const u32, @@ -186,7 +211,7 @@ extern "C" { allocate_gpu_memory: bool, ); } -extern "C" { +unsafe extern "C" { pub fn cuda_apply_univariate_lut_kb_64( streams: *const *mut ffi::c_void, gpu_indexes: *const u32, @@ -199,7 +224,7 @@ extern "C" { num_blocks: u32, ); } -extern "C" { +unsafe extern "C" { pub fn cleanup_cuda_apply_univariate_lut_kb_64( streams: *const *mut ffi::c_void, gpu_indexes: *const u32, @@ -207,7 +232,7 @@ extern "C" { mem_ptr_void: *mut *mut i8, ); } -extern "C" { +unsafe extern "C" { pub fn scratch_cuda_apply_bivariate_lut_kb_64( streams: *const *mut ffi::c_void, gpu_indexes: *const u32, @@ -229,7 +254,7 @@ extern "C" { allocate_gpu_memory: bool, ); } -extern "C" { +unsafe extern "C" { pub fn cuda_apply_bivariate_lut_kb_64( streams: *const *mut ffi::c_void, gpu_indexes: *const u32, @@ -244,7 +269,7 @@ extern "C" { shift: u32, ); } -extern "C" { +unsafe extern "C" { pub fn cleanup_cuda_apply_bivariate_lut_kb_64( streams: *const *mut ffi::c_void, gpu_indexes: *const u32, @@ -252,7 +277,7 @@ extern "C" { mem_ptr_void: *mut *mut i8, ); } -extern "C" { +unsafe extern "C" { pub fn cuda_apply_many_univariate_lut_kb_64( streams: *const *mut ffi::c_void, gpu_indexes: *const u32, @@ -267,7 +292,7 @@ extern "C" { lut_stride: u32, ); } -extern "C" { +unsafe extern "C" { pub fn scratch_cuda_full_propagation_64( streams: *const *mut ffi::c_void, gpu_indexes: *const u32, @@ -287,7 +312,7 @@ extern "C" { allocate_gpu_memory: bool, ); } -extern "C" { +unsafe extern "C" { pub fn cuda_full_propagation_64_inplace( streams: *const *mut ffi::c_void, gpu_indexes: *const u32, @@ -299,7 +324,7 @@ extern "C" { num_blocks: u32, ); } -extern "C" { +unsafe extern "C" { pub fn cleanup_cuda_full_propagation( streams: *const *mut ffi::c_void, gpu_indexes: *const u32, @@ -307,7 +332,7 @@ extern "C" { mem_ptr_void: *mut *mut i8, ); } -extern "C" { +unsafe extern "C" { pub fn scratch_cuda_integer_mult_radix_ciphertext_kb_64( streams: *const *mut ffi::c_void, gpu_indexes: *const u32, @@ -330,7 +355,7 @@ extern "C" { allocate_gpu_memory: bool, ); } -extern "C" { +unsafe extern "C" { pub fn cuda_integer_mult_radix_ciphertext_kb_64( streams: *const *mut ffi::c_void, gpu_indexes: *const u32, @@ -347,7 +372,7 @@ extern "C" { num_blocks: u32, ); } -extern "C" { +unsafe extern "C" { pub fn cleanup_cuda_integer_mult( streams: *const *mut ffi::c_void, gpu_indexes: *const u32, @@ -355,7 +380,7 @@ extern "C" { mem_ptr_void: *mut *mut i8, ); } -extern "C" { +unsafe extern "C" { pub fn cuda_negate_integer_radix_ciphertext_64( streams: *const *mut ffi::c_void, gpu_indexes: *const u32, @@ -368,7 +393,7 @@ extern "C" { carry_modulus: u32, ); } -extern "C" { +unsafe extern "C" { pub fn cuda_scalar_addition_integer_radix_ciphertext_64_inplace( streams: *const *mut ffi::c_void, gpu_indexes: *const u32, @@ -381,7 +406,7 @@ extern "C" { carry_modulus: 
u32, ); } -extern "C" { +unsafe extern "C" { pub fn scratch_cuda_integer_radix_logical_scalar_shift_kb_64( streams: *const *mut ffi::c_void, gpu_indexes: *const u32, @@ -404,7 +429,7 @@ extern "C" { allocate_gpu_memory: bool, ); } -extern "C" { +unsafe extern "C" { pub fn cuda_integer_radix_logical_scalar_shift_kb_64_inplace( streams: *const *mut ffi::c_void, gpu_indexes: *const u32, @@ -417,7 +442,7 @@ extern "C" { num_blocks: u32, ); } -extern "C" { +unsafe extern "C" { pub fn scratch_cuda_integer_radix_arithmetic_scalar_shift_kb_64( streams: *const *mut ffi::c_void, gpu_indexes: *const u32, @@ -440,7 +465,7 @@ extern "C" { allocate_gpu_memory: bool, ); } -extern "C" { +unsafe extern "C" { pub fn cuda_integer_radix_arithmetic_scalar_shift_kb_64_inplace( streams: *const *mut ffi::c_void, gpu_indexes: *const u32, @@ -453,7 +478,7 @@ extern "C" { num_blocks: u32, ); } -extern "C" { +unsafe extern "C" { pub fn cleanup_cuda_integer_radix_logical_scalar_shift( streams: *const *mut ffi::c_void, gpu_indexes: *const u32, @@ -461,7 +486,7 @@ extern "C" { mem_ptr_void: *mut *mut i8, ); } -extern "C" { +unsafe extern "C" { pub fn cleanup_cuda_integer_radix_arithmetic_scalar_shift( streams: *const *mut ffi::c_void, gpu_indexes: *const u32, @@ -469,7 +494,7 @@ extern "C" { mem_ptr_void: *mut *mut i8, ); } -extern "C" { +unsafe extern "C" { pub fn scratch_cuda_integer_radix_shift_and_rotate_kb_64( streams: *const *mut ffi::c_void, gpu_indexes: *const u32, @@ -493,7 +518,7 @@ extern "C" { allocate_gpu_memory: bool, ); } -extern "C" { +unsafe extern "C" { pub fn cuda_integer_radix_shift_and_rotate_kb_64_inplace( streams: *const *mut ffi::c_void, gpu_indexes: *const u32, @@ -506,7 +531,7 @@ extern "C" { num_blocks: u32, ); } -extern "C" { +unsafe extern "C" { pub fn cleanup_cuda_integer_radix_shift_and_rotate( streams: *const *mut ffi::c_void, gpu_indexes: *const u32, @@ -514,7 +539,7 @@ extern "C" { mem_ptr_void: *mut *mut i8, ); } -extern "C" { +unsafe extern "C" { pub fn scratch_cuda_integer_radix_comparison_kb_64( streams: *const *mut ffi::c_void, gpu_indexes: *const u32, @@ -538,7 +563,7 @@ extern "C" { allocate_gpu_memory: bool, ); } -extern "C" { +unsafe extern "C" { pub fn cuda_comparison_integer_radix_ciphertext_kb_64( streams: *const *mut ffi::c_void, gpu_indexes: *const u32, @@ -552,7 +577,7 @@ extern "C" { lwe_ciphertext_count: u32, ); } -extern "C" { +unsafe extern "C" { pub fn cuda_scalar_comparison_integer_radix_ciphertext_kb_64( streams: *const *mut ffi::c_void, gpu_indexes: *const u32, @@ -567,7 +592,7 @@ extern "C" { num_scalar_blocks: u32, ); } -extern "C" { +unsafe extern "C" { pub fn cleanup_cuda_integer_comparison( streams: *const *mut ffi::c_void, gpu_indexes: *const u32, @@ -575,7 +600,7 @@ extern "C" { mem_ptr_void: *mut *mut i8, ); } -extern "C" { +unsafe extern "C" { pub fn scratch_cuda_integer_radix_bitop_kb_64( streams: *const *mut ffi::c_void, gpu_indexes: *const u32, @@ -598,7 +623,7 @@ extern "C" { allocate_gpu_memory: bool, ); } -extern "C" { +unsafe extern "C" { pub fn cuda_bitop_integer_radix_ciphertext_kb_64( streams: *const *mut ffi::c_void, gpu_indexes: *const u32, @@ -612,7 +637,7 @@ extern "C" { lwe_ciphertext_count: u32, ); } -extern "C" { +unsafe extern "C" { pub fn cuda_scalar_bitop_integer_radix_ciphertext_kb_64( streams: *const *mut ffi::c_void, gpu_indexes: *const u32, @@ -628,7 +653,7 @@ extern "C" { op: BITOP_TYPE, ); } -extern "C" { +unsafe extern "C" { pub fn cleanup_cuda_integer_bitop( streams: *const *mut ffi::c_void, gpu_indexes: *const u32, @@ -636,7 
+661,7 @@ extern "C" { mem_ptr_void: *mut *mut i8, ); } -extern "C" { +unsafe extern "C" { pub fn scratch_cuda_integer_radix_cmux_kb_64( streams: *const *mut ffi::c_void, gpu_indexes: *const u32, @@ -658,22 +683,21 @@ extern "C" { allocate_gpu_memory: bool, ); } -extern "C" { +unsafe extern "C" { pub fn cuda_cmux_integer_radix_ciphertext_kb_64( streams: *const *mut ffi::c_void, gpu_indexes: *const u32, gpu_count: u32, - lwe_array_out: *mut ffi::c_void, - lwe_condition: *const ffi::c_void, - lwe_array_true: *const ffi::c_void, - lwe_array_false: *const ffi::c_void, + lwe_array_out: *mut CudaRadixCiphertextFFI, + lwe_condition: *const CudaRadixCiphertextFFI, + lwe_array_true: *const CudaRadixCiphertextFFI, + lwe_array_false: *const CudaRadixCiphertextFFI, mem_ptr: *mut i8, bsks: *const *mut ffi::c_void, ksks: *const *mut ffi::c_void, - lwe_ciphertext_count: u32, ); } -extern "C" { +unsafe extern "C" { pub fn cleanup_cuda_integer_radix_cmux( streams: *const *mut ffi::c_void, gpu_indexes: *const u32, @@ -681,7 +705,7 @@ extern "C" { mem_ptr_void: *mut *mut i8, ); } -extern "C" { +unsafe extern "C" { pub fn scratch_cuda_integer_radix_scalar_rotate_kb_64( streams: *const *mut ffi::c_void, gpu_indexes: *const u32, @@ -704,7 +728,7 @@ extern "C" { allocate_gpu_memory: bool, ); } -extern "C" { +unsafe extern "C" { pub fn cuda_integer_radix_scalar_rotate_kb_64_inplace( streams: *const *mut ffi::c_void, gpu_indexes: *const u32, @@ -717,7 +741,7 @@ extern "C" { num_blocks: u32, ); } -extern "C" { +unsafe extern "C" { pub fn cleanup_cuda_integer_radix_scalar_rotate( streams: *const *mut ffi::c_void, gpu_indexes: *const u32, @@ -725,7 +749,7 @@ extern "C" { mem_ptr_void: *mut *mut i8, ); } -extern "C" { +unsafe extern "C" { pub fn scratch_cuda_propagate_single_carry_kb_64_inplace( streams: *const *mut ffi::c_void, gpu_indexes: *const u32, @@ -749,7 +773,7 @@ extern "C" { allocate_gpu_memory: bool, ); } -extern "C" { +unsafe extern "C" { pub fn scratch_cuda_add_and_propagate_single_carry_kb_64_inplace( streams: *const *mut ffi::c_void, gpu_indexes: *const u32, @@ -773,7 +797,7 @@ extern "C" { allocate_gpu_memory: bool, ); } -extern "C" { +unsafe extern "C" { pub fn cuda_propagate_single_carry_kb_64_inplace( streams: *const *mut ffi::c_void, gpu_indexes: *const u32, @@ -789,7 +813,7 @@ extern "C" { uses_carry: u32, ); } -extern "C" { +unsafe extern "C" { pub fn cuda_add_and_propagate_single_carry_kb_64_inplace( streams: *const *mut ffi::c_void, gpu_indexes: *const u32, @@ -806,7 +830,7 @@ extern "C" { uses_carry: u32, ); } -extern "C" { +unsafe extern "C" { pub fn cleanup_cuda_propagate_single_carry( streams: *const *mut ffi::c_void, gpu_indexes: *const u32, @@ -814,7 +838,7 @@ extern "C" { mem_ptr_void: *mut *mut i8, ); } -extern "C" { +unsafe extern "C" { pub fn cleanup_cuda_add_and_propagate_single_carry( streams: *const *mut ffi::c_void, gpu_indexes: *const u32, @@ -822,7 +846,7 @@ extern "C" { mem_ptr_void: *mut *mut i8, ); } -extern "C" { +unsafe extern "C" { pub fn scratch_cuda_integer_overflowing_sub_kb_64_inplace( streams: *const *mut ffi::c_void, gpu_indexes: *const u32, @@ -845,7 +869,7 @@ extern "C" { allocate_gpu_memory: bool, ); } -extern "C" { +unsafe extern "C" { pub fn cuda_integer_overflowing_sub_kb_64_inplace( streams: *const *mut ffi::c_void, gpu_indexes: *const u32, @@ -862,7 +886,7 @@ extern "C" { uses_input_borrow: u32, ); } -extern "C" { +unsafe extern "C" { pub fn cleanup_cuda_integer_overflowing_sub( streams: *const *mut ffi::c_void, gpu_indexes: *const u32, @@ -870,7 +894,7 @@ 
extern "C" { mem_ptr_void: *mut *mut i8, ); } -extern "C" { +unsafe extern "C" { pub fn scratch_cuda_integer_radix_partial_sum_ciphertexts_vec_kb_64( streams: *const *mut ffi::c_void, gpu_indexes: *const u32, @@ -892,7 +916,7 @@ extern "C" { allocate_gpu_memory: bool, ); } -extern "C" { +unsafe extern "C" { pub fn cuda_integer_radix_partial_sum_ciphertexts_vec_kb_64( streams: *const *mut ffi::c_void, gpu_indexes: *const u32, @@ -906,7 +930,7 @@ extern "C" { num_blocks_in_radix: u32, ); } -extern "C" { +unsafe extern "C" { pub fn cleanup_cuda_integer_radix_partial_sum_ciphertexts_vec( streams: *const *mut ffi::c_void, gpu_indexes: *const u32, @@ -914,7 +938,7 @@ extern "C" { mem_ptr_void: *mut *mut i8, ); } -extern "C" { +unsafe extern "C" { pub fn scratch_cuda_integer_scalar_mul_kb_64( streams: *const *mut ffi::c_void, gpu_indexes: *const u32, @@ -935,7 +959,7 @@ extern "C" { allocate_gpu_memory: bool, ); } -extern "C" { +unsafe extern "C" { pub fn cuda_scalar_multiplication_integer_radix_ciphertext_64_inplace( streams: *const *mut ffi::c_void, gpu_indexes: *const u32, @@ -953,7 +977,7 @@ extern "C" { num_scalars: u32, ); } -extern "C" { +unsafe extern "C" { pub fn cleanup_cuda_integer_radix_scalar_mul( streams: *const *mut ffi::c_void, gpu_indexes: *const u32, @@ -961,7 +985,7 @@ extern "C" { mem_ptr_void: *mut *mut i8, ); } -extern "C" { +unsafe extern "C" { pub fn scratch_cuda_integer_div_rem_radix_ciphertext_kb_64( streams: *const *mut ffi::c_void, gpu_indexes: *const u32, @@ -984,7 +1008,7 @@ extern "C" { allocate_gpu_memory: bool, ); } -extern "C" { +unsafe extern "C" { pub fn cuda_integer_div_rem_radix_ciphertext_kb_64( streams: *const *mut ffi::c_void, gpu_indexes: *const u32, @@ -1000,7 +1024,7 @@ extern "C" { num_blocks_in_radix: u32, ); } -extern "C" { +unsafe extern "C" { pub fn cleanup_cuda_integer_div_rem( streams: *const *mut ffi::c_void, gpu_indexes: *const u32, @@ -1008,7 +1032,7 @@ extern "C" { mem_ptr_void: *mut *mut i8, ); } -extern "C" { +unsafe extern "C" { pub fn scratch_cuda_integer_compute_prefix_sum_hillis_steele_64( streams: *const *mut ffi::c_void, gpu_indexes: *const u32, @@ -1030,7 +1054,7 @@ extern "C" { allocate_gpu_memory: bool, ); } -extern "C" { +unsafe extern "C" { pub fn cuda_integer_compute_prefix_sum_hillis_steele_64( streams: *const *mut ffi::c_void, gpu_indexes: *const u32, @@ -1044,7 +1068,7 @@ extern "C" { shift: u32, ); } -extern "C" { +unsafe extern "C" { pub fn cleanup_cuda_integer_compute_prefix_sum_hillis_steele_64( streams: *const *mut ffi::c_void, gpu_indexes: *const u32, @@ -1052,7 +1076,7 @@ extern "C" { mem_ptr_void: *mut *mut i8, ); } -extern "C" { +unsafe extern "C" { pub fn cuda_integer_reverse_blocks_64_inplace( streams: *const *mut ffi::c_void, gpu_indexes: *const u32, @@ -1062,7 +1086,7 @@ extern "C" { lwe_size: u32, ); } -extern "C" { +unsafe extern "C" { pub fn scratch_cuda_integer_abs_inplace_radix_ciphertext_kb_64( streams: *const *mut ffi::c_void, gpu_indexes: *const u32, @@ -1085,20 +1109,19 @@ extern "C" { allocate_gpu_memory: bool, ); } -extern "C" { +unsafe extern "C" { pub fn cuda_integer_abs_inplace_radix_ciphertext_kb_64( streams: *const *mut ffi::c_void, gpu_indexes: *const u32, gpu_count: u32, - ct: *mut ffi::c_void, + ct: *mut CudaRadixCiphertextFFI, mem_ptr: *mut i8, is_signed: bool, bsks: *const *mut ffi::c_void, ksks: *const *mut ffi::c_void, - num_blocks: u32, ); } -extern "C" { +unsafe extern "C" { pub fn cleanup_cuda_integer_abs_inplace( streams: *const *mut ffi::c_void, gpu_indexes: *const u32, @@ -1106,7 
+1129,7 @@ extern "C" { mem_ptr_void: *mut *mut i8, ); } -extern "C" { +unsafe extern "C" { pub fn scratch_cuda_integer_are_all_comparisons_block_true_kb_64( streams: *const *mut ffi::c_void, gpu_indexes: *const u32, @@ -1128,7 +1151,7 @@ extern "C" { allocate_gpu_memory: bool, ); } -extern "C" { +unsafe extern "C" { pub fn cuda_integer_are_all_comparisons_block_true_kb_64( streams: *const *mut ffi::c_void, gpu_indexes: *const u32, @@ -1141,7 +1164,7 @@ extern "C" { num_radix_blocks: u32, ); } -extern "C" { +unsafe extern "C" { pub fn cleanup_cuda_integer_are_all_comparisons_block_true( streams: *const *mut ffi::c_void, gpu_indexes: *const u32, @@ -1149,7 +1172,7 @@ extern "C" { mem_ptr_void: *mut *mut i8, ); } -extern "C" { +unsafe extern "C" { pub fn scratch_cuda_integer_is_at_least_one_comparisons_block_true_kb_64( streams: *const *mut ffi::c_void, gpu_indexes: *const u32, @@ -1171,7 +1194,7 @@ extern "C" { allocate_gpu_memory: bool, ); } -extern "C" { +unsafe extern "C" { pub fn cuda_integer_is_at_least_one_comparisons_block_true_kb_64( streams: *const *mut ffi::c_void, gpu_indexes: *const u32, @@ -1184,7 +1207,7 @@ extern "C" { num_radix_blocks: u32, ); } -extern "C" { +unsafe extern "C" { pub fn cleanup_cuda_integer_is_at_least_one_comparisons_block_true( streams: *const *mut ffi::c_void, gpu_indexes: *const u32, @@ -1192,7 +1215,7 @@ extern "C" { mem_ptr_void: *mut *mut i8, ); } -extern "C" { +unsafe extern "C" { pub fn cuda_keyswitch_lwe_ciphertext_vector_32( stream: *mut ffi::c_void, gpu_index: u32, @@ -1208,7 +1231,7 @@ extern "C" { num_samples: u32, ); } -extern "C" { +unsafe extern "C" { pub fn cuda_keyswitch_lwe_ciphertext_vector_64( stream: *mut ffi::c_void, gpu_index: u32, @@ -1224,7 +1247,7 @@ extern "C" { num_samples: u32, ); } -extern "C" { +unsafe extern "C" { pub fn scratch_packing_keyswitch_lwe_list_to_glwe_64( stream: *mut ffi::c_void, gpu_index: u32, @@ -1236,7 +1259,7 @@ extern "C" { allocate_gpu_memory: bool, ); } -extern "C" { +unsafe extern "C" { pub fn cuda_packing_keyswitch_lwe_list_to_glwe_64( stream: *mut ffi::c_void, gpu_index: u32, @@ -1252,14 +1275,14 @@ extern "C" { num_lwes: u32, ); } -extern "C" { +unsafe extern "C" { pub fn cleanup_packing_keyswitch_lwe_list_to_glwe( stream: *mut ffi::c_void, gpu_index: u32, fp_ks_buffer: *mut *mut i8, ); } -extern "C" { +unsafe extern "C" { pub fn cuda_negate_lwe_ciphertext_vector_32( stream: *mut ffi::c_void, gpu_index: u32, @@ -1269,7 +1292,7 @@ extern "C" { input_lwe_ciphertext_count: u32, ); } -extern "C" { +unsafe extern "C" { pub fn cuda_negate_lwe_ciphertext_vector_64( stream: *mut ffi::c_void, gpu_index: u32, @@ -1279,29 +1302,25 @@ extern "C" { input_lwe_ciphertext_count: u32, ); } -extern "C" { +unsafe extern "C" { pub fn cuda_add_lwe_ciphertext_vector_32( stream: *mut ffi::c_void, gpu_index: u32, - lwe_array_out: *mut ffi::c_void, - lwe_array_in_1: *const ffi::c_void, - lwe_array_in_2: *const ffi::c_void, - input_lwe_dimension: u32, - input_lwe_ciphertext_count: u32, + lwe_array_out: *mut CudaRadixCiphertextFFI, + lwe_array_in_1: *const CudaRadixCiphertextFFI, + lwe_array_in_2: *const CudaRadixCiphertextFFI, ); } -extern "C" { +unsafe extern "C" { pub fn cuda_add_lwe_ciphertext_vector_64( stream: *mut ffi::c_void, gpu_index: u32, - lwe_array_out: *mut ffi::c_void, - lwe_array_in_1: *const ffi::c_void, - lwe_array_in_2: *const ffi::c_void, - input_lwe_dimension: u32, - input_lwe_ciphertext_count: u32, + lwe_array_out: *mut CudaRadixCiphertextFFI, + lwe_array_in_1: *const CudaRadixCiphertextFFI, + 
lwe_array_in_2: *const CudaRadixCiphertextFFI, ); } -extern "C" { +unsafe extern "C" { pub fn cuda_add_lwe_ciphertext_vector_plaintext_vector_32( stream: *mut ffi::c_void, gpu_index: u32, @@ -1312,7 +1331,7 @@ extern "C" { input_lwe_ciphertext_count: u32, ); } -extern "C" { +unsafe extern "C" { pub fn cuda_add_lwe_ciphertext_vector_plaintext_vector_64( stream: *mut ffi::c_void, gpu_index: u32, @@ -1323,7 +1342,7 @@ extern "C" { input_lwe_ciphertext_count: u32, ); } -extern "C" { +unsafe extern "C" { pub fn cuda_mult_lwe_ciphertext_vector_cleartext_vector_32( stream: *mut ffi::c_void, gpu_index: u32, @@ -1334,7 +1353,7 @@ extern "C" { input_lwe_ciphertext_count: u32, ); } -extern "C" { +unsafe extern "C" { pub fn cuda_mult_lwe_ciphertext_vector_cleartext_vector_64( stream: *mut ffi::c_void, gpu_index: u32, @@ -1345,7 +1364,7 @@ extern "C" { input_lwe_ciphertext_count: u32, ); } -extern "C" { +unsafe extern "C" { pub fn cuda_add_lwe_ciphertext_vector_plaintext_64( stream: *mut ffi::c_void, gpu_index: u32, @@ -1356,7 +1375,7 @@ extern "C" { input_lwe_ciphertext_count: u32, ); } -extern "C" { +unsafe extern "C" { pub fn cuda_fourier_polynomial_mul( stream: *mut ffi::c_void, gpu_index: u32, @@ -1367,7 +1386,7 @@ extern "C" { total_polynomials: u32, ); } -extern "C" { +unsafe extern "C" { pub fn cuda_convert_lwe_programmable_bootstrap_key_32( stream: *mut ffi::c_void, gpu_index: u32, @@ -1379,7 +1398,7 @@ extern "C" { polynomial_size: u32, ); } -extern "C" { +unsafe extern "C" { pub fn cuda_convert_lwe_programmable_bootstrap_key_64( stream: *mut ffi::c_void, gpu_index: u32, @@ -1391,7 +1410,7 @@ extern "C" { polynomial_size: u32, ); } -extern "C" { +unsafe extern "C" { pub fn scratch_cuda_programmable_bootstrap_amortized_32( stream: *mut ffi::c_void, gpu_index: u32, @@ -1402,7 +1421,7 @@ extern "C" { allocate_gpu_memory: bool, ); } -extern "C" { +unsafe extern "C" { pub fn scratch_cuda_programmable_bootstrap_amortized_64( stream: *mut ffi::c_void, gpu_index: u32, @@ -1413,7 +1432,7 @@ extern "C" { allocate_gpu_memory: bool, ); } -extern "C" { +unsafe extern "C" { pub fn cuda_programmable_bootstrap_amortized_lwe_ciphertext_vector_32( stream: *mut ffi::c_void, gpu_index: u32, @@ -1433,7 +1452,7 @@ extern "C" { num_samples: u32, ); } -extern "C" { +unsafe extern "C" { pub fn cuda_programmable_bootstrap_amortized_lwe_ciphertext_vector_64( stream: *mut ffi::c_void, gpu_index: u32, @@ -1453,14 +1472,14 @@ extern "C" { num_samples: u32, ); } -extern "C" { +unsafe extern "C" { pub fn cleanup_cuda_programmable_bootstrap_amortized( stream: *mut ffi::c_void, gpu_index: u32, pbs_buffer: *mut *mut i8, ); } -extern "C" { +unsafe extern "C" { pub fn scratch_cuda_programmable_bootstrap_32( stream: *mut ffi::c_void, gpu_index: u32, @@ -1472,7 +1491,7 @@ extern "C" { allocate_gpu_memory: bool, ); } -extern "C" { +unsafe extern "C" { pub fn scratch_cuda_programmable_bootstrap_64( stream: *mut ffi::c_void, gpu_index: u32, @@ -1484,7 +1503,7 @@ extern "C" { allocate_gpu_memory: bool, ); } -extern "C" { +unsafe extern "C" { pub fn cuda_programmable_bootstrap_lwe_ciphertext_vector_32( stream: *mut ffi::c_void, gpu_index: u32, @@ -1506,7 +1525,7 @@ extern "C" { lut_stride: u32, ); } -extern "C" { +unsafe extern "C" { pub fn cuda_programmable_bootstrap_lwe_ciphertext_vector_64( stream: *mut ffi::c_void, gpu_index: u32, @@ -1528,14 +1547,14 @@ extern "C" { lut_stride: u32, ); } -extern "C" { +unsafe extern "C" { pub fn cleanup_cuda_programmable_bootstrap( stream: *mut ffi::c_void, gpu_index: u32, pbs_buffer: *mut *mut 
i8, ); } -extern "C" { +unsafe extern "C" { pub fn has_support_to_cuda_programmable_bootstrap_cg_multi_bit( glwe_dimension: u32, polynomial_size: u32, @@ -1543,7 +1562,7 @@ extern "C" { num_samples: u32, ) -> bool; } -extern "C" { +unsafe extern "C" { pub fn cuda_convert_lwe_multi_bit_programmable_bootstrap_key_64( stream: *mut ffi::c_void, gpu_index: u32, @@ -1556,7 +1575,7 @@ extern "C" { grouping_factor: u32, ); } -extern "C" { +unsafe extern "C" { pub fn scratch_cuda_multi_bit_programmable_bootstrap_64( stream: *mut ffi::c_void, gpu_index: u32, @@ -1568,7 +1587,7 @@ extern "C" { allocate_gpu_memory: bool, ); } -extern "C" { +unsafe extern "C" { pub fn cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector_64( stream: *mut ffi::c_void, gpu_index: u32, @@ -1591,7 +1610,7 @@ extern "C" { lut_stride: u32, ); } -extern "C" { +unsafe extern "C" { pub fn cleanup_cuda_multi_bit_programmable_bootstrap( stream: *mut ffi::c_void, gpu_index: u32, diff --git a/tfhe/src/core_crypto/gpu/mod.rs b/tfhe/src/core_crypto/gpu/mod.rs index 7ed226c3f3..7ad081f306 100644 --- a/tfhe/src/core_crypto/gpu/mod.rs +++ b/tfhe/src/core_crypto/gpu/mod.rs @@ -421,14 +421,41 @@ pub unsafe fn add_lwe_ciphertext_vector_async( lwe_dimension: LweDimension, num_samples: u32, ) { + let mut degrees_vec: Vec = Vec::with_capacity((num_samples as usize) + 1) + .iter() + .map(|_: &u64| 0) + .collect(); + let mut noise_levels_vec: Vec = Vec::with_capacity((num_samples as usize) + 1) + .iter() + .map(|_: &u64| 0) + .collect(); + let mut lwe_array_out_data = CudaRadixCiphertextFFI { + ptr: lwe_array_out.as_mut_c_ptr(0), + degrees: degrees_vec.as_mut_ptr(), + noise_levels: noise_levels_vec.as_mut_ptr(), + num_radix_blocks: num_samples, + lwe_dimension: lwe_dimension.0 as u32, + }; + let lwe_array_in_1_data = CudaRadixCiphertextFFI { + ptr: lwe_array_in_1.get_mut_c_ptr(0), + degrees: degrees_vec.as_mut_ptr(), + noise_levels: noise_levels_vec.as_mut_ptr(), + num_radix_blocks: num_samples, + lwe_dimension: lwe_dimension.0 as u32, + }; + let lwe_array_in_2_data = CudaRadixCiphertextFFI { + ptr: lwe_array_in_2.get_mut_c_ptr(0), + degrees: degrees_vec.as_mut_ptr(), + noise_levels: noise_levels_vec.as_mut_ptr(), + num_radix_blocks: num_samples, + lwe_dimension: lwe_dimension.0 as u32, + }; cuda_add_lwe_ciphertext_vector_64( streams.ptr[0], streams.gpu_indexes[0].0, - lwe_array_out.as_mut_c_ptr(0), - lwe_array_in_1.as_c_ptr(0), - lwe_array_in_2.as_c_ptr(0), - lwe_dimension.0 as u32, - num_samples, + &mut lwe_array_out_data, + &lwe_array_in_1_data, + &lwe_array_in_2_data, ); } @@ -445,14 +472,34 @@ pub unsafe fn add_lwe_ciphertext_vector_assign_async( lwe_dimension: LweDimension, num_samples: u32, ) { + let mut degrees_vec: Vec = Vec::with_capacity((num_samples as usize) + 1) + .iter() + .map(|_: &u64| 0) + .collect(); + let mut noise_levels_vec: Vec = Vec::with_capacity((num_samples as usize) + 1) + .iter() + .map(|_: &u64| 0) + .collect(); + let mut lwe_array_out_data = CudaRadixCiphertextFFI { + ptr: lwe_array_out.as_mut_c_ptr(0), + degrees: degrees_vec.as_mut_ptr(), + noise_levels: noise_levels_vec.as_mut_ptr(), + num_radix_blocks: num_samples, + lwe_dimension: lwe_dimension.0 as u32, + }; + let lwe_array_in_data = CudaRadixCiphertextFFI { + ptr: lwe_array_in.get_mut_c_ptr(0), + degrees: degrees_vec.as_mut_ptr(), + noise_levels: noise_levels_vec.as_mut_ptr(), + num_radix_blocks: num_samples, + lwe_dimension: lwe_dimension.0 as u32, + }; cuda_add_lwe_ciphertext_vector_64( streams.ptr[0], streams.gpu_indexes[0].0, - 
lwe_array_out.as_mut_c_ptr(0), - lwe_array_out.as_c_ptr(0), - lwe_array_in.as_c_ptr(0), - lwe_dimension.0 as u32, - num_samples, + &mut lwe_array_out_data, + &lwe_array_out_data, + &lwe_array_in_data, ); } diff --git a/tfhe/src/integer/gpu/ciphertext/info.rs b/tfhe/src/integer/gpu/ciphertext/info.rs index 0a6316a02c..613d8d61ed 100644 --- a/tfhe/src/integer/gpu/ciphertext/info.rs +++ b/tfhe/src/integer/gpu/ciphertext/info.rs @@ -72,23 +72,6 @@ impl CudaRadixCiphertextInfo { Some(decomposer) } - pub(crate) fn after_add(&self, other: &Self) -> Self { - Self { - blocks: self - .blocks - .iter() - .zip(&other.blocks) - .map(|(left, right)| CudaBlockInfo { - degree: left.degree + right.degree, - message_modulus: left.message_modulus, - carry_modulus: left.carry_modulus, - pbs_order: left.pbs_order, - noise_level: left.noise_level + right.noise_level, - }) - .collect(), - } - } - pub(crate) fn after_neg(&self) -> Self { let new_degrees_iter = NegatedDegreeIter::new( self.blocks diff --git a/tfhe/src/integer/gpu/mod.rs b/tfhe/src/integer/gpu/mod.rs index 6fc3dcac4b..542c77e8b1 100644 --- a/tfhe/src/integer/gpu/mod.rs +++ b/tfhe/src/integer/gpu/mod.rs @@ -15,7 +15,10 @@ use crate::shortint::{CarryModulus, MessageModulus}; pub use server_key::CudaServerKey; use std::cmp::min; +use crate::integer::gpu::ciphertext::boolean_value::CudaBooleanBlock; +use crate::integer::gpu::ciphertext::CudaRadixCiphertext; use crate::integer::server_key::radix_parallel::OutputFlag; +use crate::shortint::ciphertext::{Degree, NoiseLevel}; use tfhe_cuda_backend::bindings::*; use tfhe_cuda_backend::cuda_bind::*; @@ -57,6 +60,45 @@ pub enum ComparisonType { MIN = 7, } +fn prepare_cuda_radix_ciphertext_ffi(input: &CudaRadixCiphertext) -> CudaRadixCiphertextFFI { + let mut degrees_vec: Vec = input.info.blocks.iter().map(|b| b.degree.0).collect(); + let mut noise_levels_vec: Vec = + input.info.blocks.iter().map(|b| b.noise_level.0).collect(); + CudaRadixCiphertextFFI { + ptr: input.d_blocks.0.d_vec.get_mut_c_ptr(0), + degrees: degrees_vec.as_mut_ptr(), + noise_levels: noise_levels_vec.as_mut_ptr(), + num_radix_blocks: input.d_blocks.0.lwe_ciphertext_count.0 as u32, + lwe_dimension: input.d_blocks.0.lwe_dimension.0 as u32, + } +} + +fn prepare_cuda_boolean_block_ffi(input: &CudaBooleanBlock) -> CudaRadixCiphertextFFI { + let mut degrees_vec: Vec = input + .0 + .ciphertext + .info + .blocks + .iter() + .map(|b| b.degree.0) + .collect(); + let mut noise_levels_vec: Vec = input + .0 + .ciphertext + .info + .blocks + .iter() + .map(|b| b.noise_level.0) + .collect(); + CudaRadixCiphertextFFI { + ptr: input.0.ciphertext.d_blocks.0.d_vec.get_mut_c_ptr(0), + degrees: degrees_vec.as_mut_ptr(), + noise_levels: noise_levels_vec.as_mut_ptr(), + num_radix_blocks: input.0.ciphertext.d_blocks.0.lwe_ciphertext_count.0 as u32, + lwe_dimension: input.0.ciphertext.d_blocks.0.lwe_dimension.0 as u32, + } +} + pub fn gen_keys_gpu
<P>
(parameters_set: P, streams: &CudaStreams) -> (ClientKey, CudaServerKey) where P: TryInto, @@ -490,32 +532,40 @@ pub unsafe fn decompress_integer_radix_async( /// /// - [CudaStreams::synchronize] __must__ be called after this function as soon as synchronization /// is required -pub unsafe fn unchecked_add_integer_radix_assign_async( +pub unsafe fn unchecked_add_integer_radix_assign( streams: &CudaStreams, - radix_lwe_left: &mut CudaVec, - radix_lwe_right: &CudaVec, - lwe_dimension: LweDimension, - num_blocks: u32, + radix_lwe_left: &mut CudaRadixCiphertext, + radix_lwe_right: &CudaRadixCiphertext, ) { assert_eq!( streams.gpu_indexes[0], - radix_lwe_left.gpu_index(0), + radix_lwe_left.d_blocks.0.d_vec.gpu_index(0), "GPU error: all data should reside on the same GPU." ); assert_eq!( streams.gpu_indexes[0], - radix_lwe_right.gpu_index(0), + radix_lwe_right.d_blocks.0.d_vec.gpu_index(0), "GPU error: all data should reside on the same GPU." ); + let mut radix_lwe_left_data = prepare_cuda_radix_ciphertext_ffi(radix_lwe_left); + let radix_lwe_right_data = prepare_cuda_radix_ciphertext_ffi(radix_lwe_right); cuda_add_lwe_ciphertext_vector_64( streams.ptr[0], streams.gpu_indexes[0].0, - radix_lwe_left.as_mut_c_ptr(0), - radix_lwe_left.as_c_ptr(0), - radix_lwe_right.as_c_ptr(0), - lwe_dimension.0 as u32, - num_blocks, - ); + &mut radix_lwe_left_data, + &radix_lwe_left_data, + &radix_lwe_right_data, + ); + radix_lwe_left + .info + .blocks + .iter_mut() + .enumerate() + .for_each(|(i, b)| { + b.degree = Degree(*radix_lwe_left_data.degrees.wrapping_add(i)); + b.noise_level = NoiseLevel(*radix_lwe_left_data.noise_levels.wrapping_add(i)); + }); + streams.synchronize(); } #[allow(clippy::too_many_arguments)] @@ -2089,12 +2139,12 @@ pub unsafe fn unchecked_rotate_left_integer_radix_kb_assign_async< /// /// - [CudaStreams::synchronize] __must__ be called after this function as soon as synchronization /// is required -pub unsafe fn unchecked_cmux_integer_radix_kb_async( +pub unsafe fn unchecked_cmux_integer_radix_kb( streams: &CudaStreams, - radix_lwe_out: &mut CudaVec, - radix_lwe_condition: &CudaVec, - radix_lwe_true: &CudaVec, - radix_lwe_false: &CudaVec, + radix_lwe_out: &mut CudaRadixCiphertext, + radix_lwe_condition: &CudaBooleanBlock, + radix_lwe_true: &CudaRadixCiphertext, + radix_lwe_false: &CudaRadixCiphertext, bootstrapping_key: &CudaVec, keyswitch_key: &CudaVec, message_modulus: MessageModulus, @@ -2113,22 +2163,28 @@ pub unsafe fn unchecked_cmux_integer_radix_kb_async>() .as_ptr(), streams.len() as u32, - radix_lwe_out.as_mut_c_ptr(0), - radix_lwe_condition.as_c_ptr(0), - radix_lwe_true.as_c_ptr(0), - radix_lwe_false.as_c_ptr(0), + &mut radix_lwe_out_data, + &condition_data, + &radix_lwe_true_data, + &radix_lwe_false_data, mem_ptr, bootstrapping_key.ptr.as_ptr(), keyswitch_key.ptr.as_ptr(), - num_blocks, ); cleanup_cuda_integer_radix_cmux( streams.ptr.as_ptr(), @@ -2196,6 +2255,7 @@ pub unsafe fn unchecked_cmux_integer_radix_kb_async( +pub unsafe fn unchecked_signed_abs_radix_kb_assign( streams: &CudaStreams, - ct: &mut CudaVec, + ct: &mut CudaRadixCiphertext, bootstrapping_key: &CudaVec, keyswitch_key: &CudaVec, message_modulus: MessageModulus, @@ -3169,7 +3229,13 @@ pub unsafe fn unchecked_signed_abs_radix_kb_assign_async>() .as_ptr(), streams.len() as u32, - ct.as_mut_c_ptr(0), + &mut ct_data, mem_ptr, true, bootstrapping_key.ptr.as_ptr(), keyswitch_key.ptr.as_ptr(), - num_blocks, ); cleanup_cuda_integer_abs_inplace( streams.ptr.as_ptr(), @@ -3223,6 +3288,7 @@ pub unsafe fn 
unchecked_signed_abs_radix_kb_assign_async(&self, ct: &mut T, streams: &CudaStreams) + pub fn unchecked_abs_assign(&self, ct: &mut T, streams: &CudaStreams) where T: CudaIntegerRadixCiphertext, { let num_blocks = ct.as_ref().d_blocks.lwe_ciphertext_count().0 as u32; - match &self.bootstrapping_key { - CudaBootstrappingKey::Classic(d_bsk) => { - unchecked_signed_abs_radix_kb_assign_async( - streams, - &mut ct.as_mut().d_blocks.0.d_vec, - &d_bsk.d_vec, - &self.key_switching_key.d_vec, - self.message_modulus, - self.carry_modulus, - d_bsk.glwe_dimension, - d_bsk.polynomial_size, - self.key_switching_key - .input_key_lwe_size() - .to_lwe_dimension(), - self.key_switching_key - .output_key_lwe_size() - .to_lwe_dimension(), - self.key_switching_key.decomposition_level_count(), - self.key_switching_key.decomposition_base_log(), - d_bsk.decomp_level_count, - d_bsk.decomp_base_log, - num_blocks, - PBSType::Classical, - LweBskGroupingFactor(0), - ); - } - CudaBootstrappingKey::MultiBit(d_multibit_bsk) => { - unchecked_signed_abs_radix_kb_assign_async( - streams, - &mut ct.as_mut().d_blocks.0.d_vec, - &d_multibit_bsk.d_vec, - &self.key_switching_key.d_vec, - self.message_modulus, - self.carry_modulus, - d_multibit_bsk.glwe_dimension, - d_multibit_bsk.polynomial_size, - self.key_switching_key - .input_key_lwe_size() - .to_lwe_dimension(), - self.key_switching_key - .output_key_lwe_size() - .to_lwe_dimension(), - self.key_switching_key.decomposition_level_count(), - self.key_switching_key.decomposition_base_log(), - d_multibit_bsk.decomp_level_count, - d_multibit_bsk.decomp_base_log, - num_blocks, - PBSType::MultiBit, - d_multibit_bsk.grouping_factor, - ); - } - }; + unsafe { + match &self.bootstrapping_key { + CudaBootstrappingKey::Classic(d_bsk) => { + unchecked_signed_abs_radix_kb_assign( + streams, + ct.as_mut(), + &d_bsk.d_vec, + &self.key_switching_key.d_vec, + self.message_modulus, + self.carry_modulus, + d_bsk.glwe_dimension, + d_bsk.polynomial_size, + self.key_switching_key + .input_key_lwe_size() + .to_lwe_dimension(), + self.key_switching_key + .output_key_lwe_size() + .to_lwe_dimension(), + self.key_switching_key.decomposition_level_count(), + self.key_switching_key.decomposition_base_log(), + d_bsk.decomp_level_count, + d_bsk.decomp_base_log, + num_blocks, + PBSType::Classical, + LweBskGroupingFactor(0), + ); + } + CudaBootstrappingKey::MultiBit(d_multibit_bsk) => { + unchecked_signed_abs_radix_kb_assign( + streams, + ct.as_mut(), + &d_multibit_bsk.d_vec, + &self.key_switching_key.d_vec, + self.message_modulus, + self.carry_modulus, + d_multibit_bsk.glwe_dimension, + d_multibit_bsk.polynomial_size, + self.key_switching_key + .input_key_lwe_size() + .to_lwe_dimension(), + self.key_switching_key + .output_key_lwe_size() + .to_lwe_dimension(), + self.key_switching_key.decomposition_level_count(), + self.key_switching_key.decomposition_base_log(), + d_multibit_bsk.decomp_level_count, + d_multibit_bsk.decomp_base_log, + num_blocks, + PBSType::MultiBit, + d_multibit_bsk.grouping_factor, + ); + } + }; + } } pub fn unchecked_abs(&self, ct: &T, streams: &CudaStreams) -> T where T: CudaIntegerRadixCiphertext, { - let mut res = unsafe { ct.duplicate_async(streams) }; + let mut res = ct.duplicate(streams); if T::IS_SIGNED { - unsafe { self.unchecked_abs_assign_async(&mut res, streams) }; + self.unchecked_abs_assign(&mut res, streams); } - streams.synchronize(); res } @@ -130,14 +131,13 @@ impl CudaServerKey { where T: CudaIntegerRadixCiphertext, { - let mut res = unsafe { ct.duplicate_async(streams) 
}; + let mut res = ct.duplicate(streams); if !ct.block_carries_are_empty() { - unsafe { self.full_propagate_assign_async(&mut res, streams) }; + self.full_propagate_assign(&mut res, streams); }; if T::IS_SIGNED { - unsafe { self.unchecked_abs_assign_async(&mut res, streams) }; + self.unchecked_abs_assign(&mut res, streams); } - streams.synchronize(); res } } diff --git a/tfhe/src/integer/gpu/server_key/radix/add.rs b/tfhe/src/integer/gpu/server_key/radix/add.rs index 060fbee8c3..ffebbccd2c 100644 --- a/tfhe/src/integer/gpu/server_key/radix/add.rs +++ b/tfhe/src/integer/gpu/server_key/radix/add.rs @@ -7,7 +7,7 @@ use crate::integer::gpu::ciphertext::{ }; use crate::integer::gpu::server_key::{CudaBootstrappingKey, CudaServerKey}; use crate::integer::gpu::{ - unchecked_add_integer_radix_assign_async, + unchecked_add_integer_radix_assign, unchecked_partial_sum_ciphertexts_integer_radix_kb_assign_async, PBSType, }; use crate::integer::server_key::radix_parallel::OutputFlag; @@ -70,7 +70,7 @@ impl CudaServerKey { ct_right: &T, streams: &CudaStreams, ) -> T { - let mut result = unsafe { ct_left.duplicate_async(streams) }; + let mut result = ct_left.duplicate(streams); self.add_assign(&mut result, ct_right, streams); result } @@ -94,18 +94,18 @@ impl CudaServerKey { (true, true) => (ct_left, ct_right), (true, false) => { tmp_rhs = ct_right.duplicate_async(streams); - self.full_propagate_assign_async(&mut tmp_rhs, streams); + self.full_propagate_assign(&mut tmp_rhs, streams); (ct_left, &tmp_rhs) } (false, true) => { - self.full_propagate_assign_async(ct_left, streams); + self.full_propagate_assign(ct_left, streams); (ct_left, ct_right) } (false, false) => { tmp_rhs = ct_right.duplicate_async(streams); - self.full_propagate_assign_async(ct_left, streams); - self.full_propagate_assign_async(&mut tmp_rhs, streams); + self.full_propagate_assign(ct_left, streams); + self.full_propagate_assign(&mut tmp_rhs, streams); (ct_left, &tmp_rhs) } }; @@ -179,7 +179,7 @@ impl CudaServerKey { /// /// - `stream` __must__ be synchronized to guarantee computation has finished, and inputs must /// not be dropped until stream is synchronised - pub unsafe fn unchecked_add_assign_async( + pub fn unchecked_add_assign( &self, ct_left: &mut T, ct_right: &T, @@ -203,30 +203,9 @@ impl CudaServerKey { ciphertext_right.d_blocks.ciphertext_modulus() ); - let lwe_dimension = ciphertext_left.d_blocks.lwe_dimension(); - let lwe_ciphertext_count = ciphertext_left.d_blocks.lwe_ciphertext_count(); - - unchecked_add_integer_radix_assign_async( - streams, - &mut ciphertext_left.d_blocks.0.d_vec, - &ciphertext_right.d_blocks.0.d_vec, - lwe_dimension, - lwe_ciphertext_count.0 as u32, - ); - - ciphertext_left.info = ciphertext_left.info.after_add(&ciphertext_right.info); - } - - pub fn unchecked_add_assign( - &self, - ct_left: &mut T, - ct_right: &T, - streams: &CudaStreams, - ) { unsafe { - self.unchecked_add_assign_async(ct_left, ct_right, streams); + unchecked_add_integer_radix_assign(streams, ciphertext_left, ciphertext_right); } - streams.synchronize(); } /// # Safety @@ -417,7 +396,7 @@ impl CudaServerKey { .iter_mut() .filter(|ct| !ct.block_carries_are_empty()) .for_each(|ct| { - self.full_propagate_assign_async(&mut *ct, streams); + self.full_propagate_assign(&mut *ct, streams); }); Some(self.unchecked_sum_ciphertexts_async(&ciphertexts, streams)) @@ -477,14 +456,14 @@ impl CudaServerKey { (true, false) => { unsafe { tmp_rhs = ct_right.duplicate_async(stream); - self.full_propagate_assign_async(&mut tmp_rhs, stream); + 
self.full_propagate_assign(&mut tmp_rhs, stream); } (ct_left, &tmp_rhs) } (false, true) => { unsafe { tmp_lhs = ct_left.duplicate_async(stream); - self.full_propagate_assign_async(&mut tmp_lhs, stream); + self.full_propagate_assign(&mut tmp_lhs, stream); } (&tmp_lhs, ct_right) } @@ -493,8 +472,8 @@ impl CudaServerKey { tmp_lhs = ct_left.duplicate_async(stream); tmp_rhs = ct_right.duplicate_async(stream); - self.full_propagate_assign_async(&mut tmp_lhs, stream); - self.full_propagate_assign_async(&mut tmp_rhs, stream); + self.full_propagate_assign(&mut tmp_lhs, stream); + self.full_propagate_assign(&mut tmp_rhs, stream); } (&tmp_lhs, &tmp_rhs) @@ -664,14 +643,14 @@ impl CudaServerKey { (true, false) => { unsafe { tmp_rhs = ct_right.duplicate_async(stream); - self.full_propagate_assign_async(&mut tmp_rhs, stream); + self.full_propagate_assign(&mut tmp_rhs, stream); } (ct_left, &tmp_rhs) } (false, true) => { unsafe { tmp_lhs = ct_left.duplicate_async(stream); - self.full_propagate_assign_async(&mut tmp_lhs, stream); + self.full_propagate_assign(&mut tmp_lhs, stream); } (&tmp_lhs, ct_right) } @@ -680,8 +659,8 @@ impl CudaServerKey { tmp_lhs = ct_left.duplicate_async(stream); tmp_rhs = ct_right.duplicate_async(stream); - self.full_propagate_assign_async(&mut tmp_lhs, stream); - self.full_propagate_assign_async(&mut tmp_rhs, stream); + self.full_propagate_assign(&mut tmp_lhs, stream); + self.full_propagate_assign(&mut tmp_rhs, stream); } (&tmp_lhs, &tmp_rhs) diff --git a/tfhe/src/integer/gpu/server_key/radix/bitwise_op.rs b/tfhe/src/integer/gpu/server_key/radix/bitwise_op.rs index 0e77fda595..ea37d47dd2 100644 --- a/tfhe/src/integer/gpu/server_key/radix/bitwise_op.rs +++ b/tfhe/src/integer/gpu/server_key/radix/bitwise_op.rs @@ -464,18 +464,18 @@ impl CudaServerKey { (true, true) => (ct_left, ct_right), (true, false) => { tmp_rhs = ct_right.duplicate_async(streams); - self.full_propagate_assign_async(&mut tmp_rhs, streams); + self.full_propagate_assign(&mut tmp_rhs, streams); (ct_left, &tmp_rhs) } (false, true) => { - self.full_propagate_assign_async(ct_left, streams); + self.full_propagate_assign(ct_left, streams); (ct_left, ct_right) } (false, false) => { tmp_rhs = ct_right.duplicate_async(streams); - self.full_propagate_assign_async(ct_left, streams); - self.full_propagate_assign_async(&mut tmp_rhs, streams); + self.full_propagate_assign(ct_left, streams); + self.full_propagate_assign(&mut tmp_rhs, streams); (ct_left, &tmp_rhs) } } @@ -569,18 +569,18 @@ impl CudaServerKey { (true, true) => (ct_left, ct_right), (true, false) => { tmp_rhs = ct_right.duplicate_async(streams); - self.full_propagate_assign_async(&mut tmp_rhs, streams); + self.full_propagate_assign(&mut tmp_rhs, streams); (ct_left, &tmp_rhs) } (false, true) => { - self.full_propagate_assign_async(ct_left, streams); + self.full_propagate_assign(ct_left, streams); (ct_left, ct_right) } (false, false) => { tmp_rhs = ct_right.duplicate_async(streams); - self.full_propagate_assign_async(ct_left, streams); - self.full_propagate_assign_async(&mut tmp_rhs, streams); + self.full_propagate_assign(ct_left, streams); + self.full_propagate_assign(&mut tmp_rhs, streams); (ct_left, &tmp_rhs) } }; @@ -674,18 +674,18 @@ impl CudaServerKey { (true, true) => (ct_left, ct_right), (true, false) => { tmp_rhs = ct_right.duplicate_async(streams); - self.full_propagate_assign_async(&mut tmp_rhs, streams); + self.full_propagate_assign(&mut tmp_rhs, streams); (ct_left, &tmp_rhs) } (false, true) => { - self.full_propagate_assign_async(ct_left, streams); + 
self.full_propagate_assign(ct_left, streams); (ct_left, ct_right) } (false, false) => { tmp_rhs = ct_right.duplicate_async(streams); - self.full_propagate_assign_async(ct_left, streams); - self.full_propagate_assign_async(&mut tmp_rhs, streams); + self.full_propagate_assign(ct_left, streams); + self.full_propagate_assign(&mut tmp_rhs, streams); (ct_left, &tmp_rhs) } }; @@ -763,7 +763,7 @@ impl CudaServerKey { streams: &CudaStreams, ) { if !ct.block_carries_are_empty() { - self.full_propagate_assign_async(ct, streams); + self.full_propagate_assign(ct, streams); } self.unchecked_bitnot_assign_async(ct, streams); diff --git a/tfhe/src/integer/gpu/server_key/radix/cmux.rs b/tfhe/src/integer/gpu/server_key/radix/cmux.rs index 864a7914cd..421d66854d 100644 --- a/tfhe/src/integer/gpu/server_key/radix/cmux.rs +++ b/tfhe/src/integer/gpu/server_key/radix/cmux.rs @@ -3,14 +3,10 @@ use crate::core_crypto::prelude::LweBskGroupingFactor; use crate::integer::gpu::ciphertext::boolean_value::CudaBooleanBlock; use crate::integer::gpu::ciphertext::CudaIntegerRadixCiphertext; use crate::integer::gpu::server_key::CudaBootstrappingKey; -use crate::integer::gpu::{unchecked_cmux_integer_radix_kb_async, CudaServerKey, PBSType}; +use crate::integer::gpu::{unchecked_cmux_integer_radix_kb, CudaServerKey, PBSType}; impl CudaServerKey { - /// # Safety - /// - /// - `stream` __must__ be synchronized to guarantee computation has finished, and inputs must - /// not be dropped until stream is synchronised - pub unsafe fn unchecked_if_then_else_async( + pub fn unchecked_if_then_else( &self, condition: &CudaBooleanBlock, true_ct: &T, @@ -21,80 +17,70 @@ impl CudaServerKey { let mut result: T = self .create_trivial_zero_radix(true_ct.as_ref().d_blocks.lwe_ciphertext_count().0, stream); - match &self.bootstrapping_key { - CudaBootstrappingKey::Classic(d_bsk) => { - unchecked_cmux_integer_radix_kb_async( - stream, - &mut result.as_mut().d_blocks.0.d_vec, - &condition.as_ref().ciphertext.d_blocks.0.d_vec, - &true_ct.as_ref().d_blocks.0.d_vec, - &false_ct.as_ref().d_blocks.0.d_vec, - &d_bsk.d_vec, - &self.key_switching_key.d_vec, - self.message_modulus, - self.carry_modulus, - d_bsk.glwe_dimension, - d_bsk.polynomial_size, - self.key_switching_key - .input_key_lwe_size() - .to_lwe_dimension(), - self.key_switching_key - .output_key_lwe_size() - .to_lwe_dimension(), - self.key_switching_key.decomposition_level_count(), - self.key_switching_key.decomposition_base_log(), - d_bsk.decomp_level_count, - d_bsk.decomp_base_log, - lwe_ciphertext_count.0 as u32, - PBSType::Classical, - LweBskGroupingFactor(0), - ); - } - CudaBootstrappingKey::MultiBit(d_multibit_bsk) => { - unchecked_cmux_integer_radix_kb_async( - stream, - &mut result.as_mut().d_blocks.0.d_vec, - &condition.as_ref().ciphertext.d_blocks.0.d_vec, - &true_ct.as_ref().d_blocks.0.d_vec, - &false_ct.as_ref().d_blocks.0.d_vec, - &d_multibit_bsk.d_vec, - &self.key_switching_key.d_vec, - self.message_modulus, - self.carry_modulus, - d_multibit_bsk.glwe_dimension, - d_multibit_bsk.polynomial_size, - self.key_switching_key - .input_key_lwe_size() - .to_lwe_dimension(), - self.key_switching_key - .output_key_lwe_size() - .to_lwe_dimension(), - self.key_switching_key.decomposition_level_count(), - self.key_switching_key.decomposition_base_log(), - d_multibit_bsk.decomp_level_count, - d_multibit_bsk.decomp_base_log, - lwe_ciphertext_count.0 as u32, - PBSType::MultiBit, - d_multibit_bsk.grouping_factor, - ); + unsafe { + match &self.bootstrapping_key { + 
CudaBootstrappingKey::Classic(d_bsk) => { + unchecked_cmux_integer_radix_kb( + stream, + result.as_mut(), + condition, + true_ct.as_ref(), + false_ct.as_ref(), + &d_bsk.d_vec, + &self.key_switching_key.d_vec, + self.message_modulus, + self.carry_modulus, + d_bsk.glwe_dimension, + d_bsk.polynomial_size, + self.key_switching_key + .input_key_lwe_size() + .to_lwe_dimension(), + self.key_switching_key + .output_key_lwe_size() + .to_lwe_dimension(), + self.key_switching_key.decomposition_level_count(), + self.key_switching_key.decomposition_base_log(), + d_bsk.decomp_level_count, + d_bsk.decomp_base_log, + lwe_ciphertext_count.0 as u32, + PBSType::Classical, + LweBskGroupingFactor(0), + ); + } + CudaBootstrappingKey::MultiBit(d_multibit_bsk) => { + unchecked_cmux_integer_radix_kb( + stream, + result.as_mut(), + condition, + true_ct.as_ref(), + false_ct.as_ref(), + &d_multibit_bsk.d_vec, + &self.key_switching_key.d_vec, + self.message_modulus, + self.carry_modulus, + d_multibit_bsk.glwe_dimension, + d_multibit_bsk.polynomial_size, + self.key_switching_key + .input_key_lwe_size() + .to_lwe_dimension(), + self.key_switching_key + .output_key_lwe_size() + .to_lwe_dimension(), + self.key_switching_key.decomposition_level_count(), + self.key_switching_key.decomposition_base_log(), + d_multibit_bsk.decomp_level_count, + d_multibit_bsk.decomp_base_log, + lwe_ciphertext_count.0 as u32, + PBSType::MultiBit, + d_multibit_bsk.grouping_factor, + ); + } } } result.as_mut().info = true_ct.as_ref().info.after_if_then_else(); result } - pub fn unchecked_if_then_else( - &self, - condition: &CudaBooleanBlock, - true_ct: &T, - false_ct: &T, - stream: &CudaStreams, - ) -> T { - let result = - unsafe { self.unchecked_if_then_else_async(condition, true_ct, false_ct, stream) }; - stream.synchronize(); - result - } pub fn if_then_else( &self, @@ -106,26 +92,22 @@ impl CudaServerKey { let mut tmp_true_ct; let mut tmp_false_ct; - let result = unsafe { - let true_ct = if true_ct.block_carries_are_empty() { - true_ct - } else { - tmp_true_ct = true_ct.duplicate_async(stream); - self.full_propagate_assign_async(&mut tmp_true_ct, stream); - &tmp_true_ct - }; - - let false_ct = if false_ct.block_carries_are_empty() { - false_ct - } else { - tmp_false_ct = false_ct.duplicate_async(stream); - self.full_propagate_assign_async(&mut tmp_false_ct, stream); - &tmp_false_ct - }; + let true_ct = if true_ct.block_carries_are_empty() { + true_ct + } else { + tmp_true_ct = true_ct.duplicate(stream); + self.full_propagate_assign(&mut tmp_true_ct, stream); + &tmp_true_ct + }; - self.unchecked_if_then_else_async(condition, true_ct, false_ct, stream) + let false_ct = if false_ct.block_carries_are_empty() { + false_ct + } else { + tmp_false_ct = false_ct.duplicate(stream); + self.full_propagate_assign(&mut tmp_false_ct, stream); + &tmp_false_ct }; - stream.synchronize(); - result + + self.unchecked_if_then_else(condition, true_ct, false_ct, stream) } } diff --git a/tfhe/src/integer/gpu/server_key/radix/comparison.rs b/tfhe/src/integer/gpu/server_key/radix/comparison.rs index 26d728747d..41b8902d97 100644 --- a/tfhe/src/integer/gpu/server_key/radix/comparison.rs +++ b/tfhe/src/integer/gpu/server_key/radix/comparison.rs @@ -285,20 +285,20 @@ impl CudaServerKey { (true, true) => (ct_left, ct_right), (true, false) => { tmp_rhs = ct_right.duplicate_async(streams); - self.full_propagate_assign_async(&mut tmp_rhs, streams); + self.full_propagate_assign(&mut tmp_rhs, streams); (ct_left, &tmp_rhs) } (false, true) => { tmp_lhs = 
ct_left.duplicate_async(streams); - self.full_propagate_assign_async(&mut tmp_lhs, streams); + self.full_propagate_assign(&mut tmp_lhs, streams); (&tmp_lhs, ct_right) } (false, false) => { tmp_lhs = ct_left.duplicate_async(streams); tmp_rhs = ct_right.duplicate_async(streams); - self.full_propagate_assign_async(&mut tmp_lhs, streams); - self.full_propagate_assign_async(&mut tmp_rhs, streams); + self.full_propagate_assign(&mut tmp_lhs, streams); + self.full_propagate_assign(&mut tmp_rhs, streams); (&tmp_lhs, &tmp_rhs) } }; @@ -379,20 +379,20 @@ impl CudaServerKey { (true, true) => (ct_left, ct_right), (true, false) => { tmp_rhs = ct_right.duplicate_async(streams); - self.full_propagate_assign_async(&mut tmp_rhs, streams); + self.full_propagate_assign(&mut tmp_rhs, streams); (ct_left, &tmp_rhs) } (false, true) => { tmp_lhs = ct_left.duplicate_async(streams); - self.full_propagate_assign_async(&mut tmp_lhs, streams); + self.full_propagate_assign(&mut tmp_lhs, streams); (&tmp_lhs, ct_right) } (false, false) => { tmp_lhs = ct_left.duplicate_async(streams); tmp_rhs = ct_right.duplicate_async(streams); - self.full_propagate_assign_async(&mut tmp_lhs, streams); - self.full_propagate_assign_async(&mut tmp_rhs, streams); + self.full_propagate_assign(&mut tmp_lhs, streams); + self.full_propagate_assign(&mut tmp_rhs, streams); (&tmp_lhs, &tmp_rhs) } }; @@ -558,20 +558,20 @@ impl CudaServerKey { (true, true) => (ct_left, ct_right), (true, false) => { tmp_rhs = ct_right.duplicate_async(streams); - self.full_propagate_assign_async(&mut tmp_rhs, streams); + self.full_propagate_assign(&mut tmp_rhs, streams); (ct_left, &tmp_rhs) } (false, true) => { tmp_lhs = ct_left.duplicate_async(streams); - self.full_propagate_assign_async(&mut tmp_lhs, streams); + self.full_propagate_assign(&mut tmp_lhs, streams); (&tmp_lhs, ct_right) } (false, false) => { tmp_lhs = ct_left.duplicate_async(streams); tmp_rhs = ct_right.duplicate_async(streams); - self.full_propagate_assign_async(&mut tmp_lhs, streams); - self.full_propagate_assign_async(&mut tmp_rhs, streams); + self.full_propagate_assign(&mut tmp_lhs, streams); + self.full_propagate_assign(&mut tmp_rhs, streams); (&tmp_lhs, &tmp_rhs) } }; @@ -666,20 +666,20 @@ impl CudaServerKey { (true, true) => (ct_left, ct_right), (true, false) => { tmp_rhs = ct_right.duplicate_async(streams); - self.full_propagate_assign_async(&mut tmp_rhs, streams); + self.full_propagate_assign(&mut tmp_rhs, streams); (ct_left, &tmp_rhs) } (false, true) => { tmp_lhs = ct_left.duplicate_async(streams); - self.full_propagate_assign_async(&mut tmp_lhs, streams); + self.full_propagate_assign(&mut tmp_lhs, streams); (&tmp_lhs, ct_right) } (false, false) => { tmp_lhs = ct_left.duplicate_async(streams); tmp_rhs = ct_right.duplicate_async(streams); - self.full_propagate_assign_async(&mut tmp_lhs, streams); - self.full_propagate_assign_async(&mut tmp_rhs, streams); + self.full_propagate_assign(&mut tmp_lhs, streams); + self.full_propagate_assign(&mut tmp_rhs, streams); (&tmp_lhs, &tmp_rhs) } }; @@ -790,20 +790,20 @@ impl CudaServerKey { (true, true) => (ct_left, ct_right), (true, false) => { tmp_rhs = ct_right.duplicate_async(streams); - self.full_propagate_assign_async(&mut tmp_rhs, streams); + self.full_propagate_assign(&mut tmp_rhs, streams); (ct_left, &tmp_rhs) } (false, true) => { tmp_lhs = ct_left.duplicate_async(streams); - self.full_propagate_assign_async(&mut tmp_lhs, streams); + self.full_propagate_assign(&mut tmp_lhs, streams); (&tmp_lhs, ct_right) } (false, false) => { tmp_lhs = 
ct_left.duplicate_async(streams); tmp_rhs = ct_right.duplicate_async(streams); - self.full_propagate_assign_async(&mut tmp_lhs, streams); - self.full_propagate_assign_async(&mut tmp_rhs, streams); + self.full_propagate_assign(&mut tmp_lhs, streams); + self.full_propagate_assign(&mut tmp_rhs, streams); (&tmp_lhs, &tmp_rhs) } }; @@ -914,20 +914,20 @@ impl CudaServerKey { (true, true) => (ct_left, ct_right), (true, false) => { tmp_rhs = ct_right.duplicate_async(streams); - self.full_propagate_assign_async(&mut tmp_rhs, streams); + self.full_propagate_assign(&mut tmp_rhs, streams); (ct_left, &tmp_rhs) } (false, true) => { tmp_lhs = ct_left.duplicate_async(streams); - self.full_propagate_assign_async(&mut tmp_lhs, streams); + self.full_propagate_assign(&mut tmp_lhs, streams); (&tmp_lhs, ct_right) } (false, false) => { tmp_lhs = ct_left.duplicate_async(streams); tmp_rhs = ct_right.duplicate_async(streams); - self.full_propagate_assign_async(&mut tmp_lhs, streams); - self.full_propagate_assign_async(&mut tmp_rhs, streams); + self.full_propagate_assign(&mut tmp_lhs, streams); + self.full_propagate_assign(&mut tmp_rhs, streams); (&tmp_lhs, &tmp_rhs) } }; @@ -1161,20 +1161,20 @@ impl CudaServerKey { (true, true) => (ct_left, ct_right), (true, false) => { tmp_rhs = ct_right.duplicate_async(streams); - self.full_propagate_assign_async(&mut tmp_rhs, streams); + self.full_propagate_assign(&mut tmp_rhs, streams); (ct_left, &tmp_rhs) } (false, true) => { tmp_lhs = ct_left.duplicate_async(streams); - self.full_propagate_assign_async(&mut tmp_lhs, streams); + self.full_propagate_assign(&mut tmp_lhs, streams); (&tmp_lhs, ct_right) } (false, false) => { tmp_lhs = ct_left.duplicate_async(streams); tmp_rhs = ct_right.duplicate_async(streams); - self.full_propagate_assign_async(&mut tmp_lhs, streams); - self.full_propagate_assign_async(&mut tmp_rhs, streams); + self.full_propagate_assign(&mut tmp_lhs, streams); + self.full_propagate_assign(&mut tmp_rhs, streams); (&tmp_lhs, &tmp_rhs) } }; @@ -1208,20 +1208,20 @@ impl CudaServerKey { (true, true) => (ct_left, ct_right), (true, false) => { tmp_rhs = ct_right.duplicate_async(streams); - self.full_propagate_assign_async(&mut tmp_rhs, streams); + self.full_propagate_assign(&mut tmp_rhs, streams); (ct_left, &tmp_rhs) } (false, true) => { tmp_lhs = ct_left.duplicate_async(streams); - self.full_propagate_assign_async(&mut tmp_lhs, streams); + self.full_propagate_assign(&mut tmp_lhs, streams); (&tmp_lhs, ct_right) } (false, false) => { tmp_lhs = ct_left.duplicate_async(streams); tmp_rhs = ct_right.duplicate_async(streams); - self.full_propagate_assign_async(&mut tmp_lhs, streams); - self.full_propagate_assign_async(&mut tmp_rhs, streams); + self.full_propagate_assign(&mut tmp_lhs, streams); + self.full_propagate_assign(&mut tmp_rhs, streams); (&tmp_lhs, &tmp_rhs) } }; diff --git a/tfhe/src/integer/gpu/server_key/radix/div_mod.rs b/tfhe/src/integer/gpu/server_key/radix/div_mod.rs index de1ce80a8d..4a7994edcc 100644 --- a/tfhe/src/integer/gpu/server_key/radix/div_mod.rs +++ b/tfhe/src/integer/gpu/server_key/radix/div_mod.rs @@ -106,8 +106,8 @@ impl CudaServerKey { where T: CudaIntegerRadixCiphertext, { - let mut quotient = unsafe { numerator.duplicate_async(streams) }; - let mut remainder = unsafe { numerator.duplicate_async(streams) }; + let mut quotient = numerator.duplicate(streams); + let mut remainder = numerator.duplicate(streams); unsafe { self.unchecked_div_rem_assign_async( @@ -135,22 +135,20 @@ impl CudaServerKey { ) { (true, true) => (numerator, divisor), (true, 
false) => { - tmp_divisor = unsafe { divisor.duplicate_async(streams) }; - unsafe { self.full_propagate_assign_async(&mut tmp_divisor, streams) }; + tmp_divisor = divisor.duplicate(streams); + self.full_propagate_assign(&mut tmp_divisor, streams); (numerator, &tmp_divisor) } (false, true) => { - tmp_numerator = unsafe { numerator.duplicate_async(streams) }; - unsafe { self.full_propagate_assign_async(&mut tmp_numerator, streams) }; + tmp_numerator = numerator.duplicate(streams); + self.full_propagate_assign(&mut tmp_numerator, streams); (&tmp_numerator, divisor) } (false, false) => { - tmp_divisor = unsafe { divisor.duplicate_async(streams) }; - tmp_numerator = unsafe { numerator.duplicate_async(streams) }; - unsafe { - self.full_propagate_assign_async(&mut tmp_numerator, streams); - self.full_propagate_assign_async(&mut tmp_divisor, streams); - } + tmp_divisor = divisor.duplicate(streams); + tmp_numerator = numerator.duplicate(streams); + self.full_propagate_assign(&mut tmp_numerator, streams); + self.full_propagate_assign(&mut tmp_divisor, streams); (&tmp_numerator, &tmp_divisor) } }; @@ -177,22 +175,20 @@ impl CudaServerKey { ) { (true, true) => (numerator, divisor), (true, false) => { - tmp_divisor = unsafe { divisor.duplicate_async(streams) }; - unsafe { self.full_propagate_assign_async(&mut tmp_divisor, streams) }; + tmp_divisor = divisor.duplicate(streams); + self.full_propagate_assign(&mut tmp_divisor, streams); (numerator, &tmp_divisor) } (false, true) => { - tmp_numerator = unsafe { numerator.duplicate_async(streams) }; - unsafe { self.full_propagate_assign_async(&mut tmp_numerator, streams) }; + tmp_numerator = numerator.duplicate(streams); + self.full_propagate_assign(&mut tmp_numerator, streams); (&tmp_numerator, divisor) } (false, false) => { - tmp_divisor = unsafe { divisor.duplicate_async(streams) }; - tmp_numerator = unsafe { numerator.duplicate_async(streams) }; - unsafe { - self.full_propagate_assign_async(&mut tmp_numerator, streams); - self.full_propagate_assign_async(&mut tmp_divisor, streams); - } + tmp_divisor = divisor.duplicate(streams); + tmp_numerator = numerator.duplicate(streams); + self.full_propagate_assign(&mut tmp_numerator, streams); + self.full_propagate_assign(&mut tmp_divisor, streams); (&tmp_numerator, &tmp_divisor) } }; diff --git a/tfhe/src/integer/gpu/server_key/radix/ilog2.rs b/tfhe/src/integer/gpu/server_key/radix/ilog2.rs index 43186f2310..198673d95c 100644 --- a/tfhe/src/integer/gpu/server_key/radix/ilog2.rs +++ b/tfhe/src/integer/gpu/server_key/radix/ilog2.rs @@ -847,7 +847,7 @@ impl CudaServerKey { ct } else { tmp = ct.duplicate_async(streams); - self.full_propagate_assign_async(&mut tmp, streams); + self.full_propagate_assign(&mut tmp, streams); &tmp }; self.unchecked_trailing_zeros_async(ct, streams) @@ -920,7 +920,7 @@ impl CudaServerKey { ct } else { tmp = ct.duplicate_async(streams); - self.full_propagate_assign_async(&mut tmp, streams); + self.full_propagate_assign(&mut tmp, streams); &tmp }; self.unchecked_trailing_ones_async(ct, streams) @@ -993,7 +993,7 @@ impl CudaServerKey { ct } else { tmp = ct.duplicate_async(streams); - self.full_propagate_assign_async(&mut tmp, streams); + self.full_propagate_assign(&mut tmp, streams); &tmp }; self.unchecked_leading_zeros_async(ct, streams) @@ -1066,7 +1066,7 @@ impl CudaServerKey { ct } else { tmp = ct.duplicate_async(streams); - self.full_propagate_assign_async(&mut tmp, streams); + self.full_propagate_assign(&mut tmp, streams); &tmp }; self.unchecked_leading_ones_async(ct, streams) @@ 
-1132,7 +1132,7 @@ impl CudaServerKey { ct } else { tmp = ct.duplicate_async(streams); - self.full_propagate_assign_async(&mut tmp, streams); + self.full_propagate_assign(&mut tmp, streams); &tmp }; @@ -1207,7 +1207,7 @@ impl CudaServerKey { ct } else { tmp = ct.duplicate_async(streams); - self.full_propagate_assign_async(&mut tmp, streams); + self.full_propagate_assign(&mut tmp, streams); &tmp }; diff --git a/tfhe/src/integer/gpu/server_key/radix/mod.rs b/tfhe/src/integer/gpu/server_key/radix/mod.rs index 6c8031da2f..e4091abff7 100644 --- a/tfhe/src/integer/gpu/server_key/radix/mod.rs +++ b/tfhe/src/integer/gpu/server_key/radix/mod.rs @@ -379,59 +379,57 @@ impl CudaServerKey { carry_out } - /// # Safety - /// - /// - `streams` __must__ be synchronized to guarantee computation has finished, and inputs must - /// not be dropped until streams is synchronized - pub(crate) unsafe fn full_propagate_assign_async( + pub(crate) fn full_propagate_assign( &self, ct: &mut T, streams: &CudaStreams, ) { let ciphertext = ct.as_mut(); let num_blocks = ciphertext.d_blocks.lwe_ciphertext_count().0 as u32; - match &self.bootstrapping_key { - CudaBootstrappingKey::Classic(d_bsk) => { - full_propagate_assign_async( - streams, - &mut ciphertext.d_blocks.0.d_vec, - &d_bsk.d_vec, - &self.key_switching_key.d_vec, - d_bsk.input_lwe_dimension(), - d_bsk.glwe_dimension(), - d_bsk.polynomial_size(), - self.key_switching_key.decomposition_level_count(), - self.key_switching_key.decomposition_base_log(), - d_bsk.decomp_level_count(), - d_bsk.decomp_base_log(), - num_blocks, - ciphertext.info.blocks.first().unwrap().message_modulus, - ciphertext.info.blocks.first().unwrap().carry_modulus, - PBSType::Classical, - LweBskGroupingFactor(0), - ); - } - CudaBootstrappingKey::MultiBit(d_multibit_bsk) => { - full_propagate_assign_async( - streams, - &mut ciphertext.d_blocks.0.d_vec, - &d_multibit_bsk.d_vec, - &self.key_switching_key.d_vec, - d_multibit_bsk.input_lwe_dimension(), - d_multibit_bsk.glwe_dimension(), - d_multibit_bsk.polynomial_size(), - self.key_switching_key.decomposition_level_count(), - self.key_switching_key.decomposition_base_log(), - d_multibit_bsk.decomp_level_count(), - d_multibit_bsk.decomp_base_log(), - num_blocks, - ciphertext.info.blocks.first().unwrap().message_modulus, - ciphertext.info.blocks.first().unwrap().carry_modulus, - PBSType::MultiBit, - d_multibit_bsk.grouping_factor, - ); - } - }; + unsafe { + match &self.bootstrapping_key { + CudaBootstrappingKey::Classic(d_bsk) => { + full_propagate_assign_async( + streams, + &mut ciphertext.d_blocks.0.d_vec, + &d_bsk.d_vec, + &self.key_switching_key.d_vec, + d_bsk.input_lwe_dimension(), + d_bsk.glwe_dimension(), + d_bsk.polynomial_size(), + self.key_switching_key.decomposition_level_count(), + self.key_switching_key.decomposition_base_log(), + d_bsk.decomp_level_count(), + d_bsk.decomp_base_log(), + num_blocks, + ciphertext.info.blocks.first().unwrap().message_modulus, + ciphertext.info.blocks.first().unwrap().carry_modulus, + PBSType::Classical, + LweBskGroupingFactor(0), + ); + } + CudaBootstrappingKey::MultiBit(d_multibit_bsk) => { + full_propagate_assign_async( + streams, + &mut ciphertext.d_blocks.0.d_vec, + &d_multibit_bsk.d_vec, + &self.key_switching_key.d_vec, + d_multibit_bsk.input_lwe_dimension(), + d_multibit_bsk.glwe_dimension(), + d_multibit_bsk.polynomial_size(), + self.key_switching_key.decomposition_level_count(), + self.key_switching_key.decomposition_base_log(), + d_multibit_bsk.decomp_level_count(), + 
d_multibit_bsk.decomp_base_log(), + num_blocks, + ciphertext.info.blocks.first().unwrap().message_modulus, + ciphertext.info.blocks.first().unwrap().carry_modulus, + PBSType::MultiBit, + d_multibit_bsk.grouping_factor, + ); + } + }; + } ciphertext.info.blocks.iter_mut().for_each(|b| { b.degree = Degree::new(b.message_modulus.0 - 1); b.noise_level = if b.noise_level == NoiseLevel::ZERO { @@ -440,6 +438,7 @@ impl CudaServerKey { NoiseLevel::NOMINAL }; }); + streams.synchronize(); } /// Prepend trivial zero LSB blocks to an existing [`CudaUnsignedRadixCiphertext`] or @@ -1347,7 +1346,7 @@ impl CudaServerKey { T: CudaIntegerRadixCiphertext, { if !source.block_carries_are_empty() { - self.full_propagate_assign_async(&mut source, streams); + self.full_propagate_assign(&mut source, streams); } let current_num_blocks = source.as_ref().info.blocks.len(); if T::IS_SIGNED { @@ -1453,7 +1452,7 @@ impl CudaServerKey { T: CudaIntegerRadixCiphertext, { if !source.block_carries_are_empty() { - self.full_propagate_assign_async(&mut source, streams); + self.full_propagate_assign(&mut source, streams); } let current_num_blocks = source.as_ref().info.blocks.len(); diff --git a/tfhe/src/integer/gpu/server_key/radix/mul.rs b/tfhe/src/integer/gpu/server_key/radix/mul.rs index e9f568bbf0..ee27fbc370 100644 --- a/tfhe/src/integer/gpu/server_key/radix/mul.rs +++ b/tfhe/src/integer/gpu/server_key/radix/mul.rs @@ -214,18 +214,18 @@ impl CudaServerKey { (true, true) => (ct_left, ct_right), (true, false) => { tmp_rhs = ct_right.duplicate_async(streams); - self.full_propagate_assign_async(&mut tmp_rhs, streams); + self.full_propagate_assign(&mut tmp_rhs, streams); (ct_left, &tmp_rhs) } (false, true) => { - self.full_propagate_assign_async(ct_left, streams); + self.full_propagate_assign(ct_left, streams); (ct_left, ct_right) } (false, false) => { tmp_rhs = ct_right.duplicate_async(streams); - self.full_propagate_assign_async(ct_left, streams); - self.full_propagate_assign_async(&mut tmp_rhs, streams); + self.full_propagate_assign(ct_left, streams); + self.full_propagate_assign(&mut tmp_rhs, streams); (ct_left, &tmp_rhs) } }; diff --git a/tfhe/src/integer/gpu/server_key/radix/neg.rs b/tfhe/src/integer/gpu/server_key/radix/neg.rs index 6cd81e26ac..6e2d0d456e 100644 --- a/tfhe/src/integer/gpu/server_key/radix/neg.rs +++ b/tfhe/src/integer/gpu/server_key/radix/neg.rs @@ -142,7 +142,7 @@ impl CudaServerKey { ctxt } else { tmp_ctxt = ctxt.duplicate_async(streams); - self.full_propagate_assign_async(&mut tmp_ctxt, streams); + self.full_propagate_assign(&mut tmp_ctxt, streams); &mut tmp_ctxt }; diff --git a/tfhe/src/integer/gpu/server_key/radix/rotate.rs b/tfhe/src/integer/gpu/server_key/radix/rotate.rs index b7f6348454..042e06342c 100644 --- a/tfhe/src/integer/gpu/server_key/radix/rotate.rs +++ b/tfhe/src/integer/gpu/server_key/radix/rotate.rs @@ -271,20 +271,20 @@ impl CudaServerKey { (true, true) => (ct, rotate), (true, false) => { tmp_rhs = rotate.duplicate_async(streams); - self.full_propagate_assign_async(&mut tmp_rhs, streams); + self.full_propagate_assign(&mut tmp_rhs, streams); (ct, &tmp_rhs) } (false, true) => { tmp_lhs = ct.duplicate_async(streams); - self.full_propagate_assign_async(&mut tmp_lhs, streams); + self.full_propagate_assign(&mut tmp_lhs, streams); (&tmp_lhs, rotate) } (false, false) => { tmp_lhs = ct.duplicate_async(streams); tmp_rhs = rotate.duplicate_async(streams); - self.full_propagate_assign_async(&mut tmp_lhs, streams); - self.full_propagate_assign_async(&mut tmp_rhs, streams); + 
self.full_propagate_assign(&mut tmp_lhs, streams); + self.full_propagate_assign(&mut tmp_rhs, streams); (&tmp_lhs, &tmp_rhs) } }; @@ -316,20 +316,20 @@ impl CudaServerKey { (true, true) => (ct, rotate), (true, false) => { tmp_rhs = rotate.duplicate_async(streams); - self.full_propagate_assign_async(&mut tmp_rhs, streams); + self.full_propagate_assign(&mut tmp_rhs, streams); (ct, &tmp_rhs) } (false, true) => { tmp_lhs = ct.duplicate_async(streams); - self.full_propagate_assign_async(&mut tmp_lhs, streams); + self.full_propagate_assign(&mut tmp_lhs, streams); (&mut tmp_lhs, rotate) } (false, false) => { tmp_lhs = ct.duplicate_async(streams); tmp_rhs = rotate.duplicate_async(streams); - self.full_propagate_assign_async(&mut tmp_lhs, streams); - self.full_propagate_assign_async(&mut tmp_rhs, streams); + self.full_propagate_assign(&mut tmp_lhs, streams); + self.full_propagate_assign(&mut tmp_rhs, streams); (&mut tmp_lhs, &tmp_rhs) } }; @@ -425,20 +425,20 @@ impl CudaServerKey { (true, true) => (ct, rotate), (true, false) => { tmp_rhs = rotate.duplicate_async(streams); - self.full_propagate_assign_async(&mut tmp_rhs, streams); + self.full_propagate_assign(&mut tmp_rhs, streams); (ct, &tmp_rhs) } (false, true) => { tmp_lhs = ct.duplicate_async(streams); - self.full_propagate_assign_async(&mut tmp_lhs, streams); + self.full_propagate_assign(&mut tmp_lhs, streams); (&tmp_lhs, rotate) } (false, false) => { tmp_lhs = ct.duplicate_async(streams); tmp_rhs = rotate.duplicate_async(streams); - self.full_propagate_assign_async(&mut tmp_lhs, streams); - self.full_propagate_assign_async(&mut tmp_rhs, streams); + self.full_propagate_assign(&mut tmp_lhs, streams); + self.full_propagate_assign(&mut tmp_rhs, streams); (&tmp_lhs, &tmp_rhs) } }; @@ -470,20 +470,20 @@ impl CudaServerKey { (true, true) => (ct, rotate), (true, false) => { tmp_rhs = rotate.duplicate_async(streams); - self.full_propagate_assign_async(&mut tmp_rhs, streams); + self.full_propagate_assign(&mut tmp_rhs, streams); (ct, &tmp_rhs) } (false, true) => { tmp_lhs = ct.duplicate_async(streams); - self.full_propagate_assign_async(&mut tmp_lhs, streams); + self.full_propagate_assign(&mut tmp_lhs, streams); (&mut tmp_lhs, rotate) } (false, false) => { tmp_lhs = ct.duplicate_async(streams); tmp_rhs = rotate.duplicate_async(streams); - self.full_propagate_assign_async(&mut tmp_lhs, streams); - self.full_propagate_assign_async(&mut tmp_rhs, streams); + self.full_propagate_assign(&mut tmp_lhs, streams); + self.full_propagate_assign(&mut tmp_rhs, streams); (&mut tmp_lhs, &tmp_rhs) } }; diff --git a/tfhe/src/integer/gpu/server_key/radix/scalar_add.rs b/tfhe/src/integer/gpu/server_key/radix/scalar_add.rs index 597d0e2e55..0d0fb8fc5e 100644 --- a/tfhe/src/integer/gpu/server_key/radix/scalar_add.rs +++ b/tfhe/src/integer/gpu/server_key/radix/scalar_add.rs @@ -60,7 +60,7 @@ impl CudaServerKey { Scalar: DecomposableInto + CastInto, T: CudaIntegerRadixCiphertext, { - let mut result = unsafe { ct.duplicate_async(streams) }; + let mut result = ct.duplicate(streams); self.unchecked_scalar_add_assign(&mut result, scalar, streams); result } @@ -166,7 +166,7 @@ impl CudaServerKey { Scalar: DecomposableInto + CastInto, T: CudaIntegerRadixCiphertext, { - let mut result = unsafe { ct.duplicate_async(streams) }; + let mut result = ct.duplicate(streams); self.scalar_add_assign(&mut result, scalar, streams); result } @@ -185,7 +185,7 @@ impl CudaServerKey { T: CudaIntegerRadixCiphertext, { if !ct.block_carries_are_empty() { - self.full_propagate_assign_async(ct, streams); 
+ self.full_propagate_assign(ct, streams); }; self.unchecked_scalar_add_assign_async(ct, scalar, streams); @@ -213,9 +213,7 @@ impl CudaServerKey { Scalar: DecomposableInto + CastInto, { let mut result; - unsafe { - result = ct_left.duplicate_async(stream); - } + result = ct_left.duplicate(stream); let overflowed = self.unsigned_overflowing_scalar_add_assign(&mut result, scalar, stream); (result, overflowed) } @@ -229,10 +227,8 @@ impl CudaServerKey { where Scalar: DecomposableInto + CastInto, { - unsafe { - if !ct_left.block_carries_are_empty() { - self.full_propagate_assign_async(ct_left, stream); - } + if !ct_left.block_carries_are_empty() { + self.full_propagate_assign(ct_left, stream); } self.unchecked_unsigned_overflowing_scalar_add_assign(ct_left, scalar, stream) } @@ -247,9 +243,7 @@ impl CudaServerKey { Scalar: DecomposableInto + CastInto, { let mut result; - unsafe { - result = ct_left.duplicate_async(stream); - } + result = ct_left.duplicate(stream); let overflowed = self.unchecked_unsigned_overflowing_scalar_add_assign(&mut result, scalar, stream); (result, overflowed) @@ -334,11 +328,9 @@ impl CudaServerKey { Scalar: SignedNumeric + DecomposableInto + CastInto, { let mut tmp_lhs; - unsafe { - tmp_lhs = ct_left.duplicate_async(streams); - if !tmp_lhs.block_carries_are_empty() { - self.full_propagate_assign_async(&mut tmp_lhs, streams); - } + tmp_lhs = ct_left.duplicate(streams); + if !tmp_lhs.block_carries_are_empty() { + self.full_propagate_assign(&mut tmp_lhs, streams); } let trivial: CudaSignedRadixCiphertext = self.create_trivial_radix( diff --git a/tfhe/src/integer/gpu/server_key/radix/scalar_bitwise_op.rs b/tfhe/src/integer/gpu/server_key/radix/scalar_bitwise_op.rs index 337abdd928..4b86abcee5 100644 --- a/tfhe/src/integer/gpu/server_key/radix/scalar_bitwise_op.rs +++ b/tfhe/src/integer/gpu/server_key/radix/scalar_bitwise_op.rs @@ -193,7 +193,7 @@ impl CudaServerKey { T: CudaIntegerRadixCiphertext, { if !ct.block_carries_are_empty() { - self.full_propagate_assign_async(ct, streams); + self.full_propagate_assign(ct, streams); } self.unchecked_scalar_bitop_assign_async(ct, rhs, BitOpType::ScalarAnd, streams); ct.as_mut().info = ct.as_ref().info.after_scalar_bitand(rhs); @@ -234,7 +234,7 @@ impl CudaServerKey { T: CudaIntegerRadixCiphertext, { if !ct.block_carries_are_empty() { - self.full_propagate_assign_async(ct, streams); + self.full_propagate_assign(ct, streams); } self.unchecked_scalar_bitop_assign_async(ct, rhs, BitOpType::ScalarOr, streams); ct.as_mut().info = ct.as_ref().info.after_scalar_bitor(rhs); @@ -275,7 +275,7 @@ impl CudaServerKey { T: CudaIntegerRadixCiphertext, { if !ct.block_carries_are_empty() { - self.full_propagate_assign_async(ct, streams); + self.full_propagate_assign(ct, streams); } self.unchecked_scalar_bitop_assign_async(ct, rhs, BitOpType::ScalarXor, streams); ct.as_mut().info = ct.as_ref().info.after_scalar_bitxor(rhs); diff --git a/tfhe/src/integer/gpu/server_key/radix/scalar_comparison.rs b/tfhe/src/integer/gpu/server_key/radix/scalar_comparison.rs index 848f85cc26..f436b986b0 100644 --- a/tfhe/src/integer/gpu/server_key/radix/scalar_comparison.rs +++ b/tfhe/src/integer/gpu/server_key/radix/scalar_comparison.rs @@ -606,7 +606,7 @@ impl CudaServerKey { ct } else { tmp_lhs = ct.duplicate_async(streams); - self.full_propagate_assign_async(&mut tmp_lhs, streams); + self.full_propagate_assign(&mut tmp_lhs, streams); &tmp_lhs }; @@ -688,7 +688,7 @@ impl CudaServerKey { ct } else { tmp_lhs = ct.duplicate_async(streams); - 
self.full_propagate_assign_async(&mut tmp_lhs, streams); + self.full_propagate_assign(&mut tmp_lhs, streams); &tmp_lhs }; @@ -929,7 +929,7 @@ impl CudaServerKey { ct } else { tmp_lhs = ct.duplicate_async(streams); - self.full_propagate_assign_async(&mut tmp_lhs, streams); + self.full_propagate_assign(&mut tmp_lhs, streams); &tmp_lhs }; @@ -970,7 +970,7 @@ impl CudaServerKey { ct } else { tmp_lhs = ct.duplicate_async(streams); - self.full_propagate_assign_async(&mut tmp_lhs, streams); + self.full_propagate_assign(&mut tmp_lhs, streams); &tmp_lhs }; @@ -1011,7 +1011,7 @@ impl CudaServerKey { ct } else { tmp_lhs = ct.duplicate_async(streams); - self.full_propagate_assign_async(&mut tmp_lhs, streams); + self.full_propagate_assign(&mut tmp_lhs, streams); &tmp_lhs }; @@ -1051,7 +1051,7 @@ impl CudaServerKey { ct } else { tmp_lhs = ct.duplicate_async(streams); - self.full_propagate_assign_async(&mut tmp_lhs, streams); + self.full_propagate_assign(&mut tmp_lhs, streams); &tmp_lhs }; @@ -1156,7 +1156,7 @@ impl CudaServerKey { ct } else { tmp_lhs = ct.duplicate_async(streams); - self.full_propagate_assign_async(&mut tmp_lhs, streams); + self.full_propagate_assign(&mut tmp_lhs, streams); &tmp_lhs }; @@ -1192,7 +1192,7 @@ impl CudaServerKey { ct } else { tmp_lhs = ct.duplicate_async(streams); - self.full_propagate_assign_async(&mut tmp_lhs, streams); + self.full_propagate_assign(&mut tmp_lhs, streams); &tmp_lhs }; diff --git a/tfhe/src/integer/gpu/server_key/radix/scalar_div_mod.rs b/tfhe/src/integer/gpu/server_key/radix/scalar_div_mod.rs index fee26b4625..5524a1ccd2 100644 --- a/tfhe/src/integer/gpu/server_key/radix/scalar_div_mod.rs +++ b/tfhe/src/integer/gpu/server_key/radix/scalar_div_mod.rs @@ -296,7 +296,7 @@ impl CudaServerKey { numerator } else { tmp_numerator = numerator.duplicate_async(streams); - self.full_propagate_assign_async(&mut tmp_numerator, streams); + self.full_propagate_assign(&mut tmp_numerator, streams); &tmp_numerator }; @@ -425,7 +425,7 @@ impl CudaServerKey { numerator } else { tmp_numerator = numerator.duplicate_async(streams); - self.full_propagate_assign_async(&mut tmp_numerator, streams); + self.full_propagate_assign(&mut tmp_numerator, streams); &tmp_numerator }; @@ -536,7 +536,7 @@ impl CudaServerKey { numerator } else { tmp_numerator = numerator.duplicate_async(streams); - self.full_propagate_assign_async(&mut tmp_numerator, streams); + self.full_propagate_assign(&mut tmp_numerator, streams); &tmp_numerator }; @@ -776,7 +776,7 @@ impl CudaServerKey { numerator } else { tmp_numerator = numerator.duplicate_async(streams); - self.full_propagate_assign_async(&mut tmp_numerator, streams); + self.full_propagate_assign(&mut tmp_numerator, streams); &tmp_numerator }; @@ -894,7 +894,7 @@ impl CudaServerKey { numerator } else { tmp_numerator = numerator.duplicate_async(streams); - self.full_propagate_assign_async(&mut tmp_numerator, streams); + self.full_propagate_assign(&mut tmp_numerator, streams); &tmp_numerator }; @@ -1006,7 +1006,7 @@ impl CudaServerKey { numerator } else { tmp_numerator = numerator.duplicate_async(streams); - self.full_propagate_assign_async(&mut tmp_numerator, streams); + self.full_propagate_assign(&mut tmp_numerator, streams); &tmp_numerator }; diff --git a/tfhe/src/integer/gpu/server_key/radix/scalar_mul.rs b/tfhe/src/integer/gpu/server_key/radix/scalar_mul.rs index 7d0383d162..0a87134ee0 100644 --- a/tfhe/src/integer/gpu/server_key/radix/scalar_mul.rs +++ b/tfhe/src/integer/gpu/server_key/radix/scalar_mul.rs @@ -239,7 +239,7 @@ impl CudaServerKey { T: 
CudaIntegerRadixCiphertext, { if !ct.block_carries_are_empty() { - self.full_propagate_assign_async(ct, streams); + self.full_propagate_assign(ct, streams); }; self.unchecked_scalar_mul_assign_async(ct, scalar, streams); diff --git a/tfhe/src/integer/gpu/server_key/radix/scalar_rotate.rs b/tfhe/src/integer/gpu/server_key/radix/scalar_rotate.rs index 240e2144f4..1bddc7ec51 100644 --- a/tfhe/src/integer/gpu/server_key/radix/scalar_rotate.rs +++ b/tfhe/src/integer/gpu/server_key/radix/scalar_rotate.rs @@ -231,9 +231,7 @@ impl CudaServerKey { u32: CastFrom, { if !ct.block_carries_are_empty() { - unsafe { - self.full_propagate_assign_async(ct, stream); - } + self.full_propagate_assign(ct, stream); } unsafe { self.unchecked_scalar_rotate_left_assign_async(ct, n, stream) }; @@ -247,9 +245,7 @@ impl CudaServerKey { u32: CastFrom, { if !ct.block_carries_are_empty() { - unsafe { - self.full_propagate_assign_async(ct, stream); - } + self.full_propagate_assign(ct, stream); } unsafe { self.unchecked_scalar_rotate_right_assign_async(ct, n, stream) }; @@ -262,7 +258,7 @@ impl CudaServerKey { Scalar: CastFrom, u32: CastFrom, { - let mut result = unsafe { ct.duplicate_async(stream) }; + let mut result = ct.duplicate(stream); self.scalar_rotate_left_assign(&mut result, shift, stream); result } @@ -273,7 +269,7 @@ impl CudaServerKey { Scalar: CastFrom, u32: CastFrom, { - let mut result = unsafe { ct.duplicate_async(stream) }; + let mut result = ct.duplicate(stream); self.scalar_rotate_right_assign(&mut result, shift, stream); result } diff --git a/tfhe/src/integer/gpu/server_key/radix/scalar_shift.rs b/tfhe/src/integer/gpu/server_key/radix/scalar_shift.rs index a50a04e063..d27074f56d 100644 --- a/tfhe/src/integer/gpu/server_key/radix/scalar_shift.rs +++ b/tfhe/src/integer/gpu/server_key/radix/scalar_shift.rs @@ -371,7 +371,7 @@ impl CudaServerKey { T: CudaIntegerRadixCiphertext, { if !ct.block_carries_are_empty() { - self.full_propagate_assign_async(ct, streams); + self.full_propagate_assign(ct, streams); } self.unchecked_scalar_right_shift_assign_async(ct, shift, streams); @@ -459,7 +459,7 @@ impl CudaServerKey { T: CudaIntegerRadixCiphertext, { if !ct.block_carries_are_empty() { - self.full_propagate_assign_async(ct, streams); + self.full_propagate_assign(ct, streams); } self.unchecked_scalar_left_shift_assign_async(ct, shift, streams); @@ -542,11 +542,11 @@ impl CudaServerKey { u32: CastFrom, T: CudaIntegerRadixCiphertext, { - unsafe { - if !ct.block_carries_are_empty() { - self.full_propagate_assign_async(ct, streams); - } + if !ct.block_carries_are_empty() { + self.full_propagate_assign(ct, streams); + } + unsafe { self.unchecked_scalar_left_shift_assign_async(ct, shift, streams); }; streams.synchronize(); @@ -562,11 +562,11 @@ impl CudaServerKey { u32: CastFrom, T: CudaIntegerRadixCiphertext, { - unsafe { - if !ct.block_carries_are_empty() { - self.full_propagate_assign_async(ct, streams); - } + if !ct.block_carries_are_empty() { + self.full_propagate_assign(ct, streams); + } + unsafe { self.unchecked_scalar_right_shift_assign_async(ct, shift, streams); }; streams.synchronize(); diff --git a/tfhe/src/integer/gpu/server_key/radix/scalar_sub.rs b/tfhe/src/integer/gpu/server_key/radix/scalar_sub.rs index cefdf2dfde..9074d681c2 100644 --- a/tfhe/src/integer/gpu/server_key/radix/scalar_sub.rs +++ b/tfhe/src/integer/gpu/server_key/radix/scalar_sub.rs @@ -155,7 +155,7 @@ impl CudaServerKey { T: CudaIntegerRadixCiphertext, { if !ct.block_carries_are_empty() { - self.full_propagate_assign_async(ct, streams); + 
self.full_propagate_assign(ct, streams); }; self.unchecked_scalar_sub_assign_async(ct, scalar, streams); @@ -221,7 +221,7 @@ impl CudaServerKey { unsafe { tmp_lhs = ct_left.duplicate_async(streams); if !tmp_lhs.block_carries_are_empty() { - self.full_propagate_assign_async(&mut tmp_lhs, streams); + self.full_propagate_assign(&mut tmp_lhs, streams); } } diff --git a/tfhe/src/integer/gpu/server_key/radix/shift.rs b/tfhe/src/integer/gpu/server_key/radix/shift.rs index 1a7f58f69d..7115695a0d 100644 --- a/tfhe/src/integer/gpu/server_key/radix/shift.rs +++ b/tfhe/src/integer/gpu/server_key/radix/shift.rs @@ -267,20 +267,20 @@ impl CudaServerKey { (true, true) => (ct, shift), (true, false) => { tmp_rhs = shift.duplicate_async(streams); - self.full_propagate_assign_async(&mut tmp_rhs, streams); + self.full_propagate_assign(&mut tmp_rhs, streams); (ct, &tmp_rhs) } (false, true) => { tmp_lhs = ct.duplicate_async(streams); - self.full_propagate_assign_async(&mut tmp_lhs, streams); + self.full_propagate_assign(&mut tmp_lhs, streams); (&tmp_lhs, shift) } (false, false) => { tmp_lhs = ct.duplicate_async(streams); tmp_rhs = shift.duplicate_async(streams); - self.full_propagate_assign_async(&mut tmp_lhs, streams); - self.full_propagate_assign_async(&mut tmp_rhs, streams); + self.full_propagate_assign(&mut tmp_lhs, streams); + self.full_propagate_assign(&mut tmp_rhs, streams); (&tmp_lhs, &tmp_rhs) } }; @@ -312,20 +312,20 @@ impl CudaServerKey { (true, true) => (ct, shift), (true, false) => { tmp_rhs = shift.duplicate_async(streams); - self.full_propagate_assign_async(&mut tmp_rhs, streams); + self.full_propagate_assign(&mut tmp_rhs, streams); (ct, &tmp_rhs) } (false, true) => { tmp_lhs = ct.duplicate_async(streams); - self.full_propagate_assign_async(&mut tmp_lhs, streams); + self.full_propagate_assign(&mut tmp_lhs, streams); (&mut tmp_lhs, shift) } (false, false) => { tmp_lhs = ct.duplicate_async(streams); tmp_rhs = shift.duplicate_async(streams); - self.full_propagate_assign_async(&mut tmp_lhs, streams); - self.full_propagate_assign_async(&mut tmp_rhs, streams); + self.full_propagate_assign(&mut tmp_lhs, streams); + self.full_propagate_assign(&mut tmp_rhs, streams); (&mut tmp_lhs, &tmp_rhs) } }; @@ -420,20 +420,20 @@ impl CudaServerKey { (true, true) => (ct, shift), (true, false) => { tmp_rhs = shift.duplicate_async(streams); - self.full_propagate_assign_async(&mut tmp_rhs, streams); + self.full_propagate_assign(&mut tmp_rhs, streams); (ct, &tmp_rhs) } (false, true) => { tmp_lhs = ct.duplicate_async(streams); - self.full_propagate_assign_async(&mut tmp_lhs, streams); + self.full_propagate_assign(&mut tmp_lhs, streams); (&tmp_lhs, shift) } (false, false) => { tmp_lhs = ct.duplicate_async(streams); tmp_rhs = shift.duplicate_async(streams); - self.full_propagate_assign_async(&mut tmp_lhs, streams); - self.full_propagate_assign_async(&mut tmp_rhs, streams); + self.full_propagate_assign(&mut tmp_lhs, streams); + self.full_propagate_assign(&mut tmp_rhs, streams); (&tmp_lhs, &tmp_rhs) } }; @@ -465,20 +465,20 @@ impl CudaServerKey { (true, true) => (ct, shift), (true, false) => { tmp_rhs = shift.duplicate_async(streams); - self.full_propagate_assign_async(&mut tmp_rhs, streams); + self.full_propagate_assign(&mut tmp_rhs, streams); (ct, &tmp_rhs) } (false, true) => { tmp_lhs = ct.duplicate_async(streams); - self.full_propagate_assign_async(&mut tmp_lhs, streams); + self.full_propagate_assign(&mut tmp_lhs, streams); (&mut tmp_lhs, shift) } (false, false) => { tmp_lhs = ct.duplicate_async(streams); tmp_rhs = 
shift.duplicate_async(streams); - self.full_propagate_assign_async(&mut tmp_lhs, streams); - self.full_propagate_assign_async(&mut tmp_rhs, streams); + self.full_propagate_assign(&mut tmp_lhs, streams); + self.full_propagate_assign(&mut tmp_rhs, streams); (&mut tmp_lhs, &tmp_rhs) } }; diff --git a/tfhe/src/integer/gpu/server_key/radix/sub.rs b/tfhe/src/integer/gpu/server_key/radix/sub.rs index 6e2bbfa990..ce6120b116 100644 --- a/tfhe/src/integer/gpu/server_key/radix/sub.rs +++ b/tfhe/src/integer/gpu/server_key/radix/sub.rs @@ -94,7 +94,7 @@ impl CudaServerKey { streams: &CudaStreams, ) { let neg = self.unchecked_neg_async(ct_right, streams); - self.unchecked_add_assign_async(ct_left, &neg, streams); + self.unchecked_add_assign(ct_left, &neg, streams); } /// Computes homomorphically a subtraction between two ciphertexts encrypting integer values. @@ -257,18 +257,18 @@ impl CudaServerKey { (true, true) => (ct_left, ct_right), (true, false) => { tmp_rhs = ct_right.duplicate_async(streams); - self.full_propagate_assign_async(&mut tmp_rhs, streams); + self.full_propagate_assign(&mut tmp_rhs, streams); (ct_left, &tmp_rhs) } (false, true) => { - self.full_propagate_assign_async(ct_left, streams); + self.full_propagate_assign(ct_left, streams); (ct_left, ct_right) } (false, false) => { tmp_rhs = ct_right.duplicate_async(streams); - self.full_propagate_assign_async(ct_left, streams); - self.full_propagate_assign_async(&mut tmp_rhs, streams); + self.full_propagate_assign(ct_left, streams); + self.full_propagate_assign(&mut tmp_rhs, streams); (ct_left, &tmp_rhs) } }; @@ -299,14 +299,14 @@ impl CudaServerKey { (true, false) => { unsafe { tmp_rhs = ct_right.duplicate_async(stream); - self.full_propagate_assign_async(&mut tmp_rhs, stream); + self.full_propagate_assign(&mut tmp_rhs, stream); } (ct_left, &tmp_rhs) } (false, true) => { unsafe { tmp_lhs = ct_left.duplicate_async(stream); - self.full_propagate_assign_async(&mut tmp_lhs, stream); + self.full_propagate_assign(&mut tmp_lhs, stream); } (&tmp_lhs, ct_right) } @@ -315,8 +315,8 @@ impl CudaServerKey { tmp_lhs = ct_left.duplicate_async(stream); tmp_rhs = ct_right.duplicate_async(stream); - self.full_propagate_assign_async(&mut tmp_lhs, stream); - self.full_propagate_assign_async(&mut tmp_rhs, stream); + self.full_propagate_assign(&mut tmp_lhs, stream); + self.full_propagate_assign(&mut tmp_rhs, stream); } (&tmp_lhs, &tmp_rhs) @@ -521,14 +521,14 @@ impl CudaServerKey { (true, false) => { unsafe { tmp_rhs = ct_right.duplicate_async(stream); - self.full_propagate_assign_async(&mut tmp_rhs, stream); + self.full_propagate_assign(&mut tmp_rhs, stream); } (ct_left, &tmp_rhs) } (false, true) => { unsafe { tmp_lhs = ct_left.duplicate_async(stream); - self.full_propagate_assign_async(&mut tmp_lhs, stream); + self.full_propagate_assign(&mut tmp_lhs, stream); } (&tmp_lhs, ct_right) } @@ -537,8 +537,8 @@ impl CudaServerKey { tmp_lhs = ct_left.duplicate_async(stream); tmp_rhs = ct_right.duplicate_async(stream); - self.full_propagate_assign_async(&mut tmp_lhs, stream); - self.full_propagate_assign_async(&mut tmp_rhs, stream); + self.full_propagate_assign(&mut tmp_lhs, stream); + self.full_propagate_assign(&mut tmp_rhs, stream); } (&tmp_lhs, &tmp_rhs) diff --git a/tfhe/src/integer/gpu/server_key/radix/vector_comparisons.rs b/tfhe/src/integer/gpu/server_key/radix/vector_comparisons.rs index a0433ebbe6..3feac0d287 100644 --- a/tfhe/src/integer/gpu/server_key/radix/vector_comparisons.rs +++ b/tfhe/src/integer/gpu/server_key/radix/vector_comparisons.rs @@ -258,11 
+258,9 @@ impl CudaServerKey { let lhs = if lhs.iter().any(|ct| !ct.block_carries_are_empty()) { // Need a way to parallelize this step for ct in lhs.iter() { - let mut temp_ct = unsafe { ct.duplicate_async(streams) }; + let mut temp_ct = ct.duplicate(streams); if !temp_ct.block_carries_are_empty() { - unsafe { - self.full_propagate_assign_async(&mut temp_ct, streams); - } + self.full_propagate_assign(&mut temp_ct, streams); } tmp_lhs.push(temp_ct); } @@ -275,11 +273,9 @@ impl CudaServerKey { let rhs = if rhs.iter().any(|ct| !ct.block_carries_are_empty()) { // Need a way to parallelize this step for ct in rhs.iter() { - let mut temp_ct = unsafe { ct.duplicate_async(streams) }; + let mut temp_ct = ct.duplicate(streams); if !temp_ct.block_carries_are_empty() { - unsafe { - self.full_propagate_assign_async(&mut temp_ct, streams); - } + self.full_propagate_assign(&mut temp_ct, streams); } tmp_rhs.push(temp_ct); } @@ -413,11 +409,9 @@ impl CudaServerKey { let lhs = if lhs.iter().any(|ct| !ct.block_carries_are_empty()) { // Need a way to parallelize this step for ct in lhs.iter() { - let mut temp_ct = unsafe { ct.duplicate_async(streams) }; + let mut temp_ct = ct.duplicate(streams); if !temp_ct.block_carries_are_empty() { - unsafe { - self.full_propagate_assign_async(&mut temp_ct, streams); - } + self.full_propagate_assign(&mut temp_ct, streams); } tmp_lhs.push(temp_ct); } @@ -430,11 +424,9 @@ impl CudaServerKey { let rhs = if rhs.iter().any(|ct| !ct.block_carries_are_empty()) { // Need a way to parallelize this step for ct in rhs.iter() { - let mut temp_ct = unsafe { ct.duplicate_async(streams) }; + let mut temp_ct = ct.duplicate(streams); if !temp_ct.block_carries_are_empty() { - unsafe { - self.full_propagate_assign_async(&mut temp_ct, streams); - } + self.full_propagate_assign(&mut temp_ct, streams); } tmp_rhs.push(temp_ct); } diff --git a/tfhe/src/integer/gpu/server_key/radix/vector_find.rs b/tfhe/src/integer/gpu/server_key/radix/vector_find.rs index 56ff9642b3..7bbb3b8be8 100644 --- a/tfhe/src/integer/gpu/server_key/radix/vector_find.rs +++ b/tfhe/src/integer/gpu/server_key/radix/vector_find.rs @@ -250,9 +250,7 @@ impl CudaServerKey { self.unchecked_match_value(ct, matches, streams) } else { let mut clone = ct.duplicate(streams); - unsafe { - self.full_propagate_assign_async(&mut clone, streams); - } + self.full_propagate_assign(&mut clone, streams); self.unchecked_match_value(&clone, matches, streams) } } @@ -366,9 +364,7 @@ impl CudaServerKey { self.unchecked_match_value_or(ct, matches, or_value, streams) } else { let mut clone = ct.duplicate(streams); - unsafe { - self.full_propagate_assign_async(&mut clone, streams); - } + self.full_propagate_assign(&mut clone, streams); self.unchecked_match_value_or(&clone, matches, or_value, streams) } } @@ -446,11 +442,9 @@ impl CudaServerKey { let cts = if cts.iter().any(|ct| !ct.block_carries_are_empty()) { // Need a way to parallelize this step for ct in cts.iter() { - let mut temp_ct = unsafe { ct.duplicate_async(streams) }; + let mut temp_ct = ct.duplicate(streams); if !temp_ct.block_carries_are_empty() { - unsafe { - self.full_propagate_assign_async(&mut temp_ct, streams); - } + self.full_propagate_assign(&mut temp_ct, streams); } tmp_cts.push(temp_ct); } @@ -464,9 +458,7 @@ impl CudaServerKey { value } else { tmp_value = value.duplicate(streams); - unsafe { - self.full_propagate_assign_async(&mut tmp_value, streams); - } + self.full_propagate_assign(&mut tmp_value, streams); &tmp_value }; @@ -550,11 +542,9 @@ impl CudaServerKey { let cts = if 
cts.iter().any(|ct| !ct.block_carries_are_empty()) { // Need a way to parallelize this step for ct in cts.iter() { - let mut temp_ct = unsafe { ct.duplicate_async(streams) }; + let mut temp_ct = ct.duplicate(streams); if !temp_ct.block_carries_are_empty() { - unsafe { - self.full_propagate_assign_async(&mut temp_ct, streams); - } + self.full_propagate_assign(&mut temp_ct, streams); } tmp_cts.push(temp_ct); } @@ -642,9 +632,7 @@ impl CudaServerKey { ct } else { tmp_ct = ct.duplicate(streams); - unsafe { - self.full_propagate_assign_async(&mut tmp_ct, streams); - } + self.full_propagate_assign(&mut tmp_ct, streams); &tmp_ct }; self.unchecked_is_in_clears(ct, clears, streams) @@ -744,9 +732,7 @@ impl CudaServerKey { ct } else { tmp_ct = ct.duplicate(streams); - unsafe { - self.full_propagate_assign_async(&mut tmp_ct, streams); - } + self.full_propagate_assign(&mut tmp_ct, streams); streams.synchronize(); &tmp_ct }; @@ -873,9 +859,7 @@ impl CudaServerKey { ct } else { tmp_ct = ct.duplicate(streams); - unsafe { - self.full_propagate_assign_async(&mut tmp_ct, streams); - } + self.full_propagate_assign(&mut tmp_ct, streams); streams.synchronize(); &tmp_ct }; @@ -978,11 +962,9 @@ impl CudaServerKey { let cts = if cts.iter().any(|ct| !ct.block_carries_are_empty()) { // Need a way to parallelize this step for ct in cts.iter() { - let mut temp_ct = unsafe { ct.duplicate_async(streams) }; + let mut temp_ct = ct.duplicate(streams); if !temp_ct.block_carries_are_empty() { - unsafe { - self.full_propagate_assign_async(&mut temp_ct, streams); - } + self.full_propagate_assign(&mut temp_ct, streams); } tmp_cts.push(temp_ct); } @@ -996,9 +978,7 @@ impl CudaServerKey { value } else { tmp_value = value.duplicate(streams); - unsafe { - self.full_propagate_assign_async(&mut tmp_value, streams); - } + self.full_propagate_assign(&mut tmp_value, streams); &tmp_value }; self.unchecked_index_of(cts, value, streams) @@ -1100,11 +1080,9 @@ impl CudaServerKey { let cts = if cts.iter().any(|ct| !ct.block_carries_are_empty()) { // Need a way to parallelize this step for ct in cts.iter() { - let mut temp_ct = unsafe { ct.duplicate_async(streams) }; + let mut temp_ct = ct.duplicate(streams); if !temp_ct.block_carries_are_empty() { - unsafe { - self.full_propagate_assign_async(&mut temp_ct, streams); - } + self.full_propagate_assign(&mut temp_ct, streams); } tmp_cts.push(temp_ct); } @@ -1231,11 +1209,9 @@ impl CudaServerKey { let cts = if cts.iter().any(|ct| !ct.block_carries_are_empty()) { // Need a way to parallelize this step for ct in cts.iter() { - let mut temp_ct = unsafe { ct.duplicate_async(streams) }; + let mut temp_ct = ct.duplicate(streams); if !temp_ct.block_carries_are_empty() { - unsafe { - self.full_propagate_assign_async(&mut temp_ct, streams); - } + self.full_propagate_assign(&mut temp_ct, streams); } tmp_cts.push(temp_ct); } @@ -1365,11 +1341,9 @@ impl CudaServerKey { let cts = if cts.iter().any(|ct| !ct.block_carries_are_empty()) { // Need a way to parallelize this step for ct in cts.iter() { - let mut temp_ct = unsafe { ct.duplicate_async(streams) }; + let mut temp_ct = ct.duplicate(streams); if !temp_ct.block_carries_are_empty() { - unsafe { - self.full_propagate_assign_async(&mut temp_ct, streams); - } + self.full_propagate_assign(&mut temp_ct, streams); } tmp_cts.push(temp_ct); } @@ -1383,9 +1357,7 @@ impl CudaServerKey { value } else { tmp_value = value.duplicate(streams); - unsafe { - self.full_propagate_assign_async(&mut tmp_value, streams); - } + self.full_propagate_assign(&mut tmp_value, streams); 
&tmp_value }; self.unchecked_first_index_of(cts, value, streams) @@ -1640,7 +1612,7 @@ impl CudaServerKey { streams, ); } - let mut temp = unsafe { aggregated_vector.duplicate_async(streams) }; + let mut temp = aggregated_vector.duplicate(streams); let mut aggregated_mut_slice = aggregated_vector .as_mut() .d_blocks @@ -1712,13 +1684,11 @@ impl CudaServerKey { let last_chunk_size = one_hot_vector.len() - (num_chunks - 1) * chunk_size; for ct_idx in 0..last_chunk_size { let one_hot_idx = (num_chunks - 1) * chunk_size + ct_idx; - unsafe { - self.unchecked_add_assign_async( - &mut aggregated_vector, - &one_hot_vector[one_hot_idx], - streams, - ); - } + self.unchecked_add_assign( + &mut aggregated_vector, + &one_hot_vector[one_hot_idx], + streams, + ); } let message_extract_lut = @@ -1745,7 +1715,7 @@ impl CudaServerKey { .as_mut_slice(0..lwe_size * num_ct_blocks, 0) .unwrap(); unsafe { - let mut temp = aggregated_vector.duplicate_async(streams); + let mut temp = aggregated_vector.duplicate(streams); let aggregated_slice = temp .as_mut() .d_blocks @@ -1805,7 +1775,7 @@ impl CudaServerKey { } } unsafe { - let mut temp = aggregated_vector.duplicate_async(streams); + let mut temp = aggregated_vector.duplicate(streams); let aggregated_slice = temp .as_mut() .d_blocks @@ -1941,7 +1911,7 @@ impl CudaServerKey { let mut first_true: T = unsafe { self.create_trivial_zero_radix_async(num_ct_blocks, streams) }; - let mut clone_ct = unsafe { values.duplicate_async(streams) }; + let mut clone_ct = values.duplicate(streams); let mut slice_in = clone_ct .as_mut() .d_blocks @@ -2022,7 +1992,7 @@ impl CudaServerKey { } }); - let cloned_ct = unsafe { first_true.duplicate_async(streams) }; + let cloned_ct = first_true.duplicate(streams); let slice_in_final = cloned_ct .as_ref() .d_blocks diff --git a/tfhe/src/integer/server_key/radix_parallel/tests_long_run/mod.rs b/tfhe/src/integer/server_key/radix_parallel/tests_long_run/mod.rs index 93a63d6699..0790444f32 100644 --- a/tfhe/src/integer/server_key/radix_parallel/tests_long_run/mod.rs +++ b/tfhe/src/integer/server_key/radix_parallel/tests_long_run/mod.rs @@ -3,4 +3,4 @@ pub(crate) mod test_random_op_sequence; pub(crate) mod test_signed_erc20; pub(crate) mod test_signed_random_op_sequence; pub(crate) const NB_CTXT_LONG_RUN: usize = 32; -pub(crate) const NB_TESTS_LONG_RUN: usize = 20000; +pub(crate) const NB_TESTS_LONG_RUN: usize = 5; diff --git a/tfhe/src/shortint/ciphertext/common.rs b/tfhe/src/shortint/ciphertext/common.rs index 2ae8b23f07..74f04e3e34 100644 --- a/tfhe/src/shortint/ciphertext/common.rs +++ b/tfhe/src/shortint/ciphertext/common.rs @@ -65,7 +65,7 @@ impl MaxNoiseLevel { Debug, PartialEq, Eq, PartialOrd, Ord, Copy, Clone, Serialize, Deserialize, Versionize, )] #[versionize(NoiseLevelVersions)] -pub struct NoiseLevel(u64); +pub struct NoiseLevel(pub(crate) u64); impl NoiseLevel { pub const NOMINAL: Self = Self(1); @@ -148,7 +148,7 @@ impl MaxDegree { Debug, PartialEq, Eq, PartialOrd, Ord, Copy, Clone, Serialize, Deserialize, Versionize, )] #[versionize(DegreeVersions)] -pub struct Degree(pub(super) u64); +pub struct Degree(pub(crate) u64); impl Degree { pub fn new(degree: u64) -> Self {
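
Note on the calling convention the hunks above converge on: the `unsafe fn full_propagate_assign_async` (whose safety contract required the caller to keep inputs alive and to synchronize the streams) is replaced by the safe `full_propagate_assign`, which wraps the asynchronous launch in an internal `unsafe` block and calls `streams.synchronize()` before returning; call sites drop their `unsafe` blocks, and `duplicate_async` is swapped for the safe `duplicate` where the copy no longer needs to stay asynchronous. A minimal sketch of the resulting call-site pattern, assuming crate-internal code with a `CudaServerKey` value `sks`, a ciphertext `ct: T` with `T: CudaIntegerRadixCiphertext`, and `streams: &CudaStreams` (the variable names are illustrative only, and `full_propagate_assign` is `pub(crate)`, so this only compiles inside the `tfhe::integer::gpu` module):

    // Previous pattern: the caller upheld the safety contract and synchronized.
    // let mut tmp = unsafe { ct.duplicate_async(streams) };
    // unsafe { sks.full_propagate_assign_async(&mut tmp, streams) };
    // streams.synchronize();

    // New pattern: the safe wrapper synchronizes internally before returning.
    let mut tmp = ct.duplicate(streams);
    if !tmp.block_carries_are_empty() {
        sks.full_propagate_assign(&mut tmp, streams);
    }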