From 2a4026c76191fd110536e9275d734e3d28c99069 Mon Sep 17 00:00:00 2001
From: Pedro Alves
Date: Wed, 21 Aug 2024 13:38:44 +0000
Subject: [PATCH] fix(gpu): fix some edge-cases (and booleans) on compression

---
 .github/workflows/gpu_fast_h100_tests.yml      |   1 +
 .github/workflows/gpu_fast_tests.yml           |   1 +
 .../workflows/gpu_full_multi_gpu_tests.yml     |   4 +
 Makefile                                       |  13 +++
 .../cuda/include/compression.h                 |  11 +-
 .../src/integer/compression/compression.cu    |  19 ++--
 .../src/integer/compression/compression.cuh   |  85 +++++++++++----
 backends/tfhe-cuda-backend/src/cuda_bind.rs    |   1 +
 .../ciphertext/compressed_ciphertext_list.rs  | 103 ++++++++----
 .../gpu/list_compression/server_keys.rs        |   6 +-
 tfhe/src/integer/gpu/mod.rs                    |   2 +
 11 files changed, 150 insertions(+), 96 deletions(-)

diff --git a/.github/workflows/gpu_fast_h100_tests.yml b/.github/workflows/gpu_fast_h100_tests.yml
index f6cb839d1a..76366541be 100644
--- a/.github/workflows/gpu_fast_h100_tests.yml
+++ b/.github/workflows/gpu_fast_h100_tests.yml
@@ -147,6 +147,7 @@ jobs:
       - name: Run core crypto and internal CUDA backend tests
         run: |
           BIG_TESTS_INSTANCE=TRUE make test_core_crypto_gpu
+          BIG_TESTS_INSTANCE=TRUE make test_integer_compression_gpu
           BIG_TESTS_INSTANCE=TRUE make test_cuda_backend
 
       - name: Run user docs tests
diff --git a/.github/workflows/gpu_fast_tests.yml b/.github/workflows/gpu_fast_tests.yml
index 2a9bc9ac44..b2fc850ef1 100644
--- a/.github/workflows/gpu_fast_tests.yml
+++ b/.github/workflows/gpu_fast_tests.yml
@@ -145,6 +145,7 @@ jobs:
       - name: Run core crypto and internal CUDA backend tests
         run: |
           make test_core_crypto_gpu
+          make test_integer_compression_gpu
           make test_cuda_backend
 
       - name: Run user docs tests
diff --git a/.github/workflows/gpu_full_multi_gpu_tests.yml b/.github/workflows/gpu_full_multi_gpu_tests.yml
index 1f3c3db4bc..3f98e96e5c 100644
--- a/.github/workflows/gpu_full_multi_gpu_tests.yml
+++ b/.github/workflows/gpu_full_multi_gpu_tests.yml
@@ -144,6 +144,10 @@ jobs:
         if: ${{ !cancelled() }}
         run: nvidia-smi
 
+      - name: Run multi-bit CUDA integer compression tests
+        run: |
+          BIG_TESTS_INSTANCE=TRUE make test_integer_compression_gpu
+
       # No need to test core_crypto and classic PBS in integer since it's already tested on single GPU.
       - name: Run multi-bit CUDA integer tests
         run: |
diff --git a/Makefile b/Makefile
index 7b5d576ba0..199c8034d6 100644
--- a/Makefile
+++ b/Makefile
@@ -481,6 +481,13 @@ test_integer_gpu: install_rs_build_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --doc --profile $(CARGO_PROFILE) \
 		--features=$(TARGET_ARCH_FEATURE),integer,gpu -p $(TFHE_SPEC) -- integer::gpu::server_key::
 
+.PHONY: test_integer_compression_gpu
+test_integer_compression_gpu: install_rs_build_toolchain
+	RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --profile $(CARGO_PROFILE) \
+		--features=$(TARGET_ARCH_FEATURE),integer,gpu -p $(TFHE_SPEC) -- integer::gpu::ciphertext::compressed_ciphertext_list::tests::test_gpu_ciphertext_compression
+	RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --doc --profile $(CARGO_PROFILE) \
+		--features=$(TARGET_ARCH_FEATURE),integer,gpu -p $(TFHE_SPEC) -- integer::gpu::ciphertext::compress
+
 .PHONY: test_integer_gpu_ci # Run the tests for integer ci on gpu backend
 test_integer_gpu_ci: install_rs_check_toolchain install_cargo_nextest
 	BIG_TESTS_INSTANCE="$(BIG_TESTS_INSTANCE)" \
@@ -883,6 +890,12 @@ bench_integer_gpu: install_rs_check_toolchain
 		--bench integer-bench \
 		--features=$(TARGET_ARCH_FEATURE),integer,gpu,internal-keycache,nightly-avx512 -p $(TFHE_SPEC) --
 
+.PHONY: bench_integer_compression_gpu
+bench_integer_compression_gpu: install_rs_check_toolchain
+	RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
+		--bench glwe_packing_compression-integer-bench \
+		--features=$(TARGET_ARCH_FEATURE),integer,internal-keycache,gpu -p $(TFHE_SPEC) --
+
 .PHONY: bench_integer_multi_bit # Run benchmarks for unsigned integer using multi-bit parameters
 bench_integer_multi_bit: install_rs_check_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_TYPE=MULTI_BIT \
diff --git a/backends/tfhe-cuda-backend/cuda/include/compression.h b/backends/tfhe-cuda-backend/cuda/include/compression.h
index 6c1beab5b4..8fd02d677a 100644
--- a/backends/tfhe-cuda-backend/cuda/include/compression.h
+++ b/backends/tfhe-cuda-backend/cuda/include/compression.h
@@ -18,7 +18,8 @@ void scratch_cuda_integer_decompress_radix_ciphertext_64(
     uint32_t compression_glwe_dimension, uint32_t compression_polynomial_size,
     uint32_t lwe_dimension, uint32_t pbs_level, uint32_t pbs_base_log,
     uint32_t num_lwes, uint32_t message_modulus, uint32_t carry_modulus,
-    PBS_TYPE pbs_type, uint32_t storage_log_modulus, bool allocate_gpu_memory);
+    PBS_TYPE pbs_type, uint32_t storage_log_modulus, uint32_t body_count,
+    bool allocate_gpu_memory);
 
 void cuda_integer_compress_radix_ciphertext_64(
     void **streams, uint32_t *gpu_indexes, uint32_t gpu_count,
@@ -94,6 +95,7 @@ template <typename Torus> struct int_decompression {
 
   uint32_t storage_log_modulus;
 
+  uint32_t num_lwes;
   uint32_t body_count;
 
   Torus *tmp_extracted_glwe;
@@ -104,12 +106,13 @@ template <typename Torus> struct int_decompression {
 
   int_decompression(cudaStream_t *streams, uint32_t *gpu_indexes,
                     uint32_t gpu_count, int_radix_params encryption_params,
                     int_radix_params compression_params,
-                    uint32_t num_radix_blocks, uint32_t storage_log_modulus,
-                    bool allocate_gpu_memory) {
+                    uint32_t num_radix_blocks, uint32_t body_count,
+                    uint32_t storage_log_modulus, bool allocate_gpu_memory) {
     this->encryption_params = encryption_params;
     this->compression_params = compression_params;
     this->storage_log_modulus = storage_log_modulus;
-    this->body_count = num_radix_blocks;
+    this->num_lwes = num_radix_blocks;
+    this->body_count = body_count;
     if (allocate_gpu_memory) {
       Torus glwe_accumulator_size = (compression_params.glwe_dimension + 1) *
diff --git a/backends/tfhe-cuda-backend/cuda/src/integer/compression/compression.cu b/backends/tfhe-cuda-backend/cuda/src/integer/compression/compression.cu
index 5e0da5a8c8..841041f27b 100644
--- a/backends/tfhe-cuda-backend/cuda/src/integer/compression/compression.cu
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/compression/compression.cu
@@ -25,24 +25,25 @@ void scratch_cuda_integer_decompress_radix_ciphertext_64(
     uint32_t compression_glwe_dimension, uint32_t compression_polynomial_size,
     uint32_t lwe_dimension, uint32_t pbs_level, uint32_t pbs_base_log,
     uint32_t num_lwes, uint32_t message_modulus, uint32_t carry_modulus,
-    PBS_TYPE pbs_type, uint32_t storage_log_modulus, bool allocate_gpu_memory) {
+    PBS_TYPE pbs_type, uint32_t storage_log_modulus, uint32_t body_count,
+    bool allocate_gpu_memory) {
+  // Decompression doesn't keyswitch, so big and small dimensions are the same
   int_radix_params encryption_params(
       pbs_type, encryption_glwe_dimension, encryption_polynomial_size,
-      (encryption_glwe_dimension + 1) * encryption_polynomial_size,
-      lwe_dimension, 0, 0, pbs_level, pbs_base_log, 0, message_modulus,
-      carry_modulus);
+      lwe_dimension, lwe_dimension, 0, 0, pbs_level, pbs_base_log, 0,
+      message_modulus, carry_modulus);
 
   int_radix_params compression_params(
       pbs_type, compression_glwe_dimension, compression_polynomial_size,
-      (compression_glwe_dimension + 1) * compression_polynomial_size,
-      lwe_dimension, 0, 0, pbs_level, pbs_base_log, 0, message_modulus,
-      carry_modulus);
+      lwe_dimension, compression_glwe_dimension * compression_polynomial_size,
+      0, 0, pbs_level, pbs_base_log, 0, message_modulus, carry_modulus);
 
   scratch_cuda_integer_decompress_radix_ciphertext_64<uint64_t>(
       (cudaStream_t *)(streams), gpu_indexes, gpu_count,
-      (int_decompression<uint64_t> **)mem_ptr, num_lwes, encryption_params,
-      compression_params, storage_log_modulus, allocate_gpu_memory);
+      (int_decompression<uint64_t> **)mem_ptr, num_lwes, body_count,
+      encryption_params, compression_params, storage_log_modulus,
+      allocate_gpu_memory);
 }
 
 void cuda_integer_compress_radix_ciphertext_64(
     void **streams, uint32_t *gpu_indexes, uint32_t gpu_count,
diff --git a/backends/tfhe-cuda-backend/cuda/src/integer/compression/compression.cuh b/backends/tfhe-cuda-backend/cuda/src/integer/compression/compression.cuh
index 229495e324..26bd6befed 100644
--- a/backends/tfhe-cuda-backend/cuda/src/integer/compression/compression.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/compression/compression.cuh
@@ -45,7 +45,6 @@ __host__ void host_pack(cudaStream_t stream, uint32_t gpu_index,
   auto log_modulus = mem_ptr->storage_log_modulus;
   auto in_len = params.glwe_dimension * params.polynomial_size + body_count;
   auto number_bits_to_pack = in_len * log_modulus;
-
   auto nbits = sizeof(Torus) * 8;
   // number_bits_to_pack.div_ceil(Scalar::BITS)
   auto len = (number_bits_to_pack + nbits - 1) / nbits;
@@ -80,6 +79,7 @@ __host__ void host_integer_compress(cudaStream_t *streams,
   uint32_t glwe_out_size = (compression_params.glwe_dimension + 1) *
                            compression_params.polynomial_size;
   uint32_t num_glwes = num_lwes / mem_ptr->lwe_per_glwe + 1;
+  auto body_count = min(num_lwes, mem_ptr->lwe_per_glwe);
 
   // Keyswitch LWEs to GLWE
   auto tmp_glwe_array_out = mem_ptr->tmp_glwe_array_out;
@@ -92,11 +92,9 @@ __host__ void host_integer_compress(cudaStream_t *streams,
         streams[0], gpu_indexes[0], glwe_out, lwe_subset, fp_ksk[0],
         fp_ks_buffer, input_lwe_dimension, compression_params.glwe_dimension,
         compression_params.polynomial_size, compression_params.ks_base_log,
-        compression_params.ks_level, min(num_lwes, mem_ptr->lwe_per_glwe));
+        compression_params.ks_level, body_count);
   }
 
-  auto body_count = min(num_lwes, mem_ptr->lwe_per_glwe);
-
   // Modulus switch
   host_modulus_switch_inplace(streams[0], gpu_indexes[0], tmp_glwe_array_out,
                               num_glwes *
@@ -156,15 +154,15 @@ __host__ void host_extract(cudaStream_t stream, uint32_t gpu_index,
 
   auto log_modulus = mem_ptr->storage_log_modulus;
   uint32_t body_count = mem_ptr->body_count;
+
   auto initial_out_len =
-      params.glwe_dimension * params.polynomial_size + body_count * body_count;
+      params.glwe_dimension * params.polynomial_size + body_count;
 
   // We assure the tail of the glwe is zeroed
-  auto zeroed_slice =
-      glwe_array_out + params.glwe_dimension * params.polynomial_size;
-  cuda_memset_async(zeroed_slice, 0, params.polynomial_size * sizeof(Torus),
+  auto zeroed_slice = glwe_array_out + initial_out_len;
+  cuda_memset_async(zeroed_slice, 0,
+                    (params.polynomial_size - body_count) * sizeof(Torus),
                     stream, gpu_index);
-
   int num_blocks = 0, num_threads = 0;
   getNumBlocksAndThreads(initial_out_len, 128, num_blocks, num_threads);
   dim3 grid(num_blocks);
@@ -187,7 +185,7 @@ host_integer_decompress(cudaStream_t *streams, uint32_t *gpu_indexes,
   host_extract(streams[0], gpu_indexes[0], extracted_glwe, packed_glwe_in, 0,
                mem_ptr);
 
-  auto num_lwes = mem_ptr->body_count;
+  auto num_lwes = mem_ptr->num_lwes;
 
   // Sample extract
   auto extracted_lwe = mem_ptr->tmp_extracted_lwe;
@@ -199,17 +197,58 @@ host_integer_decompress(cudaStream_t *streams, uint32_t *gpu_indexes,
   /// Apply PBS to apply a LUT, reduce the noise and go from a small LWE
   /// dimension to a big LWE dimension
   auto encryption_params = mem_ptr->encryption_params;
-  auto carry_extract_lut = mem_ptr->carry_extract_lut;
-  execute_pbs_async<Torus>(
-      streams, gpu_indexes, gpu_count, lwe_array_out,
-      carry_extract_lut->lwe_indexes_out, carry_extract_lut->lut_vec,
-      carry_extract_lut->lut_indexes_vec, extracted_lwe,
-      carry_extract_lut->lwe_indexes_in, bsks, carry_extract_lut->buffer,
-      encryption_params.glwe_dimension,
-      compression_params.glwe_dimension * compression_params.polynomial_size,
-      encryption_params.polynomial_size, encryption_params.pbs_base_log,
-      encryption_params.pbs_level, encryption_params.grouping_factor, num_lwes,
-      encryption_params.pbs_type);
+  auto lut = mem_ptr->carry_extract_lut;
+  auto active_gpu_count = get_active_gpu_count(num_lwes, gpu_count);
+  if (active_gpu_count == 1) {
+    execute_pbs_async<Torus>(
+        streams, gpu_indexes, active_gpu_count, lwe_array_out,
+        lut->lwe_indexes_out, lut->lut_vec, lut->lut_indexes_vec, extracted_lwe,
+        lut->lwe_indexes_in, bsks, lut->buffer,
+        encryption_params.glwe_dimension,
+        compression_params.small_lwe_dimension,
+        encryption_params.polynomial_size, encryption_params.pbs_base_log,
+        encryption_params.pbs_level, encryption_params.grouping_factor,
+        num_lwes, encryption_params.pbs_type);
+  } else {
+    /// For multi GPU execution we create vectors of pointers for inputs and
+    /// outputs
+    std::vector<Torus *> lwe_array_in_vec = lut->lwe_array_in_vec;
+    std::vector<Torus *> lwe_after_pbs_vec = lut->lwe_after_pbs_vec;
+    std::vector<Torus *> lwe_trivial_indexes_vec = lut->lwe_trivial_indexes_vec;
+
+    /// Make sure all data that should be on GPU 0 is indeed there
+    cuda_synchronize_stream(streams[0], gpu_indexes[0]);
+
+    /// With multiple GPUs we push to the vectors on each GPU then when we
+    /// gather data to GPU 0 we can copy back to the original indexing
+    multi_gpu_scatter_lwe_async<Torus>(
+        streams, gpu_indexes, active_gpu_count, lwe_array_in_vec, extracted_lwe,
+        lut->h_lwe_indexes_in, lut->using_trivial_lwe_indexes, num_lwes,
+        compression_params.small_lwe_dimension + 1);
+
+    /// Apply PBS
+    execute_pbs_async<Torus>(
+        streams, gpu_indexes, active_gpu_count, lwe_after_pbs_vec,
+        lwe_trivial_indexes_vec, lut->lut_vec, lut->lut_indexes_vec,
+        lwe_array_in_vec, lwe_trivial_indexes_vec, bsks, lut->buffer,
+        encryption_params.glwe_dimension,
+        compression_params.small_lwe_dimension,
+        encryption_params.polynomial_size, encryption_params.pbs_base_log,
+        encryption_params.pbs_level, encryption_params.grouping_factor,
+        num_lwes, encryption_params.pbs_type);
+
+    /// Copy data back to GPU 0 and release vecs
+    multi_gpu_gather_lwe_async<Torus>(streams, gpu_indexes, active_gpu_count,
+                                      lwe_array_out, lwe_after_pbs_vec,
+                                      lut->h_lwe_indexes_out,
+                                      lut->using_trivial_lwe_indexes, num_lwes,
+                                      encryption_params.big_lwe_dimension + 1);
+
+    /// Synchronize all GPUs
+    for (uint i = 0; i < active_gpu_count; i++) {
+      cuda_synchronize_stream(streams[i], gpu_indexes[i]);
+    }
+  }
 }
 
 template <typename Torus>
@@ -227,12 +266,12 @@ __host__ void scratch_cuda_compress_integer_radix_ciphertext_64(
 template <typename Torus>
 __host__ void scratch_cuda_integer_decompress_radix_ciphertext_64(
     cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
-    int_decompression<Torus> **mem_ptr, uint32_t num_lwes,
+    int_decompression<Torus> **mem_ptr, uint32_t num_lwes, uint32_t body_count,
    int_radix_params encryption_params, int_radix_params compression_params,
     uint32_t storage_log_modulus, bool allocate_gpu_memory) {
 
   *mem_ptr = new int_decompression<Torus>(
       streams, gpu_indexes, gpu_count, encryption_params, compression_params,
-      num_lwes, storage_log_modulus, allocate_gpu_memory);
+      num_lwes, body_count, storage_log_modulus, allocate_gpu_memory);
 }
 #endif
diff --git a/backends/tfhe-cuda-backend/src/cuda_bind.rs b/backends/tfhe-cuda-backend/src/cuda_bind.rs
index 740d99bc2f..3ce55b8503 100644
--- a/backends/tfhe-cuda-backend/src/cuda_bind.rs
+++ b/backends/tfhe-cuda-backend/src/cuda_bind.rs
@@ -120,6 +120,7 @@ extern "C" {
         carry_modulus: u32,
         pbs_type: u32,
         storage_log_modulus: u32,
+        bodies_count: u32,
         allocate_gpu_memory: bool,
     );
 
diff --git a/tfhe/src/integer/gpu/ciphertext/compressed_ciphertext_list.rs b/tfhe/src/integer/gpu/ciphertext/compressed_ciphertext_list.rs
index e83157a7ba..8cc1fab697 100644
--- a/tfhe/src/integer/gpu/ciphertext/compressed_ciphertext_list.rs
+++ b/tfhe/src/integer/gpu/ciphertext/compressed_ciphertext_list.rs
@@ -138,79 +138,64 @@ mod tests {
     use super::*;
     use crate::integer::gpu::gen_keys_radix_gpu;
     use crate::integer::ClientKey;
-    use crate::shortint::parameters::list_compression::COMP_PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M64;
-    use crate::shortint::parameters::PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M64;
+    use crate::shortint::parameters::list_compression::COMP_PARAM_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M64;
+    use crate::shortint::parameters::PARAM_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M64;
+    const NB_TESTS: usize = 10;
 
     #[test]
     fn test_gpu_ciphertext_compression() {
-        let cks = ClientKey::new(PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M64);
+        let cks = ClientKey::new(PARAM_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M64);
 
         let private_compression_key =
-            cks.new_compression_private_key(COMP_PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M64);
+            cks.new_compression_private_key(COMP_PARAM_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M64);
 
         let streams = CudaStreams::new_multi_gpu();
 
-        let num_blocks = 4;
+        let num_blocks = 32;
 
         let (radix_cks, _) = gen_keys_radix_gpu(
-            PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M64,
+            PARAM_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M64,
             num_blocks,
             &streams,
         );
-
         let (cuda_compression_key, cuda_decompression_key) =
             radix_cks.new_cuda_compression_decompression_keys(&private_compression_key, &streams);
 
-        let ct1 = radix_cks.encrypt(3_u32);
-        let ct2 = radix_cks.encrypt(2_u32);
-        let ct3 = radix_cks.encrypt_signed(-2);
-        let ct4 = cks.encrypt_bool(true);
-
-        // Copy to GPU
-        let d_ct1 = CudaUnsignedRadixCiphertext::from_radix_ciphertext(&ct1, &streams);
-        let d_ct2 = CudaUnsignedRadixCiphertext::from_radix_ciphertext(&ct2, &streams);
-        let d_ct3 = CudaSignedRadixCiphertext::from_signed_radix_ciphertext(&ct3, &streams);
-        let d_ct4 = CudaBooleanBlock::from_boolean_block(&ct4, &streams);
-
-        let cuda_compressed = CudaCompressedCiphertextListBuilder::new()
-            .push(d_ct1, &streams)
-            .push(d_ct2, &streams)
-            .push(d_ct3, &streams)
-            .push(d_ct4, &streams)
-            .build(&cuda_compression_key, &streams);
-
-        let d_decompressed1 = CudaUnsignedRadixCiphertext {
-            ciphertext: cuda_compressed.get(0, &cuda_decompression_key, &streams),
-        };
-
-        let decompressed1 = d_decompressed1.to_radix_ciphertext(&streams);
-        let decrypted: u32 = radix_cks.decrypt(&decompressed1);
-
-        assert_eq!(decrypted, 3_u32);
-        let d_decompressed2 = CudaUnsignedRadixCiphertext {
-            ciphertext: cuda_compressed.get(1, &cuda_decompression_key, &streams),
-        };
-
-        let decompressed2 = d_decompressed2.to_radix_ciphertext(&streams);
-        let decrypted: u32 = radix_cks.decrypt(&decompressed2);
-
-        assert_eq!(decrypted, 2_u32);
-        let d_decompressed3 = CudaSignedRadixCiphertext {
-            ciphertext: cuda_compressed.get(2, &cuda_decompression_key, &streams),
-        };
-
-        let decompressed3 = d_decompressed3.to_signed_radix_ciphertext(&streams);
-        let decrypted: i32 = radix_cks.decrypt_signed(&decompressed3);
-
-        assert_eq!(decrypted, -2);
-        let d_decompressed4 = CudaBooleanBlock::from_cuda_radix_ciphertext(cuda_compressed.get(
-            3,
-            &cuda_decompression_key,
-            &streams,
-        ));
-
-        let decompressed4 = d_decompressed4.to_boolean_block(&streams);
-        let decrypted = radix_cks.decrypt_bool(&decompressed4);
-
-        assert!(decrypted);
+        for _ in 0..NB_TESTS {
+            let ct1 = radix_cks.encrypt(3_u32);
+            let ct2 = radix_cks.encrypt_signed(-2);
+            let ct3 = radix_cks.encrypt_bool(true);
+
+            // Copy to GPU
+            let d_ct1 = CudaUnsignedRadixCiphertext::from_radix_ciphertext(&ct1, &streams);
+            let d_ct2 = CudaSignedRadixCiphertext::from_signed_radix_ciphertext(&ct2, &streams);
+            let d_ct3 = CudaBooleanBlock::from_boolean_block(&ct3, &streams);
+
+            let cuda_compressed = CudaCompressedCiphertextListBuilder::new()
+                .push(d_ct1, &streams)
+                .push(d_ct2, &streams)
+                .push(d_ct3, &streams)
+                .build(&cuda_compression_key, &streams);
+
+            let d_decompressed1 = CudaUnsignedRadixCiphertext {
+                ciphertext: cuda_compressed.get(0, &cuda_decompression_key, &streams),
+            };
+            let decompressed1 = d_decompressed1.to_radix_ciphertext(&streams);
+            let decrypted: u32 = radix_cks.decrypt(&decompressed1);
+            assert_eq!(decrypted, 3_u32);
+
+            let d_decompressed2 = CudaSignedRadixCiphertext {
+                ciphertext: cuda_compressed.get(1, &cuda_decompression_key, &streams),
+            };
+            let decompressed2 = d_decompressed2.to_signed_radix_ciphertext(&streams);
+            let decrypted: i32 = radix_cks.decrypt_signed(&decompressed2);
+            assert_eq!(decrypted, -2);
+
+            let d_decompressed3 = CudaBooleanBlock::from_cuda_radix_ciphertext(
+                cuda_compressed.get(2, &cuda_decompression_key, &streams),
+            );
+            let decompressed3 = d_decompressed3.to_boolean_block(&streams);
+            let decrypted = radix_cks.decrypt_bool(&decompressed3);
+            assert!(decrypted);
+        }
     }
 }
diff --git a/tfhe/src/integer/gpu/list_compression/server_keys.rs b/tfhe/src/integer/gpu/list_compression/server_keys.rs
index 6b34256446..bd7d2e12d5 100644
--- a/tfhe/src/integer/gpu/list_compression/server_keys.rs
+++ b/tfhe/src/integer/gpu/list_compression/server_keys.rs
@@ -30,6 +30,7 @@ pub struct CudaDecompressionKey {
 pub struct CudaPackedGlweCiphertext {
     pub glwe_ciphertext_list: CudaGlweCiphertextList<u64>,
     pub block_info: Vec<CudaBlockInfo>,
+    pub bodies_count: usize,
     pub storage_log_modulus: CiphertextModulusLog,
 }
 
@@ -161,6 +162,7 @@ impl CudaCompressionKey {
         CudaPackedGlweCiphertext {
             glwe_ciphertext_list: output_glwe,
             block_info: info,
+            bodies_count: num_lwes,
             storage_log_modulus: self.storage_log_modulus,
         }
     }
@@ -184,6 +186,7 @@ impl CudaDecompressionKey {
         let compression_glwe_dimension = glwe_ciphertext_list.glwe_dimension();
         let compression_polynomial_size = glwe_ciphertext_list.polynomial_size();
         let lwe_ciphertext_count = LweCiphertextCount(indexes_array.len());
+
         let message_modulus = self.parameters.message_modulus();
         let carry_modulus = self.parameters.carry_modulus();
         let ciphertext_modulus = self.parameters.ciphertext_modulus();
@@ -210,6 +213,7 @@ impl CudaDecompressionKey {
                         &mut output_lwe.0.d_vec,
                         &glwe_ciphertext_list.0.d_vec,
                         &bsk.d_vec,
+                        packed_list.bodies_count as u32,
                         message_modulus,
                         carry_modulus,
                         encryption_glwe_dimension,
@@ -244,7 +248,7 @@ impl CudaDecompressionKey {
                 }
             }
             CudaBootstrappingKey::MultiBit(_) => {
-                panic! {"Compression is currently not compatible with Multi Bit PBS"}
+                panic! {"Compression is currently not compatible with Multi-Bit PBS"}
             }
         }
     }
diff --git a/tfhe/src/integer/gpu/mod.rs b/tfhe/src/integer/gpu/mod.rs
index 7aa2a705b1..361aca7129 100644
--- a/tfhe/src/integer/gpu/mod.rs
+++ b/tfhe/src/integer/gpu/mod.rs
@@ -361,6 +361,7 @@ pub unsafe fn decompress_integer_radix_async<T: UnsignedInteger, B: Numeric>(
     lwe_array_out: &mut CudaVec<T>,
     glwe_in: &CudaVec<T>,
     bootstrapping_key: &CudaVec<B>,
+    bodies_count: u32,
     message_modulus: MessageModulus,
     carry_modulus: CarryModulus,
     encryption_glwe_dimension: GlweDimension,
@@ -407,6 +408,7 @@ pub unsafe fn decompress_integer_radix_async<T: UnsignedInteger, B: Numeric>(
             carry_modulus.0 as u32,
             PBSType::Classical as u32,
             storage_log_modulus,
+            bodies_count,
             true,
         );
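
Note on the host_pack hunk: the packed length is the number of kept coefficients (the GLWE mask plus one body per compressed LWE) times storage_log_modulus bits, rounded up to whole Torus words, as the div_ceil comment says. A minimal host-side sketch of that arithmetic; the sizes below are illustrative placeholders, not a real parameter set:

    #include <cstdint>
    #include <cstdio>

    int main() {
      // Illustrative sizes only, not taken from any TFHE-rs parameter set.
      const uint64_t glwe_dimension = 1;
      const uint64_t polynomial_size = 1024;
      const uint64_t body_count = 32;  // one stored body per compressed LWE
      const uint64_t log_modulus = 11; // storage_log_modulus: bits kept per coefficient
      const uint64_t nbits = sizeof(uint64_t) * 8;

      // Only the GLWE mask plus body_count body coefficients are packed.
      const uint64_t in_len = glwe_dimension * polynomial_size + body_count;
      const uint64_t number_bits_to_pack = in_len * log_modulus;

      // number_bits_to_pack.div_ceil(nbits): a partially filled last word
      // still needs a full word of storage.
      const uint64_t len = (number_bits_to_pack + nbits - 1) / nbits;

      printf("%lu coefficients -> %lu bits -> %lu words\n", (unsigned long)in_len,
             (unsigned long)number_bits_to_pack, (unsigned long)len);
      return 0;
    }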
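Note on the host_extract hunk, which is the heart of the edge-case fix: initial_out_len previously added body_count * body_count instead of body_count, and the zeroed slice covered a full polynomial_size starting right after the mask. With the corrected length, only the (polynomial_size - body_count) unused body slots need clearing. A standalone sketch of the corrected layout, assuming illustrative sizes and using std::fill in place of cuda_memset_async:

    #include <algorithm>
    #include <cassert>
    #include <cstdint>
    #include <vector>

    int main() {
      // Illustrative sizes only.
      const size_t glwe_dimension = 1;
      const size_t polynomial_size = 1024;
      const size_t num_lwes = 32;                  // radix blocks in the list
      const size_t lwe_per_glwe = polynomial_size; // capacity of one packed GLWE
      const size_t body_count = std::min(num_lwes, lwe_per_glwe);

      // One unpacked GLWE: glwe_dimension mask polynomials plus one body.
      std::vector<uint64_t> glwe_out((glwe_dimension + 1) * polynomial_size, 0xFF);

      // Coefficients actually recovered from packed storage: the mask plus
      // body_count bodies (the old code computed body_count * body_count here).
      const size_t initial_out_len = glwe_dimension * polynomial_size + body_count;

      // Zero exactly the unused tail of the body polynomial.
      std::fill(glwe_out.begin() + initial_out_len, glwe_out.end(), 0);

      assert(initial_out_len + (polynomial_size - body_count) == glwe_out.size());
      return 0;
    }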
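Note on the new else branch in host_integer_decompress: it follows the backend's usual multi-GPU shape, scatter the extracted LWEs from GPU 0, run the PBS on each active GPU, gather the results back to GPU 0, then synchronize every active stream. A self-contained CUDA sketch of that scatter/compute/gather pattern under simplified assumptions (plain cudaMemcpyPeerAsync and a stand-in kernel; the real code goes through lut->lwe_array_in_vec and helpers such as multi_gpu_scatter_lwe_async):

    #include <cuda_runtime.h>
    #include <algorithm>
    #include <cstdio>
    #include <vector>

    __global__ void fake_pbs(float *data, int n) {
      int i = blockIdx.x * blockDim.x + threadIdx.x;
      if (i < n) data[i] += 1.0f; // stand-in for the lookup-table bootstrap
    }

    int main() {
      int gpu_count = 0;
      cudaGetDeviceCount(&gpu_count);
      if (gpu_count == 0) return 0;
      const int num_lwes = 1024;
      const int chunk = (num_lwes + gpu_count - 1) / gpu_count;

      // Input lives on GPU 0, like extracted_lwe after sample extraction.
      cudaSetDevice(0);
      float *src = nullptr;
      cudaMalloc(&src, num_lwes * sizeof(float));
      cudaMemset(src, 0, num_lwes * sizeof(float));

      std::vector<float *> bufs(gpu_count);
      std::vector<cudaStream_t> streams(gpu_count);
      for (int g = 0; g < gpu_count; g++) {
        cudaSetDevice(g);
        cudaStreamCreate(&streams[g]);
        cudaMalloc(&bufs[g], chunk * sizeof(float));
      }

      // Scatter chunks from GPU 0, run the per-GPU kernel, gather back.
      for (int g = 0; g < gpu_count; g++) {
        int offset = g * chunk;
        int n = std::min(chunk, num_lwes - offset);
        if (n <= 0) break;
        cudaSetDevice(g);
        cudaMemcpyPeerAsync(bufs[g], g, src + offset, 0, n * sizeof(float),
                            streams[g]);
        fake_pbs<<<(n + 255) / 256, 256, 0, streams[g]>>>(bufs[g], n);
        cudaMemcpyPeerAsync(src + offset, 0, bufs[g], g, n * sizeof(float),
                            streams[g]);
      }

      // Like the patch: synchronize every active GPU before reusing the result.
      for (int g = 0; g < gpu_count; g++) {
        cudaSetDevice(g);
        cudaStreamSynchronize(streams[g]);
      }
      printf("done on %d GPU(s)\n", gpu_count);
      return 0;
    }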