From cd28cff1888dc4b5e1e504426ed51987aaeb4233 Mon Sep 17 00:00:00 2001
From: Guillermo Oyarzun <guillermo.oyarzun@zama.ai>
Date: Mon, 13 Jan 2025 16:26:52 +0100
Subject: [PATCH] feat(gpu): implement fhe rand on gpu

---
 .../cuda/include/linear_algebra.h             |   5 +
 .../cuda/src/linearalgebra/addition.cu        |  38 +
 .../cuda/src/linearalgebra/addition.cuh       |  36 +
 backends/tfhe-cuda-backend/src/bindings.rs    |  11 +
 tfhe/benches/integer/bench.rs                 |  83 +-
 tfhe/src/core_crypto/gpu/mod.rs               |  25 +
 tfhe/src/high_level_api/booleans/oprf.rs      |  34 +-
 tfhe/src/high_level_api/integers/oprf.rs      |  69 +-
 tfhe/src/integer/gpu/server_key/radix/mod.rs  |  28 +-
 tfhe/src/integer/gpu/server_key/radix/oprf.rs | 736 ++++++++++++++++++
 .../server_key/radix/vector_comparisons.rs    |  13 +-
 .../gpu/server_key/radix/vector_find.rs       |  28 +-
 tfhe/src/shortint/engine/mod.rs               |  13 +-
 tfhe/src/shortint/oprf.rs                     |  55 +-
 tfhe/src/shortint/server_key/mod.rs           |   7 +-
 15 files changed, 1108 insertions(+), 73 deletions(-)
 create mode 100644 tfhe/src/integer/gpu/server_key/radix/oprf.rs
diff --git a/backends/tfhe-cuda-backend/cuda/include/linear_algebra.h b/backends/tfhe-cuda-backend/cuda/include/linear_algebra.h
index ddc9a2a508..69394e4af3 100644
--- a/backends/tfhe-cuda-backend/cuda/include/linear_algebra.h
+++ b/backends/tfhe-cuda-backend/cuda/include/linear_algebra.h
@@ -44,6 +44,11 @@ void cuda_mult_lwe_ciphertext_vector_cleartext_vector_64(
     void *stream, uint32_t gpu_index, void *lwe_array_out,
     void const *lwe_array_in, void const *cleartext_array_in,
     uint32_t input_lwe_dimension, uint32_t input_lwe_ciphertext_count);
+void cuda_add_lwe_ciphertext_vector_plaintext_64(
+    void *stream, uint32_t gpu_index, void *lwe_array_out,
+    void const *lwe_array_in, const uint64_t plaintext_in,
+    const uint32_t input_lwe_dimension,
+    const uint32_t input_lwe_ciphertext_count);
 }
 
 #endif // CUDA_LINALG_H_
diff --git a/backends/tfhe-cuda-backend/cuda/src/linearalgebra/addition.cu b/backends/tfhe-cuda-backend/cuda/src/linearalgebra/addition.cu
index 03ded74b46..6e38bdebd8 100644
--- a/backends/tfhe-cuda-backend/cuda/src/linearalgebra/addition.cu
+++ b/backends/tfhe-cuda-backend/cuda/src/linearalgebra/addition.cu
@@ -114,3 +114,41 @@ void cuda_add_lwe_ciphertext_vector_plaintext_vector_64(
       static_cast<const uint64_t *>(plaintext_array_in), input_lwe_dimension,
       input_lwe_ciphertext_count);
 }
+
+/*
+ * Perform the addition of a u64 input LWE ciphertext vector with a u64 input
+ * plaintext scalar.
+ * - `stream` is a void pointer to the Cuda stream to be used in the kernel
+ * launch
+ * - `gpu_index` is the index of the GPU to be used in the kernel launch
+ * - `lwe_array_out` is an array of size
+ * `(input_lwe_dimension + 1) * input_lwe_ciphertext_count` that should have
+ * been allocated on the GPU before calling this function, and that will hold
+ * the result of the computation.
+ * - `lwe_array_in` is the LWE ciphertext vector used as input, it should have
+ * been allocated and initialized before calling this function. It has the same
+ * size as the output array.
+ * - `plaintext_in` is the plaintext used as input.
+ * - `input_lwe_dimension` is the number of mask elements in the input and
+ * output LWE ciphertext vectors
+ * - `input_lwe_ciphertext_count` is the number of ciphertexts contained in the
+ * input LWE ciphertext vector, as well as in the output.
+ *
+ * The same input plaintext is added to the body of the
+ * LWE ciphertexts in the LWE ciphertext vector. The result of the
+ * operation is stored in the output LWE ciphertext vector. The two input
+ * vectors are unchanged. This function is a wrapper to a device function that
+ * performs the operation on the GPU.
+ */
+void cuda_add_lwe_ciphertext_vector_plaintext_64(
+    void *stream, uint32_t gpu_index, void *lwe_array_out,
+    void const *lwe_array_in, const uint64_t plaintext_in,
+    const uint32_t input_lwe_dimension,
+    const uint32_t input_lwe_ciphertext_count) {
+
+  host_addition_plaintext_scalar<uint64_t>(
+      static_cast<cudaStream_t>(stream), gpu_index,
+      static_cast<uint64_t *>(lwe_array_out),
+      static_cast<const uint64_t *>(lwe_array_in), plaintext_in,
+      input_lwe_dimension, input_lwe_ciphertext_count);
+}
diff --git a/backends/tfhe-cuda-backend/cuda/src/linearalgebra/addition.cuh b/backends/tfhe-cuda-backend/cuda/src/linearalgebra/addition.cuh
index 3401cdadd2..57f1a0ee99 100644
--- a/backends/tfhe-cuda-backend/cuda/src/linearalgebra/addition.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/linearalgebra/addition.cuh
@@ -27,6 +27,21 @@ plaintext_addition(T *output, T const *lwe_input, T const *plaintext_input,
   }
 }
 
+template <typename T>
+__global__ void plaintext_addition_scalar(T *output, T const *lwe_input,
+                                          const T plaintext_input,
+                                          const uint32_t input_lwe_dimension,
+                                          const uint32_t num_entries) {
+
+  int tid = threadIdx.x;
+  int lwe_index = blockIdx.x * blockDim.x + tid;
+  if (lwe_index < num_entries) {
+    int index = lwe_index * (input_lwe_dimension + 1) + input_lwe_dimension;
+    // Here we take advantage of the wrapping behaviour of uint
+    output[index] = lwe_input[index] + plaintext_input;
+  }
+}
+
 template <typename T>
 __host__ void
 host_addition_plaintext(cudaStream_t stream, uint32_t gpu_index, T *output,
@@ -48,6 +63,27 @@ host_addition_plaintext(cudaStream_t stream, uint32_t gpu_index, T *output,
   check_cuda_error(cudaGetLastError());
 }
 
+template <typename T>
+__host__ void host_addition_plaintext_scalar(
+    cudaStream_t stream, uint32_t gpu_index, T *output, T const *lwe_input,
+    const T plaintext_input, const uint32_t lwe_dimension,
+    const uint32_t lwe_ciphertext_count) {
+
+  cudaSetDevice(gpu_index);
+  int num_blocks = 0, num_threads = 0;
+  int num_entries = lwe_ciphertext_count;
+  getNumBlocksAndThreads(num_entries, 512, num_blocks, num_threads);
+  dim3 grid(num_blocks, 1, 1);
+  dim3 thds(num_threads, 1, 1);
+
+  cuda_memcpy_async_gpu_to_gpu(
+      output, lwe_input, (lwe_dimension + 1) * lwe_ciphertext_count * sizeof(T),
+      stream, gpu_index);
+  plaintext_addition_scalar<T><<<grid, thds, 0, stream>>>(
+      output, lwe_input, plaintext_input, lwe_dimension, num_entries);
+  check_cuda_error(cudaGetLastError());
+}
+
 template <typename T>
 __global__ void addition(T *output, T const *input_1, T const *input_2,
                          uint32_t num_entries) {
diff --git a/backends/tfhe-cuda-backend/src/bindings.rs b/backends/tfhe-cuda-backend/src/bindings.rs
index 983a409338..fa16da0136 100644
--- a/backends/tfhe-cuda-backend/src/bindings.rs
+++ b/backends/tfhe-cuda-backend/src/bindings.rs
@@ -1345,6 +1345,17 @@ extern "C" {
         input_lwe_ciphertext_count: u32,
     );
 }
+extern "C" {
+    pub fn cuda_add_lwe_ciphertext_vector_plaintext_64(
+        stream: *mut ffi::c_void,
+        gpu_index: u32,
+        lwe_array_out: *mut ffi::c_void,
+        lwe_array_in: *const ffi::c_void,
+        plaintext_in: u64,
+        input_lwe_dimension: u32,
+        input_lwe_ciphertext_count: u32,
+    );
+}
 extern "C" {
     pub fn cuda_fourier_polynomial_mul(
         stream: *mut ffi::c_void,
diff --git a/tfhe/benches/integer/bench.rs b/tfhe/benches/integer/bench.rs
index d6ead085b7..8fb1200035 100644
--- a/tfhe/benches/integer/bench.rs
+++ b/tfhe/benches/integer/bench.rs
@@ -1307,11 +1307,12 @@ define_server_key_bench_default_fn!(
 #[cfg(feature = "gpu")]
 mod cuda {
     use super::*;
-    use criterion::criterion_group;
+    use criterion::{black_box, criterion_group};
     use tfhe::core_crypto::gpu::CudaStreams;
     use tfhe::integer::gpu::ciphertext::boolean_value::CudaBooleanBlock;
     use tfhe::integer::gpu::ciphertext::CudaUnsignedRadixCiphertext;
     use tfhe::integer::gpu::server_key::CudaServerKey;
+    use tfhe_csprng::seeders::Seed;
 
     fn bench_cuda_server_key_unary_function_clean_inputs<F>(
         c: &mut Criterion,
@@ -1731,6 +1732,84 @@ mod cuda {
         bench_group.finish()
     }
 
+    pub fn cuda_unsigned_oprf(c: &mut Criterion) {
+        let bench_name = "integer::cuda::unsigned_oprf";
+
+        let mut bench_group = c.benchmark_group(bench_name);
+        bench_group
+            .sample_size(15)
+            .measurement_time(std::time::Duration::from_secs(30));
+
+        let streams = CudaStreams::new_multi_gpu();
+
+        for (param, num_block, bit_size) in ParamsAndNumBlocksIter::default() {
+            let param_name = param.name();
+
+            let bench_id;
+
+            match BENCH_TYPE.get().unwrap() {
+                BenchmarkType::Latency => {
+                    bench_id = format!("{bench_name}::{param_name}::{bit_size}_bits");
+                    bench_group.bench_function(&bench_id, |b| {
+                        let (cks, _cpu_sks) =
+                            KEY_CACHE.get_from_params(param, IntegerKeyKind::Radix);
+                        let gpu_sks = CudaServerKey::new(&cks, &streams);
+
+                        b.iter(|| {
+                            _ = black_box(
+                                gpu_sks
+                                    .par_generate_oblivious_pseudo_random_unsigned_integer_bounded(
+                                        Seed(0),
+                                        bit_size as u64,
+                                        num_block as u64,
+                                        &streams,
+                                    ),
+                            );
+                        })
+                    });
+                }
+                BenchmarkType::Throughput => {
+                    bench_id = format!("{bench_name}::throughput::{param_name}::{bit_size}_bits");
+                    let elements = throughput_num_threads(num_block);
+                    bench_group.throughput(Throughput::Elements(elements));
+
+                    bench_group.bench_function(&bench_id, |b| {
+                        let (cks, _cpu_sks) =
+                            KEY_CACHE.get_from_params(param, IntegerKeyKind::Radix);
+                        let gpu_sks = CudaServerKey::new(&cks, &streams);
+
+                        b.iter(|| {
+                            (0..elements).into_par_iter().for_each(|i| {
+                                let selected_gpu =
+                                    streams.gpu_indexes[i as usize % streams.gpu_indexes.len()];
+                                let stream = CudaStreams::new_single_gpu(selected_gpu);
+                                gpu_sks
+                                    .par_generate_oblivious_pseudo_random_unsigned_integer_bounded(
+                                        Seed(0),
+                                        bit_size as u64,
+                                        num_block as u64,
+                                        &stream,
+                                    );
+                            })
+                        })
+                    });
+                }
+            }
+
+            write_to_json::<u64, _>(
+                &bench_id,
+                param,
+                param.name(),
+                "oprf",
+                &OperatorType::Atomic,
+                bit_size as u32,
+                vec![param.message_modulus().0.ilog2(); num_block],
+            );
+        }
+
+        bench_group.finish()
+    }
+
     macro_rules! define_cuda_server_key_bench_clean_input_unary_fn (
         (method_name: $server_key_method:ident, display_name:$name:ident) => {
             ::paste::paste!{
@@ -2376,6 +2455,7 @@ mod cuda {
         cuda_trailing_zeros,
         cuda_trailing_ones,
         cuda_ilog2,
+        cuda_unsigned_oprf,
     );
 
     criterion_group!(
@@ -2395,6 +2475,7 @@ mod cuda {
         cuda_scalar_mul,
         cuda_scalar_div,
         cuda_scalar_rem,
+        cuda_unsigned_oprf,
     );
 
     criterion_group!(
diff --git a/tfhe/src/core_crypto/gpu/mod.rs b/tfhe/src/core_crypto/gpu/mod.rs
index e5c5271017..7ed226c3f3 100644
--- a/tfhe/src/core_crypto/gpu/mod.rs
+++ b/tfhe/src/core_crypto/gpu/mod.rs
@@ -481,6 +481,31 @@ pub unsafe fn add_lwe_ciphertext_vector_plaintext_vector_async<T: UnsignedIntege
     );
 }
 
+/// Addition of a vector of LWE ciphertexts with a plaintext scalar
+///
+/// # Safety
+///
+/// [CudaStreams::synchronize] __must__ be called as soon as synchronization is
+/// required
+pub unsafe fn add_lwe_ciphertext_vector_plaintext_scalar_async<T: UnsignedInteger>(
+    streams: &CudaStreams,
+    lwe_array_out: &mut CudaVec<T>,
+    lwe_array_in: &CudaVec<T>,
+    plaintext_in: u64,
+    lwe_dimension: LweDimension,
+    num_samples: u32,
+) {
+    cuda_add_lwe_ciphertext_vector_plaintext_64(
+        streams.ptr[0],
+        streams.gpu_indexes[0].0,
+        lwe_array_out.as_mut_c_ptr(0),
+        lwe_array_in.as_c_ptr(0),
+        plaintext_in,
+        lwe_dimension.0 as u32,
+        num_samples,
+    );
+}
+
 /// Assigned addition of a vector of LWE ciphertexts with a vector of plaintexts
 ///
 /// # Safety
diff --git a/tfhe/src/high_level_api/booleans/oprf.rs b/tfhe/src/high_level_api/booleans/oprf.rs
index 4546cfda84..942022010e 100644
--- a/tfhe/src/high_level_api/booleans/oprf.rs
+++ b/tfhe/src/high_level_api/booleans/oprf.rs
@@ -1,6 +1,12 @@
-use super::FheBool;
+use super::{FheBool, InnerBoolean};
 use crate::high_level_api::global_state;
+#[cfg(feature = "gpu")]
+use crate::high_level_api::global_state::with_thread_local_cuda_streams;
 use crate::high_level_api::keys::InternalServerKey;
+#[cfg(feature = "gpu")]
+use crate::integer::gpu::ciphertext::boolean_value::CudaBooleanBlock;
+#[cfg(feature = "gpu")]
+use crate::integer::gpu::ciphertext::CudaUnsignedRadixCiphertext;
 use crate::integer::BooleanBlock;
 use tfhe_csprng::seeders::Seed;
 
@@ -24,16 +30,28 @@ impl FheBool {
     /// let dec_result: bool = ct_res.decrypt(&client_key);
     /// ```
     pub fn generate_oblivious_pseudo_random(seed: Seed) -> Self {
-        global_state::with_internal_keys(|key| match key {
+        let (ciphertext, tag) = global_state::with_internal_keys(|key| match key {
             InternalServerKey::Cpu(key) => {
                 let ct = key.pbs_key().key.generate_oblivious_pseudo_random(seed, 1);
-
-                Self::new(BooleanBlock(ct), key.tag.clone())
+                (
+                    InnerBoolean::Cpu(BooleanBlock::new_unchecked(ct)),
+                    key.tag.clone(),
+                )
             }
             #[cfg(feature = "gpu")]
-            InternalServerKey::Cuda(_) => {
-                todo!("Cuda devices do not yet support oblivious pseudo random generation")
-            }
-        })
+            InternalServerKey::Cuda(cuda_key) => with_thread_local_cuda_streams(|streams| {
+                let d_ct: CudaUnsignedRadixCiphertext = cuda_key
+                    .key
+                    .key
+                    .generate_oblivious_pseudo_random(seed, 1, streams);
+                (
+                    InnerBoolean::Cuda(CudaBooleanBlock::from_cuda_radix_ciphertext(
+                        d_ct.ciphertext,
+                    )),
+                    cuda_key.tag.clone(),
+                )
+            }),
+        });
+        Self::new(ciphertext, tag)
     }
 }
diff --git a/tfhe/src/high_level_api/integers/oprf.rs b/tfhe/src/high_level_api/integers/oprf.rs
index 4ee3ec1049..09ee7f375a 100644
--- a/tfhe/src/high_level_api/integers/oprf.rs
+++ b/tfhe/src/high_level_api/integers/oprf.rs
@@ -1,8 +1,11 @@
-use super::{FheIntId, FheUintId};
+use super::{FheIntId, FheUint, FheUintId};
 use crate::high_level_api::global_state;
+#[cfg(feature = "gpu")]
+use crate::high_level_api::global_state::with_thread_local_cuda_streams;
 use crate::high_level_api::keys::InternalServerKey;
-use crate::{FheInt, FheUint, Seed};
-
+#[cfg(feature = "gpu")]
+use crate::integer::gpu::ciphertext::{CudaSignedRadixCiphertext, CudaUnsignedRadixCiphertext};
+use crate::{FheInt, Seed};
 impl<Id: FheUintId> FheUint<Id> {
     /// Generates an encrypted unsigned integer
     /// taken uniformly in its full range using the given seed.
@@ -35,9 +38,18 @@ impl<Id: FheUintId> FheUint<Id> {
                 Self::new(ct, key.tag.clone())
             }
             #[cfg(feature = "gpu")]
-            InternalServerKey::Cuda(_) => {
-                todo!("Cuda devices do not yet support oblivious pseudo random generation")
-            }
+            InternalServerKey::Cuda(cuda_key) => with_thread_local_cuda_streams(|streams| {
+                let d_ct: CudaUnsignedRadixCiphertext = cuda_key
+                    .key
+                    .key
+                    .par_generate_oblivious_pseudo_random_unsigned_integer(
+                        seed,
+                        Id::num_blocks(cuda_key.message_modulus()) as u64,
+                        streams,
+                    );
+
+                Self::new(d_ct, cuda_key.tag.clone())
+            }),
         })
     }
     /// Generates an encrypted `num_block` blocks unsigned integer
@@ -75,9 +87,18 @@ impl<Id: FheUintId> FheUint<Id> {
                 Self::new(ct, key.tag.clone())
             }
             #[cfg(feature = "gpu")]
-            InternalServerKey::Cuda(_) => {
-                todo!("Cuda devices do not yet support oblivious pseudo random generation")
-            }
+            InternalServerKey::Cuda(cuda_key) => with_thread_local_cuda_streams(|streams| {
+                let d_ct: CudaUnsignedRadixCiphertext = cuda_key
+                    .key
+                    .key
+                    .par_generate_oblivious_pseudo_random_unsigned_integer_bounded(
+                        seed,
+                        random_bits_count,
+                        Id::num_blocks(cuda_key.message_modulus()) as u64,
+                        streams,
+                    );
+                Self::new(d_ct, cuda_key.tag.clone())
+            }),
         })
     }
 }
@@ -115,9 +136,18 @@ impl<Id: FheIntId> FheInt<Id> {
                 Self::new(ct, key.tag.clone())
             }
             #[cfg(feature = "gpu")]
-            InternalServerKey::Cuda(_) => {
-                todo!("Cuda devices do not yet support oblivious pseudo random generation")
-            }
+            InternalServerKey::Cuda(cuda_key) => with_thread_local_cuda_streams(|streams| {
+                let d_ct: CudaSignedRadixCiphertext = cuda_key
+                    .key
+                    .key
+                    .par_generate_oblivious_pseudo_random_signed_integer(
+                        seed,
+                        Id::num_blocks(cuda_key.message_modulus()) as u64,
+                        streams,
+                    );
+
+                Self::new(d_ct, cuda_key.tag.clone())
+            }),
         })
     }
 
@@ -157,9 +187,18 @@ impl<Id: FheIntId> FheInt<Id> {
                 Self::new(ct, key.tag.clone())
             }
             #[cfg(feature = "gpu")]
-            InternalServerKey::Cuda(_) => {
-                todo!("Cuda devices do not yet support oblivious pseudo random generation")
-            }
+            InternalServerKey::Cuda(cuda_key) => with_thread_local_cuda_streams(|streams| {
+                let d_ct: CudaSignedRadixCiphertext = cuda_key
+                    .key
+                    .key
+                    .par_generate_oblivious_pseudo_random_signed_integer_bounded(
+                        seed,
+                        random_bits_count,
+                        Id::num_blocks(cuda_key.message_modulus()) as u64,
+                        streams,
+                    );
+                Self::new(d_ct, cuda_key.tag.clone())
+            }),
         })
     }
 }
diff --git a/tfhe/src/integer/gpu/server_key/radix/mod.rs b/tfhe/src/integer/gpu/server_key/radix/mod.rs
index ccb49e3dfc..6c8031da2f 100644
--- a/tfhe/src/integer/gpu/server_key/radix/mod.rs
+++ b/tfhe/src/integer/gpu/server_key/radix/mod.rs
@@ -20,7 +20,9 @@ use crate::integer::gpu::{
 };
 use crate::integer::server_key::radix_parallel::OutputFlag;
 use crate::shortint::ciphertext::{Degree, NoiseLevel};
-use crate::shortint::engine::{fill_accumulator, fill_many_lut_accumulator};
+use crate::shortint::engine::{
+    fill_accumulator, fill_accumulator_no_encoding, fill_many_lut_accumulator,
+};
 use crate::shortint::server_key::{
     BivariateLookupTableOwned, LookupTableOwned, ManyLookupTableOwned,
 };
@@ -36,6 +38,7 @@ mod even_odd;
 mod ilog2;
 mod mul;
 mod neg;
+mod oprf;
 mod rotate;
 mod scalar_add;
 mod scalar_bitwise_op;
@@ -805,6 +808,29 @@ impl CudaServerKey {
             degree: Degree::new(max_value),
         }
     }
+    pub(crate) fn generate_lookup_table_no_encode<F>(&self, f: F) -> LookupTableOwned
+    where
+        F: Fn(u64) -> u64,
+    {
+        let (glwe_size, polynomial_size) = match &self.bootstrapping_key {
+            CudaBootstrappingKey::Classic(d_bsk) => {
+                (d_bsk.glwe_dimension.to_glwe_size(), d_bsk.polynomial_size)
+            }
+            CudaBootstrappingKey::MultiBit(d_bsk) => {
+                (d_bsk.glwe_dimension.to_glwe_size(), d_bsk.polynomial_size)
+            }
+        };
+        let mut acc = GlweCiphertext::new(0, glwe_size, polynomial_size, self.ciphertext_modulus);
+
+        fill_accumulator_no_encoding(&mut acc, polynomial_size, glwe_size, f);
+
+        LookupTableOwned {
+            acc,
+            // We should not rely on the degree in this case
+            // The degree should be set manually on the outputs of PBS by this LUT
+            degree: Degree::new(self.message_modulus.0 * self.carry_modulus.0 * 2),
+        }
+    }
 
     pub fn generate_many_lookup_table(
         &self,
diff --git a/tfhe/src/integer/gpu/server_key/radix/oprf.rs b/tfhe/src/integer/gpu/server_key/radix/oprf.rs
new file mode 100644
index 0000000000..0ccc94cd99
--- /dev/null
+++ b/tfhe/src/integer/gpu/server_key/radix/oprf.rs
@@ -0,0 +1,736 @@
+use crate::core_crypto::gpu::CudaStreams;
+use crate::integer::gpu::ciphertext::{
+    CudaIntegerRadixCiphertext, CudaSignedRadixCiphertext, CudaUnsignedRadixCiphertext,
+};
+use crate::integer::gpu::server_key::{CudaBootstrappingKey, CudaServerKey};
+
+use crate::core_crypto::commons::generators::DeterministicSeeder;
+use crate::core_crypto::prelude::DefaultRandomGenerator;
+use rayon::iter::{IndexedParallelIterator, IntoParallelIterator, ParallelIterator};
+
+use crate::shortint::oprf::create_random_from_seed_modulus_switched;
+use crate::shortint::server_key::LookupTableOwned;
+
+pub use tfhe_csprng::seeders::{Seed, Seeder};
+
+use crate::core_crypto::gpu::{
+    cuda_multi_bit_programmable_bootstrap_lwe_ciphertext,
+    cuda_programmable_bootstrap_lwe_ciphertext,
+};
+
+use crate::core_crypto::commons::numeric::Numeric;
+use crate::core_crypto::gpu::add_lwe_ciphertext_vector_plaintext_scalar_async;
+use crate::core_crypto::gpu::glwe_ciphertext_list::CudaGlweCiphertextList;
+use crate::core_crypto::prelude::CastInto;
+use crate::integer::gpu::server_key::radix::{CudaLweCiphertextList, LweCiphertextCount};
+use crate::integer::gpu::CudaVec;
+use itertools::Itertools;
+impl CudaServerKey {
+    /// Generates an encrypted `num_block` blocks unsigned integer
+    /// taken uniformly in its full range using the given seed.
+    /// The encryted value is oblivious to the server.
+    /// It can be useful to make server random generation deterministic.
+    ///
+    /// ```rust
+    /// use tfhe::core_crypto::gpu::CudaStreams;
+    /// use tfhe::core_crypto::gpu::vec::GpuIndex;
+    /// use tfhe::integer::gpu::gen_keys_gpu;
+    /// use tfhe::shortint::parameters::PARAM_GPU_MULTI_BIT_GROUP_3_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M64;
+    /// use tfhe::Seed;
+    ///
+    /// let size = 4;
+    /// let gpu_index = 0;
+    /// let streams = CudaStreams::new_single_gpu(GpuIndex(gpu_index));
+    ///
+    /// // Generate the client key and the server key:
+    /// let (cks, sks) = gen_keys_gpu(PARAM_GPU_MULTI_BIT_GROUP_3_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M64, &streams);
+    ///
+    /// let d_ct_res = sks.par_generate_oblivious_pseudo_random_unsigned_integer(Seed(0), size as u64, &streams);
+    /// let ct_res = d_ct_res.to_radix_ciphertext(&streams);
+    /// // Decrypt:
+    /// let dec_result: u64 = cks.decrypt_radix(&ct_res);
+    ///
+    /// assert!(dec_result < 1 << (2 * size));
+    /// ```
+    pub fn par_generate_oblivious_pseudo_random_unsigned_integer(
+        &self,
+        seed: Seed,
+        num_blocks: u64,
+        streams: &CudaStreams,
+    ) -> CudaUnsignedRadixCiphertext {
+        assert!(self.message_modulus.0.is_power_of_two());
+        let range_log_size = self.message_modulus.0.ilog2() as u64 * num_blocks;
+
+        let random_bits_count = range_log_size;
+
+        assert!(self.message_modulus.0.is_power_of_two());
+        let mut streams_vector = Vec::<CudaStreams>::with_capacity(num_blocks as usize);
+        for _ in 0..num_blocks {
+            streams_vector.push(CudaStreams::new_single_gpu(streams.gpu_indexes[0]));
+        }
+
+        let message_bits_count = self.message_modulus.0.ilog2() as u64;
+
+        let mut deterministic_seeder = DeterministicSeeder::<DefaultRandomGenerator>::new(seed);
+
+        let seeds: Vec<Seed> = (0..num_blocks)
+            .map(|_| deterministic_seeder.seed())
+            .collect();
+
+        let blocks = seeds
+            .into_par_iter()
+            .enumerate()
+            .map(|(i, seed)| {
+                let stream_index = i;
+                let i = i as u64;
+                if i * message_bits_count < random_bits_count {
+                    // if we generate 5 bits of noise in n blocks of 2 bits, the third (i=2) block
+                    // must have only one bit of random
+                    if random_bits_count < (i + 1) * message_bits_count {
+                        let top_message_bits_count = random_bits_count - i * message_bits_count;
+
+                        assert!(top_message_bits_count <= message_bits_count);
+                        let ct: CudaUnsignedRadixCiphertext = self
+                            .generate_oblivious_pseudo_random(
+                                seed,
+                                top_message_bits_count,
+                                &streams_vector[stream_index],
+                            );
+                        ct.ciphertext
+                    } else {
+                        let ct: CudaUnsignedRadixCiphertext = self
+                            .generate_oblivious_pseudo_random(
+                                seed,
+                                message_bits_count,
+                                &streams_vector[stream_index],
+                            );
+                        ct.ciphertext
+                    }
+                } else {
+                    let ct: CudaUnsignedRadixCiphertext =
+                        self.create_trivial_zero_radix(1, &streams_vector[stream_index]);
+                    ct.ciphertext
+                }
+            })
+            .collect::<Vec<_>>();
+        self.convert_radixes_vec_to_single_radix_ciphertext(&blocks, streams)
+    }
+
+    /// Generates an encrypted `num_block` blocks unsigned integer
+    /// taken uniformly in `[0, 2^random_bits_count[` using the given seed.
+    /// The encryted value is oblivious to the server.
+    /// It can be useful to make server random generation deterministic.
+    ///
+    /// ```rust
+    /// use tfhe::core_crypto::gpu::CudaStreams;
+    /// use tfhe::core_crypto::gpu::vec::GpuIndex;
+    /// use tfhe::integer::gpu::gen_keys_gpu;
+    /// use tfhe::shortint::parameters::PARAM_GPU_MULTI_BIT_GROUP_3_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M64;
+    /// use tfhe::Seed;
+    ///
+    /// let gpu_index = 0;
+    /// let streams = CudaStreams::new_single_gpu(GpuIndex(gpu_index));
+    /// let size = 4;
+    ///
+    /// // Generate the client key and the server key:
+    /// let (cks, sks) = gen_keys_gpu(PARAM_GPU_MULTI_BIT_GROUP_3_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M64, &streams);
+    ///
+    /// let random_bits_count = 3;
+    ///
+    /// let d_ct_res = sks.par_generate_oblivious_pseudo_random_unsigned_integer_bounded(
+    ///     Seed(0),
+    ///     random_bits_count,
+    ///     size as u64,
+    ///     &streams,
+    /// );
+    /// let ct_res = d_ct_res.to_radix_ciphertext(&streams);
+    /// // Decrypt:
+    /// let dec_result: u64 = cks.decrypt(&ct_res);
+    /// assert!(dec_result < (1 << random_bits_count));
+    /// ```
+    pub fn par_generate_oblivious_pseudo_random_unsigned_integer_bounded(
+        &self,
+        seed: Seed,
+        random_bits_count: u64,
+        num_blocks: u64,
+        streams: &CudaStreams,
+    ) -> CudaUnsignedRadixCiphertext {
+        assert!(self.message_modulus.0.is_power_of_two());
+        let range_log_size = self.message_modulus.0.ilog2() as u64 * num_blocks;
+
+        assert!(
+            random_bits_count <= range_log_size,
+            "The range asked for a random value (=[0, 2^{random_bits_count}[) does not fit in the available range [0, 2^{range_log_size}[",
+        );
+
+        assert!(self.message_modulus.0.is_power_of_two());
+        let mut streams_vector = Vec::<CudaStreams>::with_capacity(num_blocks as usize);
+        for _ in 0..num_blocks {
+            streams_vector.push(CudaStreams::new_single_gpu(streams.gpu_indexes[0]));
+        }
+        let message_bits_count = self.message_modulus.0.ilog2() as u64;
+
+        let mut deterministic_seeder = DeterministicSeeder::<DefaultRandomGenerator>::new(seed);
+
+        let seeds: Vec<Seed> = (0..num_blocks)
+            .map(|_| deterministic_seeder.seed())
+            .collect();
+
+        let blocks = seeds
+            .into_par_iter()
+            .enumerate()
+            .map(|(i, seed)| {
+                let stream_index = i;
+                let i = i as u64;
+
+                if i * message_bits_count < random_bits_count {
+                    // if we generate 5 bits of noise in n blocks of 2 bits, the third (i=2) block
+                    // must have only one bit of random
+                    if random_bits_count < (i + 1) * message_bits_count {
+                        let top_message_bits_count = random_bits_count - i * message_bits_count;
+
+                        assert!(top_message_bits_count <= message_bits_count);
+
+                        let ct: CudaUnsignedRadixCiphertext = self
+                            .generate_oblivious_pseudo_random(
+                                seed,
+                                top_message_bits_count,
+                                &streams_vector[stream_index],
+                            );
+                        ct.ciphertext
+                    } else {
+                        let ct: CudaUnsignedRadixCiphertext = self
+                            .generate_oblivious_pseudo_random(
+                                seed,
+                                message_bits_count,
+                                &streams_vector[stream_index],
+                            );
+                        ct.ciphertext
+                    }
+                } else {
+                    let ct: CudaUnsignedRadixCiphertext =
+                        self.create_trivial_zero_radix(1, &streams_vector[stream_index]);
+                    ct.ciphertext
+                }
+            })
+            .collect::<Vec<_>>();
+        self.convert_radixes_vec_to_single_radix_ciphertext(&blocks, streams)
+    }
+
+    /// Generates an encrypted `num_block` blocks signed integer
+    /// taken uniformly in its full range using the given seed.
+    /// The encryted value is oblivious to the server.
+    /// It can be useful to make server random generation deterministic.
+    ///
+    /// ```rust
+    /// use tfhe::core_crypto::gpu::CudaStreams;
+    /// use tfhe::core_crypto::gpu::vec::GpuIndex;
+    /// use tfhe::integer::gpu::gen_keys_gpu;
+    /// use tfhe::shortint::parameters::PARAM_GPU_MULTI_BIT_GROUP_3_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M64;
+    /// use tfhe::Seed;
+    ///
+    /// let gpu_index = 0;
+    /// let streams = CudaStreams::new_single_gpu(GpuIndex(gpu_index));
+    /// let size = 4;
+    ///
+    /// // Generate the client key and the server key:
+    /// let (cks, sks) = gen_keys_gpu(PARAM_GPU_MULTI_BIT_GROUP_3_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M64, &streams);
+    ///
+    /// let d_ct_res = sks.par_generate_oblivious_pseudo_random_signed_integer(Seed(0), size as u64, &streams);
+    /// let ct_res = d_ct_res.to_signed_radix_ciphertext(&streams);
+    ///
+    /// // Decrypt:
+    /// let dec_result: i64 = cks.decrypt_signed_radix(&ct_res);
+    /// assert!(dec_result < 1 << (2 * size - 1));
+    /// assert!(dec_result >= -(1 << (2 * size - 1)));
+    /// ```
+    pub fn par_generate_oblivious_pseudo_random_signed_integer(
+        &self,
+        seed: Seed,
+        num_blocks: u64,
+        streams: &CudaStreams,
+    ) -> CudaSignedRadixCiphertext {
+        assert!(self.message_modulus.0.is_power_of_two());
+        let message_bits_count = self.message_modulus.0.ilog2() as u64;
+        let mut streams_vector = Vec::<CudaStreams>::with_capacity(num_blocks as usize);
+        for _ in 0..num_blocks {
+            streams_vector.push(CudaStreams::new_single_gpu(streams.gpu_indexes[0]));
+        }
+        let mut deterministic_seeder = DeterministicSeeder::<DefaultRandomGenerator>::new(seed);
+
+        let seeds: Vec<Seed> = (0..num_blocks)
+            .map(|_| deterministic_seeder.seed())
+            .collect();
+
+        let blocks = seeds
+            .into_par_iter()
+            .enumerate()
+            .map(|(i, seed)| {
+                let stream_index = i;
+                let ct: CudaSignedRadixCiphertext = self.generate_oblivious_pseudo_random(
+                    seed,
+                    message_bits_count,
+                    &streams_vector[stream_index],
+                );
+                ct.ciphertext
+            })
+            .collect::<Vec<_>>();
+        self.convert_radixes_vec_to_single_radix_ciphertext(&blocks, streams)
+    }
+
+    /// Generates an encrypted `num_block` blocks signed integer
+    /// taken uniformly in `[0, 2^random_bits_count[` using the given seed.
+    /// The encryted value is oblivious to the server.
+    /// It can be useful to make server random generation deterministic.
+    ///
+    /// ```rust
+    /// use tfhe::core_crypto::gpu::CudaStreams;
+    /// use tfhe::core_crypto::gpu::vec::GpuIndex;
+    /// use tfhe::integer::gpu::gen_keys_gpu;
+    /// use tfhe::shortint::parameters::PARAM_GPU_MULTI_BIT_GROUP_3_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M64;
+    /// use tfhe::Seed;
+    ///
+    /// let gpu_index = 0;
+    /// let streams = CudaStreams::new_single_gpu(GpuIndex(gpu_index));
+    /// let size = 4;
+    ///
+    /// // Generate the client key and the server key:
+    /// let (cks, sks) = gen_keys_gpu(PARAM_GPU_MULTI_BIT_GROUP_3_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M64, &streams);
+    ///
+    /// let random_bits_count = 3;
+    ///
+    /// let d_ct_res = sks.par_generate_oblivious_pseudo_random_signed_integer_bounded(
+    ///     Seed(0),
+    ///     random_bits_count,
+    ///     size as u64,
+    ///     &streams,
+    /// );
+    /// let ct_res = d_ct_res.to_signed_radix_ciphertext(&streams);
+    ///
+    /// // Decrypt:
+    /// let dec_result: i64 = cks.decrypt_signed_radix(&ct_res);
+    /// assert!(dec_result >= 0);
+    /// assert!(dec_result < (1 << random_bits_count));
+    /// ```
+    pub fn par_generate_oblivious_pseudo_random_signed_integer_bounded(
+        &self,
+        seed: Seed,
+        random_bits_count: u64,
+        num_blocks: u64,
+        streams: &CudaStreams,
+    ) -> CudaSignedRadixCiphertext {
+        assert!(self.message_modulus.0.is_power_of_two());
+        let range_log_size = self.message_modulus.0.ilog2() as u64 * num_blocks;
+
+        #[allow(clippy::int_plus_one)]
+        {
+            assert!(
+                random_bits_count + 1 <= range_log_size,
+                "The range asked for a random value (=[0, 2^{}[) does not fit in the available range [-2^{}, 2^{}[",
+                random_bits_count, range_log_size-1, range_log_size-1,
+            );
+        }
+
+        assert!(self.message_modulus.0.is_power_of_two());
+        let mut streams_vector = Vec::<CudaStreams>::with_capacity(num_blocks as usize);
+        for _ in 0..num_blocks {
+            streams_vector.push(CudaStreams::new_single_gpu(streams.gpu_indexes[0]));
+        }
+        let message_bits_count = self.message_modulus.0.ilog2() as u64;
+
+        let mut deterministic_seeder = DeterministicSeeder::<DefaultRandomGenerator>::new(seed);
+
+        let seeds = (0..num_blocks).map(|_| deterministic_seeder.seed());
+
+        let blocks = seeds
+            .into_iter()
+            .enumerate()
+            .map(|(i, seed)| {
+                let stream_index = i;
+                let i = i as u64;
+                if i * message_bits_count < random_bits_count {
+                    // if we generate 5 bits of noise in n blocks of 2 bits, the third (i=2)
+                    // block must have only one bit of random
+                    if random_bits_count < (i + 1) * message_bits_count {
+                        let top_message_bits_count = random_bits_count - i * message_bits_count;
+
+                        assert!(top_message_bits_count <= message_bits_count);
+                        let ct: CudaUnsignedRadixCiphertext = self
+                            .generate_oblivious_pseudo_random(
+                                seed,
+                                top_message_bits_count,
+                                &streams_vector[stream_index],
+                            );
+                        ct.ciphertext
+                    } else {
+                        let ct: CudaUnsignedRadixCiphertext = self
+                            .generate_oblivious_pseudo_random(
+                                seed,
+                                message_bits_count,
+                                &streams_vector[stream_index],
+                            );
+                        ct.ciphertext
+                    }
+                } else {
+                    let ct: CudaUnsignedRadixCiphertext =
+                        self.create_trivial_zero_radix(1, &streams_vector[stream_index]);
+                    ct.ciphertext
+                }
+            })
+            .collect::<Vec<_>>();
+
+        self.convert_radixes_vec_to_single_radix_ciphertext(&blocks, streams)
+    }
+
+    /// Uniformly generates a random encrypted value in `[0, 2^random_bits_count[`
+    /// `2^random_bits_count` must be smaller than the message modulus
+    /// The encryted value is oblivious to the server
+    pub fn generate_oblivious_pseudo_random<T>(
+        &self,
+        seed: Seed,
+        random_bits_count: u64,
+        streams: &CudaStreams,
+    ) -> T
+    where
+        T: CudaIntegerRadixCiphertext,
+    {
+        assert!(
+            1 << random_bits_count <= self.message_modulus.0,
+            "The range asked for a random value (=[0, 2^{}[) does not fit in the available range [0, {}[",
+            random_bits_count, self.message_modulus.0
+        );
+        self.generate_oblivious_pseudo_random_message_and_carry(seed, random_bits_count, streams)
+    }
+
+    /// Uniformly generates a random value in `[0, 2^random_bits_count[`
+    /// The encryted value is oblivious to the server
+    pub(crate) fn generate_oblivious_pseudo_random_message_and_carry<T>(
+        &self,
+        seed: Seed,
+        random_bits_count: u64,
+        streams: &CudaStreams,
+    ) -> T
+    where
+        T: CudaIntegerRadixCiphertext,
+    {
+        assert!(
+            self.message_modulus.0.is_power_of_two(),
+            "The message modulus(={}), must be a power of 2 to use the OPRF",
+            self.message_modulus.0
+        );
+        let message_bits_count = self.message_modulus.0.ilog2() as u64;
+
+        assert!(
+            self.carry_modulus.0.is_power_of_two(),
+            "The carry modulus(={}), must be a power of 2 to use the OPRF",
+            self.carry_modulus.0
+        );
+        let carry_bits_count = self.carry_modulus.0.ilog2() as u64;
+
+        assert!(
+            random_bits_count <= carry_bits_count + message_bits_count,
+            "The number of random bits asked for (={random_bits_count}) is bigger than carry_bits_count (={carry_bits_count}) + message_bits_count(={message_bits_count})",
+        );
+        self.generate_oblivious_pseudo_random_custom_encoding(
+            seed,
+            random_bits_count,
+            1 + carry_bits_count + message_bits_count,
+            streams,
+        )
+    }
+
+    /// Uniformly generates a random encrypted value in `[0, 2^random_bits_count[`
+    /// The output in in the form 0000rrr000noise (rbc=3, fbc=7)
+    /// The encryted value is oblivious to the server
+    pub(crate) fn generate_oblivious_pseudo_random_custom_encoding<T>(
+        &self,
+        seed: Seed,
+        random_bits_count: u64,
+        full_bits_count: u64,
+        streams: &CudaStreams,
+    ) -> T
+    where
+        T: CudaIntegerRadixCiphertext,
+    {
+        assert!(
+            random_bits_count <= full_bits_count,
+            "The number of random bits asked for (={random_bits_count}) is bigger than full_bits_count (={full_bits_count})"
+        );
+
+        let (in_lwe_size, out_lwe_dimension, polynomial_size) = match &self.bootstrapping_key {
+            CudaBootstrappingKey::Classic(d_bsk) => (
+                d_bsk.input_lwe_dimension().to_lwe_size(),
+                d_bsk.output_lwe_dimension(),
+                d_bsk.polynomial_size(),
+            ),
+            CudaBootstrappingKey::MultiBit(d_bsk) => (
+                d_bsk.input_lwe_dimension().to_lwe_size(),
+                d_bsk.output_lwe_dimension(),
+                d_bsk.polynomial_size(),
+            ),
+        };
+
+        let seeded = create_random_from_seed_modulus_switched(
+            seed,
+            in_lwe_size,
+            polynomial_size.to_blind_rotation_input_modulus_log(),
+            self.ciphertext_modulus,
+        );
+
+        let p = 1 << random_bits_count;
+
+        let delta = 1_u64 << (64 - full_bits_count);
+
+        let poly_delta = 2 * polynomial_size.0 as u64 / p;
+
+        let lut_no_encode: LookupTableOwned =
+            self.generate_lookup_table_no_encode(|x| (2 * (x / poly_delta) + 1) * delta / 2);
+
+        let num_ct_blocks = 1;
+        let ct_seeded = CudaLweCiphertextList::from_lwe_ciphertext(&seeded, streams);
+
+        let mut ct_out: T = self.create_trivial_zero_radix(num_ct_blocks, streams);
+
+        let number_of_messages = 1;
+        let d_accumulator =
+            CudaGlweCiphertextList::from_glwe_ciphertext(&lut_no_encode.acc, streams);
+        let mut lut_vector_indexes: Vec<u64> = vec![u64::ZERO; number_of_messages];
+        for (i, ind) in lut_vector_indexes.iter_mut().enumerate() {
+            *ind = <usize as CastInto<u64>>::cast_into(i);
+        }
+
+        let mut d_lut_vector_indexes =
+            unsafe { CudaVec::<u64>::new_async(number_of_messages, streams, 0) };
+        unsafe { d_lut_vector_indexes.copy_from_cpu_async(&lut_vector_indexes, streams, 0) };
+        let lwe_indexes_usize: Vec<usize> = (0..num_ct_blocks).collect_vec();
+        let lwe_indexes = lwe_indexes_usize
+            .iter()
+            .map(|&x| <usize as CastInto<u64>>::cast_into(x))
+            .collect_vec();
+        let mut d_output_indexes = unsafe { CudaVec::<u64>::new_async(num_ct_blocks, streams, 0) };
+        let mut d_input_indexes = unsafe { CudaVec::<u64>::new_async(num_ct_blocks, streams, 0) };
+        unsafe {
+            d_input_indexes.copy_from_cpu_async(&lwe_indexes, streams, 0);
+            d_output_indexes.copy_from_cpu_async(&lwe_indexes, streams, 0);
+        }
+
+        match &self.bootstrapping_key {
+            CudaBootstrappingKey::Classic(d_bsk) => {
+                cuda_programmable_bootstrap_lwe_ciphertext(
+                    &ct_seeded,
+                    &mut ct_out.as_mut().d_blocks,
+                    &d_accumulator,
+                    &d_lut_vector_indexes,
+                    &d_output_indexes,
+                    &d_input_indexes,
+                    LweCiphertextCount(num_ct_blocks),
+                    d_bsk,
+                    streams,
+                );
+            }
+            CudaBootstrappingKey::MultiBit(d_multibit_bsk) => {
+                cuda_multi_bit_programmable_bootstrap_lwe_ciphertext(
+                    &ct_seeded,
+                    &mut ct_out.as_mut().d_blocks,
+                    &d_accumulator,
+                    &d_lut_vector_indexes,
+                    &d_output_indexes,
+                    &d_input_indexes,
+                    d_multibit_bsk,
+                    streams,
+                );
+            }
+        }
+
+        let plaintext_to_add = (p - 1) * delta / 2;
+        let ct_cloned = ct_out.duplicate(streams);
+        unsafe {
+            add_lwe_ciphertext_vector_plaintext_scalar_async(
+                streams,
+                &mut ct_out.as_mut().d_blocks.0.d_vec,
+                &ct_cloned.as_ref().d_blocks.0.d_vec,
+                plaintext_to_add,
+                out_lwe_dimension,
+                num_ct_blocks as u32,
+            );
+        }
+        streams.synchronize();
+        ct_out
+    }
+}
+
+#[cfg(test)]
+pub(crate) mod test {
+    use crate::core_crypto::gpu::vec::GpuIndex;
+    use crate::core_crypto::gpu::CudaStreams;
+    use crate::core_crypto::prelude::decrypt_lwe_ciphertext;
+    use crate::integer::gpu::server_key::radix::CudaUnsignedRadixCiphertext;
+    use crate::integer::gpu::server_key::CudaBootstrappingKey;
+    use crate::integer::gpu::{gen_keys_gpu, CudaServerKey};
+    use crate::integer::{ClientKey, RadixCiphertext};
+    use crate::shortint::oprf::create_random_from_seed_modulus_switched;
+    use crate::shortint::parameters::PARAM_GPU_MULTI_BIT_GROUP_3_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M64;
+    use rayon::prelude::*;
+    use statrs::distribution::ContinuousCDF;
+    use std::collections::HashMap;
+    use tfhe_csprng::seeders::Seed;
+
+    fn square(a: f64) -> f64 {
+        a * a
+    }
+
+    #[test]
+    fn oprf_compare_plain_ci_run_filter() {
+        let gpu_index = 0;
+        let streams = CudaStreams::new_single_gpu(GpuIndex(gpu_index));
+        let (ck, gpu_sk) = gen_keys_gpu(
+            PARAM_GPU_MULTI_BIT_GROUP_3_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M64,
+            &streams,
+        );
+
+        for seed in 0..1000 {
+            oprf_compare_plain_from_seed(Seed(seed), &ck, &gpu_sk, &streams);
+        }
+    }
+
+    fn oprf_compare_plain_from_seed(
+        seed: Seed,
+        ck: &ClientKey,
+        sk: &CudaServerKey,
+        streams: &CudaStreams,
+    ) {
+        let params = ck.parameters();
+
+        let random_bits_count = 2;
+
+        let input_p = 2 * params.polynomial_size().0 as u64;
+
+        let log_input_p = input_p.ilog2();
+
+        let p_prime = 1 << random_bits_count;
+
+        let output_p = 2 * params.carry_modulus().0 * params.message_modulus().0;
+
+        let poly_delta = 2 * params.polynomial_size().0 as u64 / p_prime;
+
+        let d_img: CudaUnsignedRadixCiphertext =
+            sk.generate_oblivious_pseudo_random(seed, random_bits_count, streams);
+        let img: RadixCiphertext = d_img.to_radix_ciphertext(streams);
+
+        let (lwe_size, polynomial_size) = match &sk.bootstrapping_key {
+            CudaBootstrappingKey::Classic(d_bsk) => (
+                d_bsk.input_lwe_dimension().to_lwe_size(),
+                d_bsk.polynomial_size(),
+            ),
+            CudaBootstrappingKey::MultiBit(d_multibit_bsk) => (
+                d_multibit_bsk.input_lwe_dimension().to_lwe_size(),
+                d_multibit_bsk.polynomial_size(),
+            ),
+        };
+
+        let ct = create_random_from_seed_modulus_switched(
+            seed,
+            lwe_size,
+            polynomial_size.to_blind_rotation_input_modulus_log(),
+            sk.ciphertext_modulus,
+        );
+
+        let sk = ck.key.small_lwe_secret_key();
+        let plain_prf_input = decrypt_lwe_ciphertext(&sk, &ct)
+            .0
+            .wrapping_add(1 << (64 - log_input_p - 1))
+            >> (64 - log_input_p);
+
+        let half_negacyclic_part = |x| 2 * (x / poly_delta) + 1;
+
+        let negacyclic_part = |x| {
+            assert!(x < input_p);
+            if x < input_p / 2 {
+                half_negacyclic_part(x)
+            } else {
+                2 * output_p - half_negacyclic_part(x - (input_p / 2))
+            }
+        };
+
+        let prf = |x| {
+            let a = (negacyclic_part(x) + p_prime - 1) % (2 * output_p);
+            assert!(a % 2 == 0);
+            a / 2
+        };
+
+        let expected_output = prf(plain_prf_input);
+
+        let output = ck.key.decrypt_message_and_carry(&img.blocks[0]);
+
+        assert!(output < p_prime);
+        assert_eq!(output, expected_output);
+    }
+
+    #[test]
+    fn oprf_test_uniformity_ci_run_filter() {
+        let sample_count: usize = 100_000;
+
+        let p_value_limit: f64 = 0.000_01;
+        let gpu_index = 0;
+        let streams = CudaStreams::new_single_gpu(GpuIndex(gpu_index));
+        let (ck, gpu_sk) = gen_keys_gpu(
+            PARAM_GPU_MULTI_BIT_GROUP_3_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M64,
+            &streams,
+        );
+
+        let test_uniformity = |distinct_values: u64, f: &(dyn Fn(usize) -> u64 + Sync)| {
+            test_uniformity(sample_count, p_value_limit, distinct_values, f)
+        };
+
+        let random_bits_count = 2;
+
+        test_uniformity(1 << random_bits_count, &|seed| {
+            let d_img: CudaUnsignedRadixCiphertext = gpu_sk.generate_oblivious_pseudo_random(
+                Seed(seed as u128),
+                random_bits_count,
+                &streams,
+            );
+            let img: RadixCiphertext = d_img.to_radix_ciphertext(&streams);
+            ck.key.decrypt_message_and_carry(&img.blocks[0])
+        });
+    }
+
+    pub fn test_uniformity<F>(sample_count: usize, p_value_limit: f64, distinct_values: u64, f: F)
+    where
+        F: Sync + Fn(usize) -> u64,
+    {
+        let p_value = uniformity_p_value(f, sample_count, distinct_values);
+
+        assert!(
+            p_value_limit < p_value,
+            "p_value (={p_value}) expected to be bigger than {p_value_limit}"
+        );
+    }
+
+    fn uniformity_p_value<F>(f: F, sample_count: usize, distinct_values: u64) -> f64
+    where
+        F: Sync + Fn(usize) -> u64,
+    {
+        let values: Vec<_> = (0..sample_count).into_par_iter().map(&f).collect();
+
+        let mut values_count = HashMap::new();
+
+        for i in &values {
+            assert!(*i < distinct_values, "i {} dv{}", *i, distinct_values);
+
+            *values_count.entry(i).or_insert(0) += 1;
+        }
+
+        let single_expected_count = sample_count as f64 / distinct_values as f64;
+
+        // https://en.wikipedia.org/wiki/Pearson's_chi-squared_test
+        let distance: f64 = (0..distinct_values)
+            .map(|value| *values_count.get(&value).unwrap_or(&0))
+            .map(|count| square(count as f64 - single_expected_count) / single_expected_count)
+            .sum();
+
+        statrs::distribution::ChiSquared::new((distinct_values - 1) as f64)
+            .unwrap()
+            .sf(distance)
+    }
+}
diff --git a/tfhe/src/integer/gpu/server_key/radix/vector_comparisons.rs b/tfhe/src/integer/gpu/server_key/radix/vector_comparisons.rs
index 3efb158321..a0433ebbe6 100644
--- a/tfhe/src/integer/gpu/server_key/radix/vector_comparisons.rs
+++ b/tfhe/src/integer/gpu/server_key/radix/vector_comparisons.rs
@@ -3,7 +3,9 @@ use crate::core_crypto::gpu::CudaStreams;
 use crate::core_crypto::prelude::LweBskGroupingFactor;
 use crate::integer::gpu::ciphertext::boolean_value::CudaBooleanBlock;
 use crate::integer::gpu::ciphertext::{CudaIntegerRadixCiphertext, CudaUnsignedRadixCiphertext};
-use crate::integer::gpu::server_key::radix::CudaRadixCiphertext;
+use crate::integer::gpu::server_key::radix::{
+    CudaBlockInfo, CudaRadixCiphertext, CudaRadixCiphertextInfo,
+};
 use crate::integer::gpu::server_key::{CudaBootstrappingKey, CudaServerKey};
 use crate::integer::gpu::{apply_bivariate_lut_kb_async, PBSType};
 
@@ -23,9 +25,16 @@ impl CudaServerKey {
                 .map(|ciphertext| &ciphertext.as_ref().d_blocks),
             streams,
         );
+        let vec_block_info: Vec<CudaBlockInfo> = radixes
+            .iter()
+            .flat_map(|ct| ct.as_ref().info.blocks.clone())
+            .collect();
+        let radix_info = CudaRadixCiphertextInfo {
+            blocks: vec_block_info,
+        };
         CudaIntegerRadixCiphertext::from(CudaRadixCiphertext {
             d_blocks: packed_list,
-            info: radixes[0].as_ref().info.clone(),
+            info: radix_info,
         })
     }
 
diff --git a/tfhe/src/integer/gpu/server_key/radix/vector_find.rs b/tfhe/src/integer/gpu/server_key/radix/vector_find.rs
index 89fe57bd50..56ff9642b3 100644
--- a/tfhe/src/integer/gpu/server_key/radix/vector_find.rs
+++ b/tfhe/src/integer/gpu/server_key/radix/vector_find.rs
@@ -3,6 +3,7 @@ use crate::core_crypto::gpu::CudaStreams;
 use crate::core_crypto::prelude::{LweBskGroupingFactor, UnsignedInteger};
 use crate::integer::block_decomposition::{BlockDecomposer, Decomposable, DecomposableInto};
 use crate::integer::gpu::ciphertext::boolean_value::CudaBooleanBlock;
+use crate::integer::gpu::ciphertext::info::{CudaBlockInfo, CudaRadixCiphertextInfo};
 use crate::integer::gpu::ciphertext::{CudaIntegerRadixCiphertext, CudaUnsignedRadixCiphertext};
 use crate::integer::gpu::server_key::radix::CudaRadixCiphertext;
 use crate::integer::gpu::server_key::{CudaBootstrappingKey, CudaServerKey};
@@ -31,14 +32,17 @@ impl CudaServerKey {
                 .map(|ciphertext| &ciphertext.0.ciphertext.d_blocks),
             streams,
         );
-
-        let blocks_ct: CudaUnsignedRadixCiphertext = CudaUnsignedRadixCiphertext {
-            ciphertext: CudaRadixCiphertext {
-                d_blocks: packed_list,
-                info: selectors[0].0.ciphertext.info.clone(),
-            },
+        let vec_block_info: Vec<CudaBlockInfo> = selectors
+            .iter()
+            .flat_map(|ct| ct.0.ciphertext.info.blocks.clone())
+            .collect();
+        let radix_info = CudaRadixCiphertextInfo {
+            blocks: vec_block_info,
         };
-        blocks_ct
+        CudaIntegerRadixCiphertext::from(CudaRadixCiphertext {
+            d_blocks: packed_list,
+            info: radix_info,
+        })
     }
     #[allow(clippy::unused_self)]
     pub(crate) fn convert_radixes_vec_to_single_radix_ciphertext<T>(
@@ -56,10 +60,16 @@ impl CudaServerKey {
             radixes.iter().map(|ciphertext| &ciphertext.d_blocks),
             streams,
         );
-
+        let vec_block_info: Vec<CudaBlockInfo> = radixes
+            .iter()
+            .flat_map(|ct| ct.info.blocks.clone())
+            .collect();
+        let radix_info = CudaRadixCiphertextInfo {
+            blocks: vec_block_info,
+        };
         CudaIntegerRadixCiphertext::from(CudaRadixCiphertext {
             d_blocks: packed_list,
-            info: radixes[0].info.clone(),
+            info: radix_info,
         })
     }
 
diff --git a/tfhe/src/shortint/engine/mod.rs b/tfhe/src/shortint/engine/mod.rs
index 8986d85bc8..9d6dbac699 100644
--- a/tfhe/src/shortint/engine/mod.rs
+++ b/tfhe/src/shortint/engine/mod.rs
@@ -160,20 +160,15 @@ where
 
 pub(crate) fn fill_accumulator_no_encoding<F, C>(
     accumulator: &mut GlweCiphertext<C>,
-    server_key: &ServerKey,
+    polynomial_size: PolynomialSize,
+    glwe_size: GlweSize,
     f: F,
 ) where
     C: ContainerMut<Element = u64>,
     F: Fn(u64) -> u64,
 {
-    assert_eq!(
-        accumulator.polynomial_size(),
-        server_key.bootstrapping_key.polynomial_size()
-    );
-    assert_eq!(
-        accumulator.glwe_size(),
-        server_key.bootstrapping_key.glwe_size()
-    );
+    assert_eq!(accumulator.polynomial_size(), polynomial_size);
+    assert_eq!(accumulator.glwe_size(), glwe_size);
 
     let mut accumulator_view = accumulator.as_mut_view();
 
diff --git a/tfhe/src/shortint/oprf.rs b/tfhe/src/shortint/oprf.rs
index a55229622e..8484bcaef2 100644
--- a/tfhe/src/shortint/oprf.rs
+++ b/tfhe/src/shortint/oprf.rs
@@ -1,8 +1,8 @@
 use super::Ciphertext;
 use crate::core_crypto::fft_impl::common::modulus_switch;
 use crate::core_crypto::prelude::{
-    keyswitch_lwe_ciphertext, lwe_ciphertext_plaintext_add_assign, CiphertextModulusLog,
-    LweCiphertext, LweSize, Plaintext,
+    keyswitch_lwe_ciphertext, lwe_ciphertext_plaintext_add_assign, CiphertextModulus,
+    CiphertextModulusLog, LweCiphertext, LweSize, Plaintext,
 };
 use crate::shortint::ciphertext::Degree;
 use crate::shortint::engine::ShortintEngine;
@@ -28,35 +28,33 @@ pub fn sha3_hash(values: &mut [u64], seed: Seed) {
         *value = u64::from_le_bytes(bytes);
     }
 }
+pub fn create_random_from_seed(
+    seed: Seed,
+    lwe_size: LweSize,
+    ciphertext_modulus: CiphertextModulus<u64>,
+) -> LweCiphertext<Vec<u64>> {
+    let mut ct = LweCiphertext::new(0, lwe_size, ciphertext_modulus);
 
-impl ServerKey {
-    pub(crate) fn create_random_from_seed(
-        &self,
-        seed: Seed,
-        lwe_size: LweSize,
-    ) -> LweCiphertext<Vec<u64>> {
-        let mut ct = LweCiphertext::new(0, lwe_size, self.ciphertext_modulus);
-
-        sha3_hash(ct.get_mut_mask().as_mut(), seed);
-
-        ct
-    }
+    sha3_hash(ct.get_mut_mask().as_mut(), seed);
 
-    pub(crate) fn create_random_from_seed_modulus_switched(
-        &self,
-        seed: Seed,
-        lwe_size: LweSize,
-        log_modulus: CiphertextModulusLog,
-    ) -> LweCiphertext<Vec<u64>> {
-        let mut ct = self.create_random_from_seed(seed, lwe_size);
+    ct
+}
 
-        for i in ct.as_mut() {
-            *i = modulus_switch(*i, log_modulus) << (64 - log_modulus.0);
-        }
+pub fn create_random_from_seed_modulus_switched(
+    seed: Seed,
+    lwe_size: LweSize,
+    log_modulus: CiphertextModulusLog,
+    ciphertext_modulus: CiphertextModulus<u64>,
+) -> LweCiphertext<Vec<u64>> {
+    let mut ct = create_random_from_seed(seed, lwe_size, ciphertext_modulus);
 
-        ct
+    for i in ct.as_mut() {
+        *i = modulus_switch(*i, log_modulus) << (64 - log_modulus.0);
     }
 
+    ct
+}
+impl ServerKey {
     /// Uniformly generates a random encrypted value in `[0, 2^random_bits_count[`
     /// `2^random_bits_count` must be smaller than the message modulus
     /// The encryted value is oblivious to the server
@@ -123,12 +121,13 @@ impl ServerKey {
 
         let in_lwe_size = self.bootstrapping_key.input_lwe_dimension().to_lwe_size();
 
-        let seeded = self.create_random_from_seed_modulus_switched(
+        let seeded = create_random_from_seed_modulus_switched(
             seed,
             in_lwe_size,
             self.bootstrapping_key
                 .polynomial_size()
                 .to_blind_rotation_input_modulus_log(),
+            self.ciphertext_modulus,
         );
 
         let p = 1 << random_bits_count;
@@ -183,6 +182,7 @@ impl ServerKey {
 #[cfg(test)]
 pub(crate) mod test {
     use crate::core_crypto::prelude::decrypt_lwe_ciphertext;
+    use crate::shortint::oprf::create_random_from_seed_modulus_switched;
     use crate::shortint::{ClientKey, ServerKey};
     use rayon::prelude::*;
     use statrs::distribution::ContinuousCDF;
@@ -223,12 +223,13 @@ pub(crate) mod test {
 
         let lwe_size = sk.bootstrapping_key.input_lwe_dimension().to_lwe_size();
 
-        let ct = sk.create_random_from_seed_modulus_switched(
+        let ct = create_random_from_seed_modulus_switched(
             seed,
             lwe_size,
             sk.bootstrapping_key
                 .polynomial_size()
                 .to_blind_rotation_input_modulus_log(),
+            sk.ciphertext_modulus,
         );
 
         let sk = ck.small_lwe_secret_key();
diff --git a/tfhe/src/shortint/server_key/mod.rs b/tfhe/src/shortint/server_key/mod.rs
index 0fc59daeb2..ed82e2fce6 100644
--- a/tfhe/src/shortint/server_key/mod.rs
+++ b/tfhe/src/shortint/server_key/mod.rs
@@ -640,7 +640,12 @@ impl ServerKey {
             self.bootstrapping_key.polynomial_size(),
             self.ciphertext_modulus,
         );
-        fill_accumulator_no_encoding(&mut acc, self, f);
+        fill_accumulator_no_encoding(
+            &mut acc,
+            self.bootstrapping_key.polynomial_size(),
+            self.bootstrapping_key.glwe_size(),
+            f,
+        );
 
         LookupTableOwned {
             acc,