From 01a1ff1cb92b730bcb70a184f1b21115d9fb269a Mon Sep 17 00:00:00 2001 From: Agnes Leroy Date: Thu, 16 Jan 2025 18:17:10 +0100 Subject: [PATCH] chore(gpu): start using a struct to pass data across rust/c++ --- .../tfhe-cuda-backend/cuda/include/device.h | 3 + .../cuda/include/integer/integer.h | 21 +- .../cuda/include/integer/integer_utilities.h | 47 +-- .../cuda/include/integer/radix_ciphertext.h | 31 ++ .../cuda/include/linear_algebra.h | 16 +- backends/tfhe-cuda-backend/cuda/src/device.cu | 7 + .../tfhe-cuda-backend/cuda/src/integer/abs.cu | 9 +- .../cuda/src/integer/abs.cuh | 60 +++- .../cuda/src/integer/cmux.cu | 19 +- .../cuda/src/integer/cmux.cuh | 91 +++++- .../cuda/src/integer/comparison.cuh | 2 +- .../cuda/src/integer/div_rem.cuh | 36 +-- .../cuda/src/integer/integer.cuh | 153 ++++------ .../cuda/src/integer/multiplication.cuh | 12 +- .../cuda/src/integer/radix_ciphertext.cu | 87 ++++++ .../cuda/src/integer/scalar_comparison.cuh | 10 +- .../cuda/src/integer/shift_and_rotate.cuh | 12 +- .../cuda/src/linearalgebra/addition.cu | 29 +- .../cuda/src/linearalgebra/addition.cuh | 39 ++- backends/tfhe-cuda-backend/src/bindings.rs | 277 ++++++++++-------- tfhe/src/core_crypto/gpu/mod.rs | 67 ++++- tfhe/src/integer/gpu/ciphertext/info.rs | 17 -- tfhe/src/integer/gpu/mod.rs | 128 ++++++-- tfhe/src/integer/gpu/server_key/radix/abs.rs | 122 ++++---- tfhe/src/integer/gpu/server_key/radix/add.rs | 55 ++-- .../gpu/server_key/radix/bitwise_op.rs | 26 +- tfhe/src/integer/gpu/server_key/radix/cmux.rs | 168 +++++------ .../gpu/server_key/radix/comparison.rs | 64 ++-- .../integer/gpu/server_key/radix/div_mod.rs | 40 ++- .../src/integer/gpu/server_key/radix/ilog2.rs | 12 +- tfhe/src/integer/gpu/server_key/radix/mod.rs | 97 +++--- tfhe/src/integer/gpu/server_key/radix/mul.rs | 8 +- tfhe/src/integer/gpu/server_key/radix/neg.rs | 2 +- .../integer/gpu/server_key/radix/rotate.rs | 32 +- .../gpu/server_key/radix/scalar_add.rs | 28 +- .../gpu/server_key/radix/scalar_bitwise_op.rs | 6 +- .../gpu/server_key/radix/scalar_comparison.rs | 16 +- .../gpu/server_key/radix/scalar_div_mod.rs | 12 +- .../gpu/server_key/radix/scalar_mul.rs | 2 +- .../gpu/server_key/radix/scalar_rotate.rs | 12 +- .../gpu/server_key/radix/scalar_shift.rs | 20 +- .../gpu/server_key/radix/scalar_sub.rs | 4 +- .../src/integer/gpu/server_key/radix/shift.rs | 32 +- tfhe/src/integer/gpu/server_key/radix/sub.rs | 26 +- .../server_key/radix/vector_comparisons.rs | 24 +- .../gpu/server_key/radix/vector_find.rs | 90 ++---- .../radix_parallel/tests_long_run/mod.rs | 2 +- tfhe/src/shortint/ciphertext/common.rs | 4 +- 48 files changed, 1152 insertions(+), 925 deletions(-) create mode 100644 backends/tfhe-cuda-backend/cuda/include/integer/radix_ciphertext.h create mode 100644 backends/tfhe-cuda-backend/cuda/src/integer/radix_ciphertext.cu diff --git a/backends/tfhe-cuda-backend/cuda/include/device.h b/backends/tfhe-cuda-backend/cuda/include/device.h index 431b725360..6af6472693 100644 --- a/backends/tfhe-cuda-backend/cuda/include/device.h +++ b/backends/tfhe-cuda-backend/cuda/include/device.h @@ -42,6 +42,9 @@ void cuda_destroy_stream(cudaStream_t stream, uint32_t gpu_index); void cuda_synchronize_stream(cudaStream_t stream, uint32_t gpu_index); +void synchronize_streams(cudaStream_t const *streams, + uint32_t const *gpu_indexes, uint32_t gpu_count); + uint32_t cuda_is_available(); void *cuda_malloc(uint64_t size, uint32_t gpu_index); diff --git a/backends/tfhe-cuda-backend/cuda/include/integer/integer.h 
b/backends/tfhe-cuda-backend/cuda/include/integer/integer.h index 68d6cb02dc..f101c381f8 100644 --- a/backends/tfhe-cuda-backend/cuda/include/integer/integer.h +++ b/backends/tfhe-cuda-backend/cuda/include/integer/integer.h @@ -38,6 +38,15 @@ enum SIGNED_OPERATION { ADDITION = 1, SUBTRACTION = -1 }; enum outputFlag { FLAG_NONE = 0, FLAG_OVERFLOW = 1, FLAG_CARRY = 2 }; extern "C" { + +typedef struct { + void *ptr; + uint64_t *degrees; + uint64_t *noise_levels; + uint32_t num_radix_blocks; + uint32_t lwe_dimension; +} CudaRadixCiphertextFFI; + void scratch_cuda_apply_univariate_lut_kb_64( void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count, int8_t **mem_ptr, void const *input_lut, uint32_t lwe_dimension, @@ -258,9 +267,11 @@ void scratch_cuda_integer_radix_cmux_kb_64( void cuda_cmux_integer_radix_ciphertext_kb_64( void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count, - void *lwe_array_out, void const *lwe_condition, void const *lwe_array_true, - void const *lwe_array_false, int8_t *mem_ptr, void *const *bsks, - void *const *ksks, uint32_t lwe_ciphertext_count); + CudaRadixCiphertextFFI *lwe_array_out, + CudaRadixCiphertextFFI const *lwe_condition, + CudaRadixCiphertextFFI const *lwe_array_true, + CudaRadixCiphertextFFI const *lwe_array_false, int8_t *mem_ptr, + void *const *bsks, void *const *ksks); void cleanup_cuda_integer_radix_cmux(void *const *streams, uint32_t const *gpu_indexes, @@ -439,8 +450,8 @@ void scratch_cuda_integer_abs_inplace_radix_ciphertext_kb_64( void cuda_integer_abs_inplace_radix_ciphertext_kb_64( void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count, - void *ct, int8_t *mem_ptr, bool is_signed, void *const *bsks, - void *const *ksks, uint32_t num_blocks); + CudaRadixCiphertextFFI *ct, int8_t *mem_ptr, bool is_signed, + void *const *bsks, void *const *ksks); void cleanup_cuda_integer_abs_inplace(void *const *streams, uint32_t const *gpu_indexes,
diff --git a/backends/tfhe-cuda-backend/cuda/include/integer/integer_utilities.h b/backends/tfhe-cuda-backend/cuda/include/integer/integer_utilities.h index 186b8c7ce2..b20c8fc49e 100644 --- a/backends/tfhe-cuda-backend/cuda/include/integer/integer_utilities.h +++ b/backends/tfhe-cuda-backend/cuda/include/integer/integer_utilities.h @@ -4,6 +4,7 @@ #include "integer.h" #include "keyswitch.h" #include "pbs/programmable_bootstrap.cuh" +#include "radix_ciphertext.h" #include #include #include @@ -2963,9 +2964,9 @@ template struct int_cmux_buffer { int_radix_lut *predicate_lut; int_radix_lut *message_extract_lut; - Torus *buffer_in; - Torus *buffer_out; - Torus *condition_array; + CudaRadixCiphertextFFI *buffer_in = new CudaRadixCiphertextFFI; + CudaRadixCiphertextFFI *buffer_out = new CudaRadixCiphertextFFI; + CudaRadixCiphertextFFI *condition_array = new CudaRadixCiphertextFFI; + int_radix_params params; @@ -2978,15 +2979,18 @@ template struct int_cmux_buffer { this->params = params; if (allocate_gpu_memory) { - Torus big_size = - (params.big_lwe_dimension + 1) * num_radix_blocks * sizeof(Torus); - - buffer_in = - (Torus *)cuda_malloc_async(2 * big_size, streams[0], gpu_indexes[0]); - buffer_out = - (Torus *)cuda_malloc_async(2 * big_size, streams[0], gpu_indexes[0]); - condition_array = - (Torus *)cuda_malloc_async(2 * big_size, streams[0], gpu_indexes[0]); + create_trivial_radix_ciphertext_async( + streams[0], gpu_indexes[0], buffer_in, 2 * num_radix_blocks, + params.big_lwe_dimension); + create_trivial_radix_ciphertext_async( + streams[0], gpu_indexes[0], buffer_out, 2 * num_radix_blocks, + params.big_lwe_dimension); + create_trivial_radix_ciphertext_async( + streams[0], gpu_indexes[0], condition_array, 2 * num_radix_blocks, + params.big_lwe_dimension); auto lut_f = [predicate_lut_f](Torus block, Torus condition) -> Torus { return predicate_lut_f(condition) ? 0 : block; @@ -3047,9 +3051,12 @@ template struct int_cmux_buffer { message_extract_lut->release(streams, gpu_indexes, gpu_count); delete message_extract_lut; - cuda_drop_async(buffer_in, streams[0], gpu_indexes[0]); - cuda_drop_async(buffer_out, streams[0], gpu_indexes[0]); - cuda_drop_async(condition_array, streams[0], gpu_indexes[0]); + release_radix_ciphertext_data(streams[0], gpu_indexes[0], buffer_in); + delete buffer_in; + release_radix_ciphertext_data(streams[0], gpu_indexes[0], buffer_out); + delete buffer_out; + release_radix_ciphertext_data(streams[0], gpu_indexes[0], condition_array); + delete condition_array; } }; @@ -4351,7 +4358,7 @@ template struct int_abs_buffer { int_sc_prop_memory *scp_mem; int_bitop_buffer *bitxor_mem; - Torus *mask; + CudaRadixCiphertextFFI *mask = new CudaRadixCiphertextFFI; int_abs_buffer(cudaStream_t const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count, int_radix_params params, uint32_t num_radix_blocks, bool allocate_gpu_memory) { @@ -4372,11 +4379,9 @@ template struct int_abs_buffer { streams, gpu_indexes, gpu_count, BITOP_TYPE::BITXOR, params, num_radix_blocks, allocate_gpu_memory); - uint32_t lwe_size = params.big_lwe_dimension + 1; - uint32_t lwe_size_bytes = lwe_size * sizeof(Torus); - - mask = (Torus *)cuda_malloc_async(num_radix_blocks * lwe_size_bytes, - streams[0], gpu_indexes[0]); + create_trivial_radix_ciphertext_async(streams[0], gpu_indexes[0], + mask, num_radix_blocks, + params.big_lwe_dimension); } }
diff --git a/backends/tfhe-cuda-backend/cuda/include/integer/radix_ciphertext.h b/backends/tfhe-cuda-backend/cuda/include/integer/radix_ciphertext.h new file mode 100644 index 0000000000..f42a06b350 --- /dev/null +++ b/backends/tfhe-cuda-backend/cuda/include/integer/radix_ciphertext.h @@ -0,0 +1,31 @@ +#ifndef CUDA_RADIX_CIPHERTEXT_H +#define CUDA_RADIX_CIPHERTEXT_H + +#include "device.h" +#include "integer.h" + +template +void create_trivial_radix_ciphertext_async(cudaStream_t const stream, + uint32_t const gpu_index, + CudaRadixCiphertextFFI *output_radix, + uint32_t num_radix_blocks, + uint32_t lwe_dimension); + +// end_lwe_index is inclusive +template +void as_radix_ciphertext_slice(CudaRadixCiphertextFFI *output_radix, + const CudaRadixCiphertextFFI *input_radix, + uint32_t start_lwe_index, + uint32_t end_lwe_index); + +template +void copy_radix_ciphertext_to_larger_output_slice_async( + cudaStream_t const stream, uint32_t const gpu_index, + CudaRadixCiphertextFFI *output_radix, + const CudaRadixCiphertextFFI *input_radix, uint32_t output_start_lwe_index); + +void release_radix_ciphertext_data(cudaStream_t const stream, + uint32_t const gpu_index, + CudaRadixCiphertextFFI *data); + +#endif // CUDA_RADIX_CIPHERTEXT_H diff --git a/backends/tfhe-cuda-backend/cuda/include/linear_algebra.h b/backends/tfhe-cuda-backend/cuda/include/linear_algebra.h index 886a9befd4..7f9ff3164d 100644 --- a/backends/tfhe-cuda-backend/cuda/include/linear_algebra.h +++ b/backends/tfhe-cuda-backend/cuda/include/linear_algebra.h @@ -1,6 +1,7 @@ #ifndef CUDA_LINALG_H_ #define CUDA_LINALG_H_ +#include "integer/integer.h" #include extern "C" { @@ -14,16 +15,13 @@ void cuda_negate_lwe_ciphertext_vector_64( void const *lwe_array_in, const
uint32_t input_lwe_dimension, const uint32_t input_lwe_ciphertext_count); void cuda_add_lwe_ciphertext_vector_32( - void *stream, uint32_t gpu_index, void *lwe_array_out, - void const *lwe_array_in_1, void const *lwe_array_in_2, - const uint32_t input_lwe_dimension, - const uint32_t input_lwe_ciphertext_count); + void *stream, uint32_t gpu_index, CudaRadixCiphertextFFI *lwe_array_out, + CudaRadixCiphertextFFI const *lwe_array_in_1, + CudaRadixCiphertextFFI const *lwe_array_in_2); void cuda_add_lwe_ciphertext_vector_64( - void *stream, uint32_t gpu_index, void *lwe_array_out, - void const *lwe_array_in_1, void const *lwe_array_in_2, - const uint32_t input_lwe_dimension, - const uint32_t input_lwe_ciphertext_count); - + void *stream, uint32_t gpu_index, CudaRadixCiphertextFFI *lwe_array_out, + CudaRadixCiphertextFFI const *lwe_array_in_1, + CudaRadixCiphertextFFI const *lwe_array_in_2); void cuda_add_lwe_ciphertext_vector_plaintext_vector_32( void *stream, uint32_t gpu_index, void *lwe_array_out, void const *lwe_array_in, void const *plaintext_array_in, diff --git a/backends/tfhe-cuda-backend/cuda/src/device.cu b/backends/tfhe-cuda-backend/cuda/src/device.cu index 041e228b3d..78ce8fa96c 100644 --- a/backends/tfhe-cuda-backend/cuda/src/device.cu +++ b/backends/tfhe-cuda-backend/cuda/src/device.cu @@ -45,6 +45,13 @@ void cuda_synchronize_stream(cudaStream_t stream, uint32_t gpu_index) { check_cuda_error(cudaStreamSynchronize(stream)); } +void synchronize_streams(cudaStream_t const *streams, + uint32_t const *gpu_indexes, uint32_t gpu_count) { + for (uint i = 0; i < gpu_count; i++) { + cuda_synchronize_stream(streams[i], gpu_indexes[i]); + } +} + // Determine if a CUDA device is available at runtime uint32_t cuda_is_available() { return cudaSetDevice(0) == cudaSuccess; } diff --git a/backends/tfhe-cuda-backend/cuda/src/integer/abs.cu b/backends/tfhe-cuda-backend/cuda/src/integer/abs.cu index 1f462753ee..821015f251 100644 --- a/backends/tfhe-cuda-backend/cuda/src/integer/abs.cu +++ b/backends/tfhe-cuda-backend/cuda/src/integer/abs.cu @@ -22,15 +22,14 @@ void scratch_cuda_integer_abs_inplace_radix_ciphertext_kb_64( void cuda_integer_abs_inplace_radix_ciphertext_kb_64( void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count, - void *ct, int8_t *mem_ptr, bool is_signed, void *const *bsks, - void *const *ksks, uint32_t num_blocks) { + CudaRadixCiphertextFFI *ct, int8_t *mem_ptr, bool is_signed, + void *const *bsks, void *const *ksks) { auto mem = (int_abs_buffer *)mem_ptr; host_integer_abs_kb((cudaStream_t *)(streams), gpu_indexes, - gpu_count, static_cast(ct), bsks, - (uint64_t **)(ksks), mem, is_signed, - num_blocks); + gpu_count, ct, bsks, (uint64_t **)(ksks), mem, + is_signed); } void cleanup_cuda_integer_abs_inplace(void *const *streams, diff --git a/backends/tfhe-cuda-backend/cuda/src/integer/abs.cuh b/backends/tfhe-cuda-backend/cuda/src/integer/abs.cuh index d9053bbfbd..a60183b69d 100644 --- a/backends/tfhe-cuda-backend/cuda/src/integer/abs.cuh +++ b/backends/tfhe-cuda-backend/cuda/src/integer/abs.cuh @@ -8,6 +8,7 @@ #include "integer/integer.cuh" #include "integer/integer_utilities.h" #include "integer/negation.cuh" +#include "integer/radix_ciphertext.h" #include "integer/scalar_shifts.cuh" #include "linear_algebra.h" #include "pbs/programmable_bootstrap.h" @@ -32,16 +33,15 @@ __host__ void scratch_cuda_integer_abs_kb( } template -__host__ void -host_integer_abs_kb(cudaStream_t const *streams, uint32_t const *gpu_indexes, - uint32_t gpu_count, Torus *ct, void *const *bsks, - 
uint64_t *const *ksks, int_abs_buffer *mem_ptr, - bool is_signed, uint32_t num_blocks) { +__host__ void legacy_host_integer_abs_kb_async( + cudaStream_t const *streams, uint32_t const *gpu_indexes, + uint32_t gpu_count, Torus *ct, void *const *bsks, uint64_t *const *ksks, + int_abs_buffer *mem_ptr, bool is_signed, uint32_t num_blocks) { if (!is_signed) return; auto radix_params = mem_ptr->params; - auto mask = mem_ptr->mask; + auto mask = (Torus *)(mem_ptr->mask->ptr); auto big_lwe_dimension = radix_params.big_lwe_dimension; auto big_lwe_size = big_lwe_dimension + 1; @@ -52,11 +52,11 @@ host_integer_abs_kb(cudaStream_t const *streams, uint32_t const *gpu_indexes, cuda_memcpy_async_gpu_to_gpu(mask, ct, num_blocks * big_lwe_size_bytes, streams[0], gpu_indexes[0]); - host_integer_radix_arithmetic_scalar_shift_kb_inplace( + host_integer_radix_arithmetic_scalar_shift_kb_inplace( streams, gpu_indexes, gpu_count, mask, num_bits_in_ciphertext - 1, mem_ptr->arithmetic_scalar_shift_mem, bsks, ksks, num_blocks); - host_addition(streams[0], gpu_indexes[0], ct, mask, ct, - radix_params.big_lwe_dimension, num_blocks); + legacy_host_addition(streams[0], gpu_indexes[0], ct, mask, ct, + radix_params.big_lwe_dimension, num_blocks); uint32_t requested_flag = outputFlag::FLAG_NONE; uint32_t uses_carry = 0; @@ -64,8 +64,46 @@ host_integer_abs_kb(cudaStream_t const *streams, uint32_t const *gpu_indexes, streams, gpu_indexes, gpu_count, ct, nullptr, nullptr, mem_ptr->scp_mem, bsks, ksks, num_blocks, requested_flag, uses_carry); - host_integer_radix_bitop_kb(streams, gpu_indexes, gpu_count, ct, mask, ct, - mem_ptr->bitxor_mem, bsks, ksks, num_blocks); + host_integer_radix_bitop_kb(streams, gpu_indexes, gpu_count, ct, mask, + ct, mem_ptr->bitxor_mem, bsks, ksks, + num_blocks); +} + +template +__host__ void +host_integer_abs_kb(cudaStream_t const *streams, uint32_t const *gpu_indexes, + uint32_t gpu_count, CudaRadixCiphertextFFI *ct, + void *const *bsks, uint64_t *const *ksks, + int_abs_buffer *mem_ptr, bool is_signed) { + if (!is_signed) + return; + + auto mask = mem_ptr->mask; + + uint32_t num_bits_in_ciphertext = + (31 - __builtin_clz(mem_ptr->params.message_modulus)) * + ct->num_radix_blocks; + + copy_radix_ciphertext_to_larger_output_slice_async( + streams[0], gpu_indexes[0], mask, ct, 0); + + host_integer_radix_arithmetic_scalar_shift_kb_inplace( + streams, gpu_indexes, gpu_count, (Torus *)(mask->ptr), + num_bits_in_ciphertext - 1, mem_ptr->arithmetic_scalar_shift_mem, bsks, + ksks, ct->num_radix_blocks); + host_addition(streams[0], gpu_indexes[0], ct, mask, ct); + + uint32_t requested_flag = outputFlag::FLAG_NONE; + uint32_t uses_carry = 0; + host_propagate_single_carry( + streams, gpu_indexes, gpu_count, (Torus *)(ct->ptr), nullptr, nullptr, + mem_ptr->scp_mem, bsks, ksks, ct->num_radix_blocks, requested_flag, + uses_carry); + + host_integer_radix_bitop_kb(streams, gpu_indexes, gpu_count, + (Torus *)(ct->ptr), (Torus *)(mask->ptr), + (Torus *)(ct->ptr), mem_ptr->bitxor_mem, + bsks, ksks, ct->num_radix_blocks); } #endif // TFHE_RS_ABS_CUH diff --git a/backends/tfhe-cuda-backend/cuda/src/integer/cmux.cu b/backends/tfhe-cuda-backend/cuda/src/integer/cmux.cu index bf2e7eeafe..e51eed1170 100644 --- a/backends/tfhe-cuda-backend/cuda/src/integer/cmux.cu +++ b/backends/tfhe-cuda-backend/cuda/src/integer/cmux.cu @@ -25,19 +25,16 @@ void scratch_cuda_integer_radix_cmux_kb_64( void cuda_cmux_integer_radix_ciphertext_kb_64( void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count, - void *lwe_array_out, 
void const *lwe_condition, void const *lwe_array_true, - void const *lwe_array_false, int8_t *mem_ptr, void *const *bsks, - void *const *ksks, uint32_t lwe_ciphertext_count) { + CudaRadixCiphertextFFI *lwe_array_out, + CudaRadixCiphertextFFI const *lwe_condition, + CudaRadixCiphertextFFI const *lwe_array_true, + CudaRadixCiphertextFFI const *lwe_array_false, int8_t *mem_ptr, + void *const *bsks, void *const *ksks) { host_integer_radix_cmux_kb( - (cudaStream_t *)(streams), gpu_indexes, gpu_count, - static_cast(lwe_array_out), - static_cast(lwe_condition), - static_cast(lwe_array_true), - static_cast(lwe_array_false), - (int_cmux_buffer *)mem_ptr, bsks, (uint64_t **)(ksks), - - lwe_ciphertext_count); + (cudaStream_t *)(streams), gpu_indexes, gpu_count, lwe_array_out, + lwe_condition, lwe_array_true, lwe_array_false, + (int_cmux_buffer *)mem_ptr, bsks, (uint64_t **)(ksks)); } void cleanup_cuda_integer_radix_cmux(void *const *streams, diff --git a/backends/tfhe-cuda-backend/cuda/src/integer/cmux.cuh b/backends/tfhe-cuda-backend/cuda/src/integer/cmux.cuh index b74bdfbe91..cc827d2499 100644 --- a/backends/tfhe-cuda-backend/cuda/src/integer/cmux.cuh +++ b/backends/tfhe-cuda-backend/cuda/src/integer/cmux.cuh @@ -1,6 +1,7 @@ #ifndef CUDA_INTEGER_CMUX_CUH #define CUDA_INTEGER_CMUX_CUH +#include "../include/integer/radix_ciphertext.h" #include "integer.cuh" template @@ -29,7 +30,7 @@ __host__ void zero_out_if(cudaStream_t const *streams, } template -__host__ void host_integer_radix_cmux_kb( +__host__ void legacy_host_integer_radix_cmux_kb( cudaStream_t const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count, Torus *lwe_array_out, Torus const *lwe_condition, Torus const *lwe_array_true, Torus const *lwe_array_false, @@ -39,36 +40,96 @@ __host__ void host_integer_radix_cmux_kb( auto params = mem_ptr->params; Torus lwe_size = params.big_lwe_dimension + 1; Torus radix_lwe_size = lwe_size * num_radix_blocks; - cuda_memcpy_async_gpu_to_gpu(mem_ptr->buffer_in, lwe_array_true, + cuda_memcpy_async_gpu_to_gpu(mem_ptr->buffer_in->ptr, lwe_array_true, radix_lwe_size * sizeof(Torus), streams[0], gpu_indexes[0]); - cuda_memcpy_async_gpu_to_gpu(mem_ptr->buffer_in + radix_lwe_size, - lwe_array_false, radix_lwe_size * sizeof(Torus), - streams[0], gpu_indexes[0]); + cuda_memcpy_async_gpu_to_gpu( + (Torus *)(mem_ptr->buffer_in->ptr) + radix_lwe_size, lwe_array_false, + radix_lwe_size * sizeof(Torus), streams[0], gpu_indexes[0]); for (uint i = 0; i < 2 * num_radix_blocks; i++) { - cuda_memcpy_async_gpu_to_gpu(mem_ptr->condition_array + i * lwe_size, - lwe_condition, lwe_size * sizeof(Torus), - streams[0], gpu_indexes[0]); + cuda_memcpy_async_gpu_to_gpu( + (Torus *)(mem_ptr->condition_array->ptr) + i * lwe_size, lwe_condition, + lwe_size * sizeof(Torus), streams[0], gpu_indexes[0]); } integer_radix_apply_bivariate_lookup_table_kb( - streams, gpu_indexes, gpu_count, mem_ptr->buffer_out, mem_ptr->buffer_in, - mem_ptr->condition_array, bsks, ksks, 2 * num_radix_blocks, - mem_ptr->predicate_lut, params.message_modulus); + streams, gpu_indexes, gpu_count, (Torus *)(mem_ptr->buffer_out->ptr), + (Torus *)(mem_ptr->buffer_in->ptr), + (Torus *)(mem_ptr->condition_array->ptr), bsks, ksks, + 2 * num_radix_blocks, mem_ptr->predicate_lut, params.message_modulus); // If the condition was true, true_ct will have kept its value and false_ct // will be 0 If the condition was false, true_ct will be 0 and false_ct will // have kept its value - auto mem_true = mem_ptr->buffer_out; - auto mem_false = 
&mem_ptr->buffer_out[radix_lwe_size]; + auto mem_true = (Torus *)(mem_ptr->buffer_out->ptr); + auto ptr = (Torus *)mem_ptr->buffer_out->ptr; + auto mem_false = &ptr[radix_lwe_size]; auto added_cts = mem_true; - host_addition(streams[0], gpu_indexes[0], added_cts, mem_true, - mem_false, params.big_lwe_dimension, num_radix_blocks); + legacy_host_addition(streams[0], gpu_indexes[0], added_cts, mem_true, + mem_false, params.big_lwe_dimension, + num_radix_blocks); integer_radix_apply_univariate_lookup_table_kb( streams, gpu_indexes, gpu_count, lwe_array_out, added_cts, bsks, ksks, num_radix_blocks, mem_ptr->message_extract_lut); } +template +__host__ void host_integer_radix_cmux_kb( + cudaStream_t const *streams, uint32_t const *gpu_indexes, + uint32_t gpu_count, CudaRadixCiphertextFFI *lwe_array_out, + CudaRadixCiphertextFFI const *lwe_condition, + CudaRadixCiphertextFFI const *lwe_array_true, + CudaRadixCiphertextFFI const *lwe_array_false, + int_cmux_buffer *mem_ptr, void *const *bsks, Torus *const *ksks) { + + if (lwe_array_out->num_radix_blocks != lwe_array_true->num_radix_blocks) + PANIC("Cuda error: input and output num radix blocks must be the same") + if (lwe_array_out->num_radix_blocks != lwe_array_false->num_radix_blocks) + PANIC("Cuda error: input and output num radix blocks must be the same") + + auto num_radix_blocks = lwe_array_out->num_radix_blocks; + auto params = mem_ptr->params; + Torus lwe_size = params.big_lwe_dimension + 1; + Torus radix_lwe_size = lwe_size * num_radix_blocks; + copy_radix_ciphertext_to_larger_output_slice_async( + streams[0], gpu_indexes[0], mem_ptr->buffer_in, lwe_array_true, 0); + copy_radix_ciphertext_to_larger_output_slice_async( + streams[0], gpu_indexes[0], mem_ptr->buffer_in, lwe_array_false, + num_radix_blocks); + for (uint i = 0; i < 2 * num_radix_blocks; i++) { + cuda_memcpy_async_gpu_to_gpu( + (Torus *)(mem_ptr->condition_array->ptr) + i * lwe_size, + (Torus *)(lwe_condition->ptr), lwe_size * sizeof(Torus), streams[0], + gpu_indexes[0]); + } + integer_radix_apply_bivariate_lookup_table_kb( + streams, gpu_indexes, gpu_count, (Torus *)(mem_ptr->buffer_out->ptr), + (Torus *)(mem_ptr->buffer_in->ptr), + (Torus *)(mem_ptr->condition_array->ptr), bsks, ksks, + 2 * num_radix_blocks, mem_ptr->predicate_lut, params.message_modulus); + + // If the condition was true, true_ct will have kept its value and false_ct + // will be 0 If the condition was false, true_ct will be 0 and false_ct will + // have kept its value + CudaRadixCiphertextFFI *mem_true = new CudaRadixCiphertextFFI; + CudaRadixCiphertextFFI *mem_false = new CudaRadixCiphertextFFI; + as_radix_ciphertext_slice(mem_true, mem_ptr->buffer_out, 0, + num_radix_blocks - 1); + as_radix_ciphertext_slice(mem_false, mem_ptr->buffer_out, + num_radix_blocks, 2 * num_radix_blocks - 1); + + auto added_cts = mem_true; + host_addition(streams[0], gpu_indexes[0], added_cts, mem_true, + mem_false); + + integer_radix_apply_univariate_lookup_table_kb( + streams, gpu_indexes, gpu_count, (Torus *)(lwe_array_out->ptr), + (Torus *)(added_cts->ptr), bsks, ksks, num_radix_blocks, + mem_ptr->message_extract_lut); + delete mem_true; + delete mem_false; +} + template __host__ void scratch_cuda_integer_radix_cmux_kb( cudaStream_t const *streams, uint32_t const *gpu_indexes, diff --git a/backends/tfhe-cuda-backend/cuda/src/integer/comparison.cuh b/backends/tfhe-cuda-backend/cuda/src/integer/comparison.cuh index 3f82be99d6..ed311f6389 100644 --- a/backends/tfhe-cuda-backend/cuda/src/integer/comparison.cuh +++ 
b/backends/tfhe-cuda-backend/cuda/src/integer/comparison.cuh @@ -620,7 +620,7 @@ __host__ void host_integer_radix_maxmin_kb( ksks, total_num_radix_blocks); // Selector - host_integer_radix_cmux_kb( + legacy_host_integer_radix_cmux_kb( streams, gpu_indexes, gpu_count, lwe_array_out, mem_ptr->tmp_lwe_array_out, lwe_array_left, lwe_array_right, mem_ptr->cmux_buffer, bsks, ksks, total_num_radix_blocks); diff --git a/backends/tfhe-cuda-backend/cuda/src/integer/div_rem.cuh b/backends/tfhe-cuda-backend/cuda/src/integer/div_rem.cuh index ab574705f6..b080c5edf7 100644 --- a/backends/tfhe-cuda-backend/cuda/src/integer/div_rem.cuh +++ b/backends/tfhe-cuda-backend/cuda/src/integer/div_rem.cuh @@ -402,7 +402,7 @@ __host__ void host_unsigned_integer_div_rem_kb( // but in that position, interesting_remainder2 always has a 0 auto &merged_interesting_remainder = interesting_remainder1; - host_addition( + legacy_host_addition( streams[0], gpu_indexes[0], merged_interesting_remainder.data, merged_interesting_remainder.data, interesting_remainder2.data, radix_params.big_lwe_dimension, merged_interesting_remainder.len); @@ -507,10 +507,10 @@ __host__ void host_unsigned_integer_div_rem_kb( cuda_synchronize_stream(mem_ptr->sub_streams_3[j], gpu_indexes[j]); } - host_addition(streams[0], gpu_indexes[0], overflow_sum.data, - subtraction_overflowed.data, - at_least_one_upper_block_is_non_zero.data, - radix_params.big_lwe_dimension, 1); + legacy_host_addition(streams[0], gpu_indexes[0], overflow_sum.data, + subtraction_overflowed.data, + at_least_one_upper_block_is_non_zero.data, + radix_params.big_lwe_dimension, 1); int factor = (i) ? 3 : 2; int factor_lut_id = factor - 2; @@ -552,7 +552,7 @@ __host__ void host_unsigned_integer_div_rem_kb( mem_ptr->merge_overflow_flags_luts[pos_in_block] ->params.message_modulus); - host_addition( + legacy_host_addition( streams[0], gpu_indexes[0], "ient[block_of_bit * big_lwe_size], "ient[block_of_bit * big_lwe_size], did_not_overflow.data, radix_params.big_lwe_dimension, 1); @@ -588,9 +588,9 @@ __host__ void host_unsigned_integer_div_rem_kb( // Clean the quotient and remainder // as even though they have no carries, they are not at nominal noise level - host_addition(streams[0], gpu_indexes[0], remainder, remainder1.data, - remainder2.data, radix_params.big_lwe_dimension, - remainder1.len); + legacy_host_addition(streams[0], gpu_indexes[0], remainder, + remainder1.data, remainder2.data, + radix_params.big_lwe_dimension, remainder1.len); for (uint j = 0; j < gpu_count; j++) { cuda_synchronize_stream(streams[j], gpu_indexes[j]); @@ -636,12 +636,14 @@ __host__ void host_integer_div_rem_kb(cudaStream_t const *streams, cuda_synchronize_stream(streams[j], gpu_indexes[j]); } - host_integer_abs_kb(int_mem_ptr->sub_streams_1, gpu_indexes, - gpu_count, positive_numerator.data, bsks, ksks, - int_mem_ptr->abs_mem_1, true, num_blocks); - host_integer_abs_kb(int_mem_ptr->sub_streams_2, gpu_indexes, - gpu_count, positive_divisor.data, bsks, ksks, - int_mem_ptr->abs_mem_2, true, num_blocks); + legacy_host_integer_abs_kb_async( + int_mem_ptr->sub_streams_1, gpu_indexes, gpu_count, + positive_numerator.data, bsks, ksks, int_mem_ptr->abs_mem_1, true, + num_blocks); + legacy_host_integer_abs_kb_async( + int_mem_ptr->sub_streams_2, gpu_indexes, gpu_count, + positive_divisor.data, bsks, ksks, int_mem_ptr->abs_mem_2, true, + num_blocks); for (uint j = 0; j < int_mem_ptr->active_gpu_count; j++) { cuda_synchronize_stream(int_mem_ptr->sub_streams_1[j], gpu_indexes[j]); 
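// Illustrative aside (not part of this change, added for clarity): the
// per-GPU synchronization loops in this function follow the same pattern that
// the new synchronize_streams helper declared in device.h factors out. Under
// the assumption that int_mem_ptr->active_gpu_count is the intended stream
// count, each loop could be collapsed to a single call, e.g.:
//   synchronize_streams(int_mem_ptr->sub_streams_1, gpu_indexes,
//                       int_mem_ptr->active_gpu_count);
//   synchronize_streams(int_mem_ptr->sub_streams_2, gpu_indexes,
//                       int_mem_ptr->active_gpu_count);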
cuda_synchronize_stream(int_mem_ptr->sub_streams_2[j], gpu_indexes[j]); @@ -689,12 +691,12 @@ __host__ void host_integer_div_rem_kb(cudaStream_t const *streams, int_mem_ptr->scp_mem_2, bsks, ksks, num_blocks, requested_flag, uses_carry); - host_integer_radix_cmux_kb( + legacy_host_integer_radix_cmux_kb( int_mem_ptr->sub_streams_1, gpu_indexes, gpu_count, quotient, int_mem_ptr->sign_bits_are_different, int_mem_ptr->negated_quotient, quotient, int_mem_ptr->cmux_quotient_mem, bsks, ksks, num_blocks); - host_integer_radix_cmux_kb( + legacy_host_integer_radix_cmux_kb( int_mem_ptr->sub_streams_2, gpu_indexes, gpu_count, remainder, &numerator[big_lwe_size * (num_blocks - 1)], int_mem_ptr->negated_remainder, remainder, diff --git a/backends/tfhe-cuda-backend/cuda/src/integer/integer.cuh b/backends/tfhe-cuda-backend/cuda/src/integer/integer.cuh index f6e53b5050..e85877211b 100644 --- a/backends/tfhe-cuda-backend/cuda/src/integer/integer.cuh +++ b/backends/tfhe-cuda-backend/cuda/src/integer/integer.cuh @@ -1163,64 +1163,6 @@ void host_compute_shifted_blocks_and_borrow_states( big_lwe_size_bytes * num_radix_blocks, streams[0], gpu_indexes[0]); } -template -void host_legacy_propagate_single_carry(cudaStream_t const *streams, - uint32_t const *gpu_indexes, - uint32_t gpu_count, Torus *lwe_array, - Torus *carry_out, Torus *input_carries, - int_legacy_sc_prop_memory *mem, - void *const *bsks, Torus *const *ksks, - uint32_t num_blocks) { - auto params = mem->params; - if (params.message_modulus == 2) - PANIC("Cuda error: single carry propagation is not supported for 1 bit " - "messages") - auto glwe_dimension = params.glwe_dimension; - auto polynomial_size = params.polynomial_size; - auto big_lwe_size = glwe_dimension * polynomial_size + 1; - auto big_lwe_size_bytes = big_lwe_size * sizeof(Torus); - - auto generates_or_propagates = mem->generates_or_propagates; - auto step_output = mem->step_output; - - auto luts_array = mem->luts_array; - auto luts_carry_propagation_sum = mem->luts_carry_propagation_sum; - auto message_acc = mem->message_acc; - - integer_radix_apply_univariate_lookup_table_kb( - streams, gpu_indexes, gpu_count, generates_or_propagates, lwe_array, bsks, - ksks, num_blocks, luts_array); - - // compute prefix sum with hillis&steele - host_compute_prefix_sum_hillis_steele( - streams, gpu_indexes, gpu_count, step_output, generates_or_propagates, - params, luts_carry_propagation_sum, bsks, ksks, num_blocks); - - host_radix_blocks_rotate_right(streams, gpu_indexes, gpu_count, - step_output, generates_or_propagates, 1, - num_blocks, big_lwe_size); - if (carry_out != nullptr) { - cuda_memcpy_async_gpu_to_gpu(carry_out, step_output, big_lwe_size_bytes, - streams[0], gpu_indexes[0]); - } - cuda_memset_async(step_output, 0, big_lwe_size_bytes, streams[0], - gpu_indexes[0]); - - if (input_carries != nullptr) { - cuda_memcpy_async_gpu_to_gpu((void *)input_carries, step_output, - big_lwe_size_bytes * num_blocks, streams[0], - gpu_indexes[0]); - } - - host_addition(streams[0], gpu_indexes[0], lwe_array, lwe_array, - step_output, glwe_dimension * polynomial_size, - num_blocks); - - integer_radix_apply_univariate_lookup_table_kb( - streams, gpu_indexes, gpu_count, lwe_array, lwe_array, bsks, ksks, - num_blocks, message_acc); -} - template void host_generate_last_block_inner_propagation( cudaStream_t const *streams, uint32_t const *gpu_indexes, @@ -1336,10 +1278,10 @@ void host_full_propagate_inplace(cudaStream_t const *streams, if (i < num_blocks - 1) { auto next_input_block = &input_blocks[(i + 1) * 
big_lwe_size]; - host_addition(streams[0], gpu_indexes[0], next_input_block, - (Torus const *)next_input_block, - &mem_ptr->tmp_big_lwe_vector[big_lwe_size], - params.big_lwe_dimension, 1); + legacy_host_addition(streams[0], gpu_indexes[0], next_input_block, + (Torus const *)next_input_block, + &mem_ptr->tmp_big_lwe_vector[big_lwe_size], + params.big_lwe_dimension, 1); } } } @@ -1703,8 +1645,8 @@ void host_propagate_single_carry(cudaStream_t const *streams, PANIC("Cuda error: single carry propagation is not supported for overflow, " "try using add_and_propagate_single_carry"); if (uses_carry == 1) { - host_addition(streams[0], gpu_indexes[0], lwe_array, lwe_array, - input_carries, big_lwe_dimension, 1); + legacy_host_addition(streams[0], gpu_indexes[0], lwe_array, + lwe_array, input_carries, big_lwe_dimension, 1); } // Step 1 host_compute_shifted_blocks_and_states( @@ -1728,17 +1670,18 @@ void host_propagate_single_carry(cudaStream_t const *streams, auto prepared_blocks = mem->prop_simu_group_carries_mem->prepared_blocks; auto shifted_blocks = mem->shifted_blocks_state_mem->shifted_blocks; - host_addition(streams[0], gpu_indexes[0], prepared_blocks, - shifted_blocks, - mem->prop_simu_group_carries_mem->simulators, - big_lwe_dimension, num_radix_blocks); + legacy_host_addition(streams[0], gpu_indexes[0], prepared_blocks, + shifted_blocks, + mem->prop_simu_group_carries_mem->simulators, + big_lwe_dimension, num_radix_blocks); if (requested_flag == outputFlag::FLAG_OVERFLOW || requested_flag == outputFlag::FLAG_CARRY) { - host_addition(streams[0], gpu_indexes[0], output_flag, output_flag, - mem->prop_simu_group_carries_mem->simulators + - (num_radix_blocks - 1) * big_lwe_size, - big_lwe_dimension, 1); + legacy_host_addition(streams[0], gpu_indexes[0], output_flag, + output_flag, + mem->prop_simu_group_carries_mem->simulators + + (num_radix_blocks - 1) * big_lwe_size, + big_lwe_dimension, 1); } host_radix_sum_in_groups( @@ -1746,10 +1689,11 @@ void host_propagate_single_carry(cudaStream_t const *streams, mem->prop_simu_group_carries_mem->resolved_carries, num_radix_blocks, big_lwe_size, group_size); if (requested_flag == outputFlag::FLAG_CARRY) { - host_addition(streams[0], gpu_indexes[0], output_flag, output_flag, - mem->prop_simu_group_carries_mem->resolved_carries + - (mem->num_groups - 1) * big_lwe_size, - big_lwe_dimension, 1); + legacy_host_addition( + streams[0], gpu_indexes[0], output_flag, output_flag, + mem->prop_simu_group_carries_mem->resolved_carries + + (mem->num_groups - 1) * big_lwe_size, + big_lwe_dimension, 1); cuda_memcpy_async_gpu_to_gpu( prepared_blocks + num_radix_blocks * big_lwe_size, output_flag, @@ -1800,12 +1744,12 @@ void host_add_and_propagate_single_carry( big_lwe_size_bytes, streams[0], gpu_indexes[0]); } - host_addition(streams[0], gpu_indexes[0], lhs_array, lhs_array, - rhs_array, big_lwe_dimension, num_radix_blocks); + legacy_host_addition(streams[0], gpu_indexes[0], lhs_array, lhs_array, + rhs_array, big_lwe_dimension, num_radix_blocks); if (uses_carry == 1) { - host_addition(streams[0], gpu_indexes[0], lhs_array, lhs_array, - input_carries, big_lwe_dimension, 1); + legacy_host_addition(streams[0], gpu_indexes[0], lhs_array, + lhs_array, input_carries, big_lwe_dimension, 1); } // Step 1 host_compute_shifted_blocks_and_states( @@ -1835,17 +1779,18 @@ void host_add_and_propagate_single_carry( auto prepared_blocks = mem->prop_simu_group_carries_mem->prepared_blocks; auto shifted_blocks = mem->shifted_blocks_state_mem->shifted_blocks; - host_addition(streams[0], 
gpu_indexes[0], prepared_blocks, - shifted_blocks, - mem->prop_simu_group_carries_mem->simulators, - big_lwe_dimension, num_radix_blocks); + legacy_host_addition(streams[0], gpu_indexes[0], prepared_blocks, + shifted_blocks, + mem->prop_simu_group_carries_mem->simulators, + big_lwe_dimension, num_radix_blocks); if (requested_flag == outputFlag::FLAG_OVERFLOW || requested_flag == outputFlag::FLAG_CARRY) { - host_addition(streams[0], gpu_indexes[0], output_flag, output_flag, - mem->prop_simu_group_carries_mem->simulators + - (num_radix_blocks - 1) * big_lwe_size, - big_lwe_dimension, 1); + legacy_host_addition(streams[0], gpu_indexes[0], output_flag, + output_flag, + mem->prop_simu_group_carries_mem->simulators + + (num_radix_blocks - 1) * big_lwe_size, + big_lwe_dimension, 1); } // Step 3 @@ -1859,15 +1804,17 @@ void host_add_and_propagate_single_carry( requested_flag == outputFlag::FLAG_CARRY) { if (num_radix_blocks == 1 && requested_flag == outputFlag::FLAG_OVERFLOW && uses_carry == 1) { - host_addition(streams[0], gpu_indexes[0], output_flag, output_flag, - input_carries, big_lwe_dimension, 1); + legacy_host_addition(streams[0], gpu_indexes[0], output_flag, + output_flag, input_carries, big_lwe_dimension, + 1); } else { - host_addition(streams[0], gpu_indexes[0], output_flag, output_flag, - mem->prop_simu_group_carries_mem->resolved_carries + - (mem->num_groups - 1) * big_lwe_size, - big_lwe_dimension, 1); + legacy_host_addition( + streams[0], gpu_indexes[0], output_flag, output_flag, + mem->prop_simu_group_carries_mem->resolved_carries + + (mem->num_groups - 1) * big_lwe_size, + big_lwe_dimension, 1); } cuda_memcpy_async_gpu_to_gpu( prepared_blocks + num_radix_blocks * big_lwe_size, output_flag, @@ -1960,11 +1907,11 @@ void host_single_borrow_propagate( num_radix_blocks, message_modulus, carry_modulus); if (compute_overflow == outputFlag::FLAG_OVERFLOW) { - host_addition(streams[0], gpu_indexes[0], mem->overflow_block, - mem->overflow_block, - mem->prop_simu_group_carries_mem->simulators + - (num_radix_blocks - 1) * big_lwe_size, - big_lwe_dimension, 1); + legacy_host_addition(streams[0], gpu_indexes[0], mem->overflow_block, + mem->overflow_block, + mem->prop_simu_group_carries_mem->simulators + + (num_radix_blocks - 1) * big_lwe_size, + big_lwe_dimension, 1); } auto resolved_borrows = mem->prop_simu_group_carries_mem->resolved_carries; @@ -1972,10 +1919,10 @@ void host_single_borrow_propagate( // This needs to be done before because in next step we modify the resolved // borrows if (compute_overflow == outputFlag::FLAG_OVERFLOW) { - host_addition(streams[0], gpu_indexes[0], mem->overflow_block, - mem->overflow_block, - resolved_borrows + (num_groups - 1) * big_lwe_size, - big_lwe_dimension, 1); + legacy_host_addition( + streams[0], gpu_indexes[0], mem->overflow_block, mem->overflow_block, + resolved_borrows + (num_groups - 1) * big_lwe_size, big_lwe_dimension, + 1); } cuda_event_record(mem->incoming_events[0], streams[0], gpu_indexes[0]); diff --git a/backends/tfhe-cuda-backend/cuda/src/integer/multiplication.cuh b/backends/tfhe-cuda-backend/cuda/src/integer/multiplication.cuh index 590b41e101..a828f70b62 100644 --- a/backends/tfhe-cuda-backend/cuda/src/integer/multiplication.cuh +++ b/backends/tfhe-cuda-backend/cuda/src/integer/multiplication.cuh @@ -228,9 +228,9 @@ __host__ void host_integer_partial_sum_ciphertexts_vec_kb( streams[0], gpu_indexes[0]); } if (num_radix_in_vec == 2) { - host_addition(streams[0], gpu_indexes[0], radix_lwe_out, old_blocks, - &old_blocks[num_blocks * 
big_lwe_size], - big_lwe_dimension, num_blocks); + legacy_host_addition( + streams[0], gpu_indexes[0], radix_lwe_out, old_blocks, + &old_blocks[num_blocks * big_lwe_size], big_lwe_dimension, num_blocks); return; } @@ -445,9 +445,9 @@ __host__ void host_integer_partial_sum_ciphertexts_vec_kb( luts_message_carry->release(streams, gpu_indexes, gpu_count); delete (luts_message_carry); - host_addition(streams[0], gpu_indexes[0], radix_lwe_out, old_blocks, - &old_blocks[num_blocks * big_lwe_size], - big_lwe_dimension, num_blocks); + legacy_host_addition( + streams[0], gpu_indexes[0], radix_lwe_out, old_blocks, + &old_blocks[num_blocks * big_lwe_size], big_lwe_dimension, num_blocks); } template
diff --git a/backends/tfhe-cuda-backend/cuda/src/integer/radix_ciphertext.cu b/backends/tfhe-cuda-backend/cuda/src/integer/radix_ciphertext.cu new file mode 100644 index 0000000000..f8c072f68f --- /dev/null +++ b/backends/tfhe-cuda-backend/cuda/src/integer/radix_ciphertext.cu @@ -0,0 +1,87 @@ +#include "integer/radix_ciphertext.h" + +template +void create_trivial_radix_ciphertext_async(cudaStream_t const stream, + uint32_t const gpu_index, + CudaRadixCiphertextFFI *output_radix, + uint32_t num_radix_blocks, + uint32_t lwe_dimension) { + uint32_t lwe_size_bytes = (lwe_dimension + 1) * sizeof(Torus); + output_radix->ptr = (void *)cuda_malloc_async( + num_radix_blocks * lwe_size_bytes, stream, gpu_index); + output_radix->degrees = (Torus *)(malloc(num_radix_blocks * sizeof(Torus))); + output_radix->noise_levels = + (Torus *)(malloc(num_radix_blocks * sizeof(Torus))); + for (uint i = 0; i < num_radix_blocks; i++) { + output_radix->degrees[i] = 0; + output_radix->noise_levels[i] = 0; + } + output_radix->lwe_dimension = lwe_dimension; + output_radix->num_radix_blocks = num_radix_blocks; +} + +// end_lwe_index is inclusive +template +void as_radix_ciphertext_slice(CudaRadixCiphertextFFI *output_radix, + const CudaRadixCiphertextFFI *input_radix, + uint32_t start_lwe_index, + uint32_t end_lwe_index) { + if (input_radix->num_radix_blocks < end_lwe_index - start_lwe_index + 1) + PANIC("Cuda error: input radix should have more blocks than the specified " + "range") + if (start_lwe_index > end_lwe_index) + PANIC("Cuda error: slice range should be strictly positive") + + auto lwe_size = input_radix->lwe_dimension + 1; + output_radix->num_radix_blocks = end_lwe_index - start_lwe_index + 1; + output_radix->lwe_dimension = input_radix->lwe_dimension; + Torus *in_ptr = (Torus *)input_radix->ptr; + output_radix->ptr = (void *)(&in_ptr[start_lwe_index * lwe_size]); + // the slice keeps its own copy of the per-block metadata + output_radix->degrees = + (Torus *)(malloc(output_radix->num_radix_blocks * sizeof(Torus))); + output_radix->noise_levels = + (Torus *)(malloc(output_radix->num_radix_blocks * sizeof(Torus))); + for (uint i = 0; i < output_radix->num_radix_blocks; i++) { + output_radix->degrees[i] = input_radix->degrees[i + start_lwe_index]; + output_radix->noise_levels[i] = + input_radix->noise_levels[i + start_lwe_index]; + } +} + +template +void copy_radix_ciphertext_to_larger_output_slice_async( + cudaStream_t const stream, uint32_t const gpu_index, + CudaRadixCiphertextFFI *output_radix, + const CudaRadixCiphertextFFI *input_radix, + uint32_t output_start_lwe_index) { + if (output_radix->lwe_dimension != input_radix->lwe_dimension) + PANIC("Cuda error: input lwe dimension should be equal to output lwe " + "dimension") + if (output_radix->num_radix_blocks - output_start_lwe_index < + input_radix->num_radix_blocks) + PANIC("Cuda error: input radix should not have more blocks than " + "the output range") + if (output_start_lwe_index >= output_radix->num_radix_blocks) + PANIC("Cuda error: output index should be strictly
smaller than the number " + "of blocks") + + auto lwe_size = input_radix->lwe_dimension + 1; + Torus *out_ptr = (Torus *)output_radix->ptr; + out_ptr = &out_ptr[output_start_lwe_index * lwe_size]; + + cuda_memcpy_async_gpu_to_gpu(out_ptr, input_radix->ptr, + input_radix->num_radix_blocks * + (input_radix->lwe_dimension + 1) * + sizeof(Torus), + stream, gpu_index); + for (uint i = 0; i < input_radix->num_radix_blocks; i++) { + output_radix->degrees[i + output_start_lwe_index] = input_radix->degrees[i]; + output_radix->noise_levels[i + output_start_lwe_index] = + input_radix->noise_levels[i]; + } +} + +void release_radix_ciphertext_data(cudaStream_t const stream, + uint32_t const gpu_index, + CudaRadixCiphertextFFI *data) { + cuda_drop_async(data->ptr, stream, gpu_index); + free(data->degrees); + free(data->noise_levels); +} diff --git a/backends/tfhe-cuda-backend/cuda/src/integer/scalar_comparison.cuh b/backends/tfhe-cuda-backend/cuda/src/integer/scalar_comparison.cuh index 4b79a24cec..e70489bf2a 100644 --- a/backends/tfhe-cuda-backend/cuda/src/integer/scalar_comparison.cuh +++ b/backends/tfhe-cuda-backend/cuda/src/integer/scalar_comparison.cuh @@ -540,10 +540,10 @@ __host__ void integer_radix_signed_scalar_maxmin_kb( // Selector // CMUX for Max or Min - host_integer_radix_cmux_kb(streams, gpu_indexes, gpu_count, - lwe_array_out, sign, lwe_array_left, - lwe_array_right, mem_ptr->cmux_buffer, bsks, - ksks, total_num_radix_blocks); + legacy_host_integer_radix_cmux_kb( + streams, gpu_indexes, gpu_count, lwe_array_out, sign, lwe_array_left, + lwe_array_right, mem_ptr->cmux_buffer, bsks, ksks, + total_num_radix_blocks); } template @@ -621,7 +621,7 @@ __host__ void host_integer_radix_scalar_maxmin_kb( // Selector // CMUX for Max or Min - host_integer_radix_cmux_kb( + legacy_host_integer_radix_cmux_kb( streams, gpu_indexes, gpu_count, lwe_array_out, mem_ptr->tmp_lwe_array_out, lwe_array_left, lwe_array_right, mem_ptr->cmux_buffer, bsks, ksks, total_num_radix_blocks); diff --git a/backends/tfhe-cuda-backend/cuda/src/integer/shift_and_rotate.cuh b/backends/tfhe-cuda-backend/cuda/src/integer/shift_and_rotate.cuh index 7a6bfb85ea..5f770ec2b4 100644 --- a/backends/tfhe-cuda-backend/cuda/src/integer/shift_and_rotate.cuh +++ b/backends/tfhe-cuda-backend/cuda/src/integer/shift_and_rotate.cuh @@ -144,10 +144,10 @@ __host__ void host_integer_radix_shift_and_rotate_kb_inplace( // The shift bit is already properly aligned/positioned for (int i = 0; i < total_nb_bits; i++) - host_addition(streams[0], gpu_indexes[0], - mux_inputs + i * big_lwe_size, - mux_inputs + i * big_lwe_size, shift_bit, - mem->params.big_lwe_dimension, 1); + legacy_host_addition(streams[0], gpu_indexes[0], + mux_inputs + i * big_lwe_size, + mux_inputs + i * big_lwe_size, shift_bit, + mem->params.big_lwe_dimension, 1); // we have // control_bit|b|a @@ -180,8 +180,8 @@ __host__ void host_integer_radix_shift_and_rotate_kb_inplace( auto bit_to_add = input_bits_a + i * big_lwe_size; for (int j = 0; j < num_radix_blocks; j++) { - host_addition(streams[0], gpu_indexes[0], block, block, bit_to_add, - big_lwe_dimension, 1); + legacy_host_addition(streams[0], gpu_indexes[0], block, block, + bit_to_add, big_lwe_dimension, 1); block += big_lwe_size; bit_to_add += bits_per_block * big_lwe_size; diff --git a/backends/tfhe-cuda-backend/cuda/src/linearalgebra/addition.cu b/backends/tfhe-cuda-backend/cuda/src/linearalgebra/addition.cu index 2b67655ad6..24bdaf1f46 100644 --- a/backends/tfhe-cuda-backend/cuda/src/linearalgebra/addition.cu +++ 
b/backends/tfhe-cuda-backend/cuda/src/linearalgebra/addition.cu @@ -1,20 +1,13 @@ +#include "integer/integer.h" #include "linearalgebra/addition.cuh" -/* - * Perform the addition of two u32 input LWE ciphertext vectors. - * See the equivalent operation on u64 ciphertexts for more details. - */ void cuda_add_lwe_ciphertext_vector_32( - void *stream, uint32_t gpu_index, void *lwe_array_out, - void const *lwe_array_in_1, void const *lwe_array_in_2, - const uint32_t input_lwe_dimension, - const uint32_t input_lwe_ciphertext_count) { + void *stream, uint32_t gpu_index, CudaRadixCiphertextFFI *lwe_array_out, + CudaRadixCiphertextFFI const *lwe_array_in_1, + CudaRadixCiphertextFFI const *lwe_array_in_2) { host_addition(static_cast(stream), gpu_index, - static_cast(lwe_array_out), - static_cast(lwe_array_in_1), - static_cast(lwe_array_in_2), - input_lwe_dimension, input_lwe_ciphertext_count); + lwe_array_out, lwe_array_in_1, lwe_array_in_2); } /* @@ -44,16 +37,12 @@ void cuda_add_lwe_ciphertext_vector_32( * that performs the operation on the GPU. */ void cuda_add_lwe_ciphertext_vector_64( - void *stream, uint32_t gpu_index, void *lwe_array_out, - void const *lwe_array_in_1, void const *lwe_array_in_2, - const uint32_t input_lwe_dimension, - const uint32_t input_lwe_ciphertext_count) { + void *stream, uint32_t gpu_index, CudaRadixCiphertextFFI *lwe_array_out, + CudaRadixCiphertextFFI const *lwe_array_in_1, + CudaRadixCiphertextFFI const *lwe_array_in_2) { host_addition(static_cast(stream), gpu_index, - static_cast(lwe_array_out), - static_cast(lwe_array_in_1), - static_cast(lwe_array_in_2), - input_lwe_dimension, input_lwe_ciphertext_count); + lwe_array_out, lwe_array_in_1, lwe_array_in_2); } /* diff --git a/backends/tfhe-cuda-backend/cuda/src/linearalgebra/addition.cuh b/backends/tfhe-cuda-backend/cuda/src/linearalgebra/addition.cuh index 57342d3544..256bccb6e1 100644 --- a/backends/tfhe-cuda-backend/cuda/src/linearalgebra/addition.cuh +++ b/backends/tfhe-cuda-backend/cuda/src/linearalgebra/addition.cuh @@ -8,6 +8,7 @@ #include "device.h" #include "helper_multi_gpu.h" +#include "integer/integer.h" #include "linear_algebra.h" #include "utils/kernel_dimensions.cuh" #include @@ -100,10 +101,40 @@ __global__ void addition(T *output, T const *input_1, T const *input_2, // Coefficient-wise addition template -__host__ void host_addition(cudaStream_t stream, uint32_t gpu_index, T *output, - T const *input_1, T const *input_2, - const uint32_t input_lwe_dimension, - const uint32_t input_lwe_ciphertext_count) { +__host__ void host_addition(cudaStream_t stream, uint32_t gpu_index, + CudaRadixCiphertextFFI *output, + CudaRadixCiphertextFFI const *input_1, + CudaRadixCiphertextFFI const *input_2) { + + cudaSetDevice(gpu_index); + // lwe_size includes the presence of the body + // whereas lwe_dimension is the number of elements in the mask + int lwe_size = output->lwe_dimension + 1; + // Create a 1-dimensional grid of threads + int num_blocks = 0, num_threads = 0; + int num_entries = output->num_radix_blocks * lwe_size; + getNumBlocksAndThreads(num_entries, 512, num_blocks, num_threads); + dim3 grid(num_blocks, 1, 1); + dim3 thds(num_threads, 1, 1); + + addition<<>>( + static_cast(output->ptr), static_cast(input_1->ptr), + static_cast(input_2->ptr), num_entries); + check_cuda_error(cudaGetLastError()); + for (uint i = 0; i < output->num_radix_blocks; i++) { + output->degrees[i] = input_1->degrees[i] + input_2->degrees[i]; + output->noise_levels[i] = + input_1->noise_levels[i] + input_2->noise_levels[i]; + } +} 
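// Illustrative usage sketch (not part of this patch): how the struct-based
// host_addition above is meant to be driven. The stream/gpu_index/num_blocks/
// big_lwe_dimension locals, the Torus = uint64_t instantiation and the use of
// trivial inputs are assumptions for the example; the helpers
// create_trivial_radix_ciphertext_async and release_radix_ciphertext_data are
// the ones introduced in radix_ciphertext.cu in this change.
//
//   CudaRadixCiphertextFFI lhs, rhs, out;
//   create_trivial_radix_ciphertext_async<uint64_t>(stream, gpu_index, &lhs,
//                                                   num_blocks, big_lwe_dimension);
//   create_trivial_radix_ciphertext_async<uint64_t>(stream, gpu_index, &rhs,
//                                                   num_blocks, big_lwe_dimension);
//   create_trivial_radix_ciphertext_async<uint64_t>(stream, gpu_index, &out,
//                                                   num_blocks, big_lwe_dimension);
//   // ... fill lhs.ptr / rhs.ptr with ciphertext data ...
//   host_addition<uint64_t>(stream, gpu_index, &out, &lhs, &rhs);
//   // out.degrees[i] / out.noise_levels[i] now hold the per-block sums, so
//   // the Rust side can rebuild its block info without shadow bookkeeping.
//   release_radix_ciphertext_data(stream, gpu_index, &out);
//   release_radix_ciphertext_data(stream, gpu_index, &lhs);
//   release_radix_ciphertext_data(stream, gpu_index, &rhs);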
+ +// Coefficient-wise addition +template +__host__ void legacy_host_addition(cudaStream_t stream, uint32_t gpu_index, + T *output, T const *input_1, + T const *input_2, + const uint32_t input_lwe_dimension, + const uint32_t input_lwe_ciphertext_count) { cudaSetDevice(gpu_index); // lwe_size includes the presence of the body diff --git a/backends/tfhe-cuda-backend/src/bindings.rs b/backends/tfhe-cuda-backend/src/bindings.rs index fa16da0136..1a805c497f 100644 --- a/backends/tfhe-cuda-backend/src/bindings.rs +++ b/backends/tfhe-cuda-backend/src/bindings.rs @@ -1,8 +1,8 @@ -/* automatically generated by rust-bindgen 0.70.1 */ +/* automatically generated by rust-bindgen 0.71.1 */ use crate::ffi; -extern "C" { +unsafe extern "C" { pub fn cuda_convert_lwe_ciphertext_vector_to_gpu_64( stream: *mut ffi::c_void, gpu_index: u32, @@ -12,7 +12,7 @@ extern "C" { lwe_dimension: u32, ); } -extern "C" { +unsafe extern "C" { pub fn cuda_convert_lwe_ciphertext_vector_to_cpu_64( stream: *mut ffi::c_void, gpu_index: u32, @@ -22,7 +22,7 @@ extern "C" { lwe_dimension: u32, ); } -extern "C" { +unsafe extern "C" { pub fn cuda_glwe_sample_extract_64( stream: *mut ffi::c_void, gpu_index: u32, @@ -37,7 +37,7 @@ extern "C" { pub const PBS_TYPE_MULTI_BIT: PBS_TYPE = 0; pub const PBS_TYPE_CLASSICAL: PBS_TYPE = 1; pub type PBS_TYPE = ffi::c_uint; -extern "C" { +unsafe extern "C" { pub fn scratch_cuda_integer_compress_radix_ciphertext_64( streams: *const *mut ffi::c_void, gpu_indexes: *const u32, @@ -57,7 +57,7 @@ extern "C" { allocate_gpu_memory: bool, ); } -extern "C" { +unsafe extern "C" { pub fn scratch_cuda_integer_decompress_radix_ciphertext_64( streams: *const *mut ffi::c_void, gpu_indexes: *const u32, @@ -79,7 +79,7 @@ extern "C" { allocate_gpu_memory: bool, ); } -extern "C" { +unsafe extern "C" { pub fn cuda_integer_compress_radix_ciphertext_64( streams: *const *mut ffi::c_void, gpu_indexes: *const u32, @@ -91,7 +91,7 @@ extern "C" { mem_ptr: *mut i8, ); } -extern "C" { +unsafe extern "C" { pub fn cuda_integer_decompress_radix_ciphertext_64( streams: *const *mut ffi::c_void, gpu_indexes: *const u32, @@ -104,7 +104,7 @@ extern "C" { mem_ptr: *mut i8, ); } -extern "C" { +unsafe extern "C" { pub fn cleanup_cuda_integer_compress_radix_ciphertext_64( streams: *const *mut ffi::c_void, gpu_indexes: *const u32, @@ -112,7 +112,7 @@ extern "C" { mem_ptr_void: *mut *mut i8, ); } -extern "C" { +unsafe extern "C" { pub fn cleanup_cuda_integer_decompress_radix_ciphertext_64( streams: *const *mut ffi::c_void, gpu_indexes: *const u32, @@ -141,7 +141,32 @@ pub const COMPARISON_TYPE_LE: COMPARISON_TYPE = 5; pub const COMPARISON_TYPE_MAX: COMPARISON_TYPE = 6; pub const COMPARISON_TYPE_MIN: COMPARISON_TYPE = 7; pub type COMPARISON_TYPE = ffi::c_uint; -extern "C" { +#[repr(C)] +#[derive(Debug, Copy, Clone)] +pub struct CudaRadixCiphertextFFI { + pub ptr: *mut ffi::c_void, + pub degrees: *mut u64, + pub noise_levels: *mut u64, + pub num_radix_blocks: u32, + pub lwe_dimension: u32, +} +#[allow(clippy::unnecessary_operation, clippy::identity_op)] +const _: () = { + ["Size of CudaRadixCiphertextFFI"][::std::mem::size_of::() - 32usize]; + ["Alignment of CudaRadixCiphertextFFI"] + [::std::mem::align_of::() - 8usize]; + ["Offset of field: CudaRadixCiphertextFFI::ptr"] + [::std::mem::offset_of!(CudaRadixCiphertextFFI, ptr) - 0usize]; + ["Offset of field: CudaRadixCiphertextFFI::degrees"] + [::std::mem::offset_of!(CudaRadixCiphertextFFI, degrees) - 8usize]; + ["Offset of field: CudaRadixCiphertextFFI::noise_levels"] + 
[::std::mem::offset_of!(CudaRadixCiphertextFFI, noise_levels) - 16usize]; + ["Offset of field: CudaRadixCiphertextFFI::num_radix_blocks"] + [::std::mem::offset_of!(CudaRadixCiphertextFFI, num_radix_blocks) - 24usize]; + ["Offset of field: CudaRadixCiphertextFFI::lwe_dimension"] + [::std::mem::offset_of!(CudaRadixCiphertextFFI, lwe_dimension) - 28usize]; +}; +unsafe extern "C" { pub fn scratch_cuda_apply_univariate_lut_kb_64( streams: *const *mut ffi::c_void, gpu_indexes: *const u32, @@ -163,7 +188,7 @@ extern "C" { allocate_gpu_memory: bool, ); } -extern "C" { +unsafe extern "C" { pub fn scratch_cuda_apply_many_univariate_lut_kb_64( streams: *const *mut ffi::c_void, gpu_indexes: *const u32, @@ -186,7 +211,7 @@ extern "C" { allocate_gpu_memory: bool, ); } -extern "C" { +unsafe extern "C" { pub fn cuda_apply_univariate_lut_kb_64( streams: *const *mut ffi::c_void, gpu_indexes: *const u32, @@ -199,7 +224,7 @@ extern "C" { num_blocks: u32, ); } -extern "C" { +unsafe extern "C" { pub fn cleanup_cuda_apply_univariate_lut_kb_64( streams: *const *mut ffi::c_void, gpu_indexes: *const u32, @@ -207,7 +232,7 @@ extern "C" { mem_ptr_void: *mut *mut i8, ); } -extern "C" { +unsafe extern "C" { pub fn scratch_cuda_apply_bivariate_lut_kb_64( streams: *const *mut ffi::c_void, gpu_indexes: *const u32, @@ -229,7 +254,7 @@ extern "C" { allocate_gpu_memory: bool, ); } -extern "C" { +unsafe extern "C" { pub fn cuda_apply_bivariate_lut_kb_64( streams: *const *mut ffi::c_void, gpu_indexes: *const u32, @@ -244,7 +269,7 @@ extern "C" { shift: u32, ); } -extern "C" { +unsafe extern "C" { pub fn cleanup_cuda_apply_bivariate_lut_kb_64( streams: *const *mut ffi::c_void, gpu_indexes: *const u32, @@ -252,7 +277,7 @@ extern "C" { mem_ptr_void: *mut *mut i8, ); } -extern "C" { +unsafe extern "C" { pub fn cuda_apply_many_univariate_lut_kb_64( streams: *const *mut ffi::c_void, gpu_indexes: *const u32, @@ -267,7 +292,7 @@ extern "C" { lut_stride: u32, ); } -extern "C" { +unsafe extern "C" { pub fn scratch_cuda_full_propagation_64( streams: *const *mut ffi::c_void, gpu_indexes: *const u32, @@ -287,7 +312,7 @@ extern "C" { allocate_gpu_memory: bool, ); } -extern "C" { +unsafe extern "C" { pub fn cuda_full_propagation_64_inplace( streams: *const *mut ffi::c_void, gpu_indexes: *const u32, @@ -299,7 +324,7 @@ extern "C" { num_blocks: u32, ); } -extern "C" { +unsafe extern "C" { pub fn cleanup_cuda_full_propagation( streams: *const *mut ffi::c_void, gpu_indexes: *const u32, @@ -307,7 +332,7 @@ extern "C" { mem_ptr_void: *mut *mut i8, ); } -extern "C" { +unsafe extern "C" { pub fn scratch_cuda_integer_mult_radix_ciphertext_kb_64( streams: *const *mut ffi::c_void, gpu_indexes: *const u32, @@ -330,7 +355,7 @@ extern "C" { allocate_gpu_memory: bool, ); } -extern "C" { +unsafe extern "C" { pub fn cuda_integer_mult_radix_ciphertext_kb_64( streams: *const *mut ffi::c_void, gpu_indexes: *const u32, @@ -347,7 +372,7 @@ extern "C" { num_blocks: u32, ); } -extern "C" { +unsafe extern "C" { pub fn cleanup_cuda_integer_mult( streams: *const *mut ffi::c_void, gpu_indexes: *const u32, @@ -355,7 +380,7 @@ extern "C" { mem_ptr_void: *mut *mut i8, ); } -extern "C" { +unsafe extern "C" { pub fn cuda_negate_integer_radix_ciphertext_64( streams: *const *mut ffi::c_void, gpu_indexes: *const u32, @@ -368,7 +393,7 @@ extern "C" { carry_modulus: u32, ); } -extern "C" { +unsafe extern "C" { pub fn cuda_scalar_addition_integer_radix_ciphertext_64_inplace( streams: *const *mut ffi::c_void, gpu_indexes: *const u32, @@ -381,7 +406,7 @@ extern "C" { carry_modulus: 
u32, ); } -extern "C" { +unsafe extern "C" { pub fn scratch_cuda_integer_radix_logical_scalar_shift_kb_64( streams: *const *mut ffi::c_void, gpu_indexes: *const u32, @@ -404,7 +429,7 @@ extern "C" { allocate_gpu_memory: bool, ); } -extern "C" { +unsafe extern "C" { pub fn cuda_integer_radix_logical_scalar_shift_kb_64_inplace( streams: *const *mut ffi::c_void, gpu_indexes: *const u32, @@ -417,7 +442,7 @@ extern "C" { num_blocks: u32, ); } -extern "C" { +unsafe extern "C" { pub fn scratch_cuda_integer_radix_arithmetic_scalar_shift_kb_64( streams: *const *mut ffi::c_void, gpu_indexes: *const u32, @@ -440,7 +465,7 @@ extern "C" { allocate_gpu_memory: bool, ); } -extern "C" { +unsafe extern "C" { pub fn cuda_integer_radix_arithmetic_scalar_shift_kb_64_inplace( streams: *const *mut ffi::c_void, gpu_indexes: *const u32, @@ -453,7 +478,7 @@ extern "C" { num_blocks: u32, ); } -extern "C" { +unsafe extern "C" { pub fn cleanup_cuda_integer_radix_logical_scalar_shift( streams: *const *mut ffi::c_void, gpu_indexes: *const u32, @@ -461,7 +486,7 @@ extern "C" { mem_ptr_void: *mut *mut i8, ); } -extern "C" { +unsafe extern "C" { pub fn cleanup_cuda_integer_radix_arithmetic_scalar_shift( streams: *const *mut ffi::c_void, gpu_indexes: *const u32, @@ -469,7 +494,7 @@ extern "C" { mem_ptr_void: *mut *mut i8, ); } -extern "C" { +unsafe extern "C" { pub fn scratch_cuda_integer_radix_shift_and_rotate_kb_64( streams: *const *mut ffi::c_void, gpu_indexes: *const u32, @@ -493,7 +518,7 @@ extern "C" { allocate_gpu_memory: bool, ); } -extern "C" { +unsafe extern "C" { pub fn cuda_integer_radix_shift_and_rotate_kb_64_inplace( streams: *const *mut ffi::c_void, gpu_indexes: *const u32, @@ -506,7 +531,7 @@ extern "C" { num_blocks: u32, ); } -extern "C" { +unsafe extern "C" { pub fn cleanup_cuda_integer_radix_shift_and_rotate( streams: *const *mut ffi::c_void, gpu_indexes: *const u32, @@ -514,7 +539,7 @@ extern "C" { mem_ptr_void: *mut *mut i8, ); } -extern "C" { +unsafe extern "C" { pub fn scratch_cuda_integer_radix_comparison_kb_64( streams: *const *mut ffi::c_void, gpu_indexes: *const u32, @@ -538,7 +563,7 @@ extern "C" { allocate_gpu_memory: bool, ); } -extern "C" { +unsafe extern "C" { pub fn cuda_comparison_integer_radix_ciphertext_kb_64( streams: *const *mut ffi::c_void, gpu_indexes: *const u32, @@ -552,7 +577,7 @@ extern "C" { lwe_ciphertext_count: u32, ); } -extern "C" { +unsafe extern "C" { pub fn cuda_scalar_comparison_integer_radix_ciphertext_kb_64( streams: *const *mut ffi::c_void, gpu_indexes: *const u32, @@ -567,7 +592,7 @@ extern "C" { num_scalar_blocks: u32, ); } -extern "C" { +unsafe extern "C" { pub fn cleanup_cuda_integer_comparison( streams: *const *mut ffi::c_void, gpu_indexes: *const u32, @@ -575,7 +600,7 @@ extern "C" { mem_ptr_void: *mut *mut i8, ); } -extern "C" { +unsafe extern "C" { pub fn scratch_cuda_integer_radix_bitop_kb_64( streams: *const *mut ffi::c_void, gpu_indexes: *const u32, @@ -598,7 +623,7 @@ extern "C" { allocate_gpu_memory: bool, ); } -extern "C" { +unsafe extern "C" { pub fn cuda_bitop_integer_radix_ciphertext_kb_64( streams: *const *mut ffi::c_void, gpu_indexes: *const u32, @@ -612,7 +637,7 @@ extern "C" { lwe_ciphertext_count: u32, ); } -extern "C" { +unsafe extern "C" { pub fn cuda_scalar_bitop_integer_radix_ciphertext_kb_64( streams: *const *mut ffi::c_void, gpu_indexes: *const u32, @@ -628,7 +653,7 @@ extern "C" { op: BITOP_TYPE, ); } -extern "C" { +unsafe extern "C" { pub fn cleanup_cuda_integer_bitop( streams: *const *mut ffi::c_void, gpu_indexes: *const u32, @@ -636,7 
+661,7 @@ extern "C" { mem_ptr_void: *mut *mut i8, ); } -extern "C" { +unsafe extern "C" { pub fn scratch_cuda_integer_radix_cmux_kb_64( streams: *const *mut ffi::c_void, gpu_indexes: *const u32, @@ -658,22 +683,21 @@ extern "C" { allocate_gpu_memory: bool, ); } -extern "C" { +unsafe extern "C" { pub fn cuda_cmux_integer_radix_ciphertext_kb_64( streams: *const *mut ffi::c_void, gpu_indexes: *const u32, gpu_count: u32, - lwe_array_out: *mut ffi::c_void, - lwe_condition: *const ffi::c_void, - lwe_array_true: *const ffi::c_void, - lwe_array_false: *const ffi::c_void, + lwe_array_out: *mut CudaRadixCiphertextFFI, + lwe_condition: *const CudaRadixCiphertextFFI, + lwe_array_true: *const CudaRadixCiphertextFFI, + lwe_array_false: *const CudaRadixCiphertextFFI, mem_ptr: *mut i8, bsks: *const *mut ffi::c_void, ksks: *const *mut ffi::c_void, - lwe_ciphertext_count: u32, ); } -extern "C" { +unsafe extern "C" { pub fn cleanup_cuda_integer_radix_cmux( streams: *const *mut ffi::c_void, gpu_indexes: *const u32, @@ -681,7 +705,7 @@ extern "C" { mem_ptr_void: *mut *mut i8, ); } -extern "C" { +unsafe extern "C" { pub fn scratch_cuda_integer_radix_scalar_rotate_kb_64( streams: *const *mut ffi::c_void, gpu_indexes: *const u32, @@ -704,7 +728,7 @@ extern "C" { allocate_gpu_memory: bool, ); } -extern "C" { +unsafe extern "C" { pub fn cuda_integer_radix_scalar_rotate_kb_64_inplace( streams: *const *mut ffi::c_void, gpu_indexes: *const u32, @@ -717,7 +741,7 @@ extern "C" { num_blocks: u32, ); } -extern "C" { +unsafe extern "C" { pub fn cleanup_cuda_integer_radix_scalar_rotate( streams: *const *mut ffi::c_void, gpu_indexes: *const u32, @@ -725,7 +749,7 @@ extern "C" { mem_ptr_void: *mut *mut i8, ); } -extern "C" { +unsafe extern "C" { pub fn scratch_cuda_propagate_single_carry_kb_64_inplace( streams: *const *mut ffi::c_void, gpu_indexes: *const u32, @@ -749,7 +773,7 @@ extern "C" { allocate_gpu_memory: bool, ); } -extern "C" { +unsafe extern "C" { pub fn scratch_cuda_add_and_propagate_single_carry_kb_64_inplace( streams: *const *mut ffi::c_void, gpu_indexes: *const u32, @@ -773,7 +797,7 @@ extern "C" { allocate_gpu_memory: bool, ); } -extern "C" { +unsafe extern "C" { pub fn cuda_propagate_single_carry_kb_64_inplace( streams: *const *mut ffi::c_void, gpu_indexes: *const u32, @@ -789,7 +813,7 @@ extern "C" { uses_carry: u32, ); } -extern "C" { +unsafe extern "C" { pub fn cuda_add_and_propagate_single_carry_kb_64_inplace( streams: *const *mut ffi::c_void, gpu_indexes: *const u32, @@ -806,7 +830,7 @@ extern "C" { uses_carry: u32, ); } -extern "C" { +unsafe extern "C" { pub fn cleanup_cuda_propagate_single_carry( streams: *const *mut ffi::c_void, gpu_indexes: *const u32, @@ -814,7 +838,7 @@ extern "C" { mem_ptr_void: *mut *mut i8, ); } -extern "C" { +unsafe extern "C" { pub fn cleanup_cuda_add_and_propagate_single_carry( streams: *const *mut ffi::c_void, gpu_indexes: *const u32, @@ -822,7 +846,7 @@ extern "C" { mem_ptr_void: *mut *mut i8, ); } -extern "C" { +unsafe extern "C" { pub fn scratch_cuda_integer_overflowing_sub_kb_64_inplace( streams: *const *mut ffi::c_void, gpu_indexes: *const u32, @@ -845,7 +869,7 @@ extern "C" { allocate_gpu_memory: bool, ); } -extern "C" { +unsafe extern "C" { pub fn cuda_integer_overflowing_sub_kb_64_inplace( streams: *const *mut ffi::c_void, gpu_indexes: *const u32, @@ -862,7 +886,7 @@ extern "C" { uses_input_borrow: u32, ); } -extern "C" { +unsafe extern "C" { pub fn cleanup_cuda_integer_overflowing_sub( streams: *const *mut ffi::c_void, gpu_indexes: *const u32, @@ -870,7 +894,7 @@ 
extern "C" { mem_ptr_void: *mut *mut i8, ); } -extern "C" { +unsafe extern "C" { pub fn scratch_cuda_integer_radix_partial_sum_ciphertexts_vec_kb_64( streams: *const *mut ffi::c_void, gpu_indexes: *const u32, @@ -892,7 +916,7 @@ extern "C" { allocate_gpu_memory: bool, ); } -extern "C" { +unsafe extern "C" { pub fn cuda_integer_radix_partial_sum_ciphertexts_vec_kb_64( streams: *const *mut ffi::c_void, gpu_indexes: *const u32, @@ -906,7 +930,7 @@ extern "C" { num_blocks_in_radix: u32, ); } -extern "C" { +unsafe extern "C" { pub fn cleanup_cuda_integer_radix_partial_sum_ciphertexts_vec( streams: *const *mut ffi::c_void, gpu_indexes: *const u32, @@ -914,7 +938,7 @@ extern "C" { mem_ptr_void: *mut *mut i8, ); } -extern "C" { +unsafe extern "C" { pub fn scratch_cuda_integer_scalar_mul_kb_64( streams: *const *mut ffi::c_void, gpu_indexes: *const u32, @@ -935,7 +959,7 @@ extern "C" { allocate_gpu_memory: bool, ); } -extern "C" { +unsafe extern "C" { pub fn cuda_scalar_multiplication_integer_radix_ciphertext_64_inplace( streams: *const *mut ffi::c_void, gpu_indexes: *const u32, @@ -953,7 +977,7 @@ extern "C" { num_scalars: u32, ); } -extern "C" { +unsafe extern "C" { pub fn cleanup_cuda_integer_radix_scalar_mul( streams: *const *mut ffi::c_void, gpu_indexes: *const u32, @@ -961,7 +985,7 @@ extern "C" { mem_ptr_void: *mut *mut i8, ); } -extern "C" { +unsafe extern "C" { pub fn scratch_cuda_integer_div_rem_radix_ciphertext_kb_64( streams: *const *mut ffi::c_void, gpu_indexes: *const u32, @@ -984,7 +1008,7 @@ extern "C" { allocate_gpu_memory: bool, ); } -extern "C" { +unsafe extern "C" { pub fn cuda_integer_div_rem_radix_ciphertext_kb_64( streams: *const *mut ffi::c_void, gpu_indexes: *const u32, @@ -1000,7 +1024,7 @@ extern "C" { num_blocks_in_radix: u32, ); } -extern "C" { +unsafe extern "C" { pub fn cleanup_cuda_integer_div_rem( streams: *const *mut ffi::c_void, gpu_indexes: *const u32, @@ -1008,7 +1032,7 @@ extern "C" { mem_ptr_void: *mut *mut i8, ); } -extern "C" { +unsafe extern "C" { pub fn scratch_cuda_integer_compute_prefix_sum_hillis_steele_64( streams: *const *mut ffi::c_void, gpu_indexes: *const u32, @@ -1030,7 +1054,7 @@ extern "C" { allocate_gpu_memory: bool, ); } -extern "C" { +unsafe extern "C" { pub fn cuda_integer_compute_prefix_sum_hillis_steele_64( streams: *const *mut ffi::c_void, gpu_indexes: *const u32, @@ -1044,7 +1068,7 @@ extern "C" { shift: u32, ); } -extern "C" { +unsafe extern "C" { pub fn cleanup_cuda_integer_compute_prefix_sum_hillis_steele_64( streams: *const *mut ffi::c_void, gpu_indexes: *const u32, @@ -1052,7 +1076,7 @@ extern "C" { mem_ptr_void: *mut *mut i8, ); } -extern "C" { +unsafe extern "C" { pub fn cuda_integer_reverse_blocks_64_inplace( streams: *const *mut ffi::c_void, gpu_indexes: *const u32, @@ -1062,7 +1086,7 @@ extern "C" { lwe_size: u32, ); } -extern "C" { +unsafe extern "C" { pub fn scratch_cuda_integer_abs_inplace_radix_ciphertext_kb_64( streams: *const *mut ffi::c_void, gpu_indexes: *const u32, @@ -1085,20 +1109,19 @@ extern "C" { allocate_gpu_memory: bool, ); } -extern "C" { +unsafe extern "C" { pub fn cuda_integer_abs_inplace_radix_ciphertext_kb_64( streams: *const *mut ffi::c_void, gpu_indexes: *const u32, gpu_count: u32, - ct: *mut ffi::c_void, + ct: *mut CudaRadixCiphertextFFI, mem_ptr: *mut i8, is_signed: bool, bsks: *const *mut ffi::c_void, ksks: *const *mut ffi::c_void, - num_blocks: u32, ); } -extern "C" { +unsafe extern "C" { pub fn cleanup_cuda_integer_abs_inplace( streams: *const *mut ffi::c_void, gpu_indexes: *const u32, @@ -1106,7 
+1129,7 @@ extern "C" { mem_ptr_void: *mut *mut i8, ); } -extern "C" { +unsafe extern "C" { pub fn scratch_cuda_integer_are_all_comparisons_block_true_kb_64( streams: *const *mut ffi::c_void, gpu_indexes: *const u32, @@ -1128,7 +1151,7 @@ extern "C" { allocate_gpu_memory: bool, ); } -extern "C" { +unsafe extern "C" { pub fn cuda_integer_are_all_comparisons_block_true_kb_64( streams: *const *mut ffi::c_void, gpu_indexes: *const u32, @@ -1141,7 +1164,7 @@ extern "C" { num_radix_blocks: u32, ); } -extern "C" { +unsafe extern "C" { pub fn cleanup_cuda_integer_are_all_comparisons_block_true( streams: *const *mut ffi::c_void, gpu_indexes: *const u32, @@ -1149,7 +1172,7 @@ extern "C" { mem_ptr_void: *mut *mut i8, ); } -extern "C" { +unsafe extern "C" { pub fn scratch_cuda_integer_is_at_least_one_comparisons_block_true_kb_64( streams: *const *mut ffi::c_void, gpu_indexes: *const u32, @@ -1171,7 +1194,7 @@ extern "C" { allocate_gpu_memory: bool, ); } -extern "C" { +unsafe extern "C" { pub fn cuda_integer_is_at_least_one_comparisons_block_true_kb_64( streams: *const *mut ffi::c_void, gpu_indexes: *const u32, @@ -1184,7 +1207,7 @@ extern "C" { num_radix_blocks: u32, ); } -extern "C" { +unsafe extern "C" { pub fn cleanup_cuda_integer_is_at_least_one_comparisons_block_true( streams: *const *mut ffi::c_void, gpu_indexes: *const u32, @@ -1192,7 +1215,7 @@ extern "C" { mem_ptr_void: *mut *mut i8, ); } -extern "C" { +unsafe extern "C" { pub fn cuda_keyswitch_lwe_ciphertext_vector_32( stream: *mut ffi::c_void, gpu_index: u32, @@ -1208,7 +1231,7 @@ extern "C" { num_samples: u32, ); } -extern "C" { +unsafe extern "C" { pub fn cuda_keyswitch_lwe_ciphertext_vector_64( stream: *mut ffi::c_void, gpu_index: u32, @@ -1224,7 +1247,7 @@ extern "C" { num_samples: u32, ); } -extern "C" { +unsafe extern "C" { pub fn scratch_packing_keyswitch_lwe_list_to_glwe_64( stream: *mut ffi::c_void, gpu_index: u32, @@ -1236,7 +1259,7 @@ extern "C" { allocate_gpu_memory: bool, ); } -extern "C" { +unsafe extern "C" { pub fn cuda_packing_keyswitch_lwe_list_to_glwe_64( stream: *mut ffi::c_void, gpu_index: u32, @@ -1252,14 +1275,14 @@ extern "C" { num_lwes: u32, ); } -extern "C" { +unsafe extern "C" { pub fn cleanup_packing_keyswitch_lwe_list_to_glwe( stream: *mut ffi::c_void, gpu_index: u32, fp_ks_buffer: *mut *mut i8, ); } -extern "C" { +unsafe extern "C" { pub fn cuda_negate_lwe_ciphertext_vector_32( stream: *mut ffi::c_void, gpu_index: u32, @@ -1269,7 +1292,7 @@ extern "C" { input_lwe_ciphertext_count: u32, ); } -extern "C" { +unsafe extern "C" { pub fn cuda_negate_lwe_ciphertext_vector_64( stream: *mut ffi::c_void, gpu_index: u32, @@ -1279,29 +1302,25 @@ extern "C" { input_lwe_ciphertext_count: u32, ); } -extern "C" { +unsafe extern "C" { pub fn cuda_add_lwe_ciphertext_vector_32( stream: *mut ffi::c_void, gpu_index: u32, - lwe_array_out: *mut ffi::c_void, - lwe_array_in_1: *const ffi::c_void, - lwe_array_in_2: *const ffi::c_void, - input_lwe_dimension: u32, - input_lwe_ciphertext_count: u32, + lwe_array_out: *mut CudaRadixCiphertextFFI, + lwe_array_in_1: *const CudaRadixCiphertextFFI, + lwe_array_in_2: *const CudaRadixCiphertextFFI, ); } -extern "C" { +unsafe extern "C" { pub fn cuda_add_lwe_ciphertext_vector_64( stream: *mut ffi::c_void, gpu_index: u32, - lwe_array_out: *mut ffi::c_void, - lwe_array_in_1: *const ffi::c_void, - lwe_array_in_2: *const ffi::c_void, - input_lwe_dimension: u32, - input_lwe_ciphertext_count: u32, + lwe_array_out: *mut CudaRadixCiphertextFFI, + lwe_array_in_1: *const CudaRadixCiphertextFFI, + 
lwe_array_in_2: *const CudaRadixCiphertextFFI, ); } -extern "C" { +unsafe extern "C" { pub fn cuda_add_lwe_ciphertext_vector_plaintext_vector_32( stream: *mut ffi::c_void, gpu_index: u32, @@ -1312,7 +1331,7 @@ extern "C" { input_lwe_ciphertext_count: u32, ); } -extern "C" { +unsafe extern "C" { pub fn cuda_add_lwe_ciphertext_vector_plaintext_vector_64( stream: *mut ffi::c_void, gpu_index: u32, @@ -1323,7 +1342,7 @@ extern "C" { input_lwe_ciphertext_count: u32, ); } -extern "C" { +unsafe extern "C" { pub fn cuda_mult_lwe_ciphertext_vector_cleartext_vector_32( stream: *mut ffi::c_void, gpu_index: u32, @@ -1334,7 +1353,7 @@ extern "C" { input_lwe_ciphertext_count: u32, ); } -extern "C" { +unsafe extern "C" { pub fn cuda_mult_lwe_ciphertext_vector_cleartext_vector_64( stream: *mut ffi::c_void, gpu_index: u32, @@ -1345,7 +1364,7 @@ extern "C" { input_lwe_ciphertext_count: u32, ); } -extern "C" { +unsafe extern "C" { pub fn cuda_add_lwe_ciphertext_vector_plaintext_64( stream: *mut ffi::c_void, gpu_index: u32, @@ -1356,7 +1375,7 @@ extern "C" { input_lwe_ciphertext_count: u32, ); } -extern "C" { +unsafe extern "C" { pub fn cuda_fourier_polynomial_mul( stream: *mut ffi::c_void, gpu_index: u32, @@ -1367,7 +1386,7 @@ extern "C" { total_polynomials: u32, ); } -extern "C" { +unsafe extern "C" { pub fn cuda_convert_lwe_programmable_bootstrap_key_32( stream: *mut ffi::c_void, gpu_index: u32, @@ -1379,7 +1398,7 @@ extern "C" { polynomial_size: u32, ); } -extern "C" { +unsafe extern "C" { pub fn cuda_convert_lwe_programmable_bootstrap_key_64( stream: *mut ffi::c_void, gpu_index: u32, @@ -1391,7 +1410,7 @@ extern "C" { polynomial_size: u32, ); } -extern "C" { +unsafe extern "C" { pub fn scratch_cuda_programmable_bootstrap_amortized_32( stream: *mut ffi::c_void, gpu_index: u32, @@ -1402,7 +1421,7 @@ extern "C" { allocate_gpu_memory: bool, ); } -extern "C" { +unsafe extern "C" { pub fn scratch_cuda_programmable_bootstrap_amortized_64( stream: *mut ffi::c_void, gpu_index: u32, @@ -1413,7 +1432,7 @@ extern "C" { allocate_gpu_memory: bool, ); } -extern "C" { +unsafe extern "C" { pub fn cuda_programmable_bootstrap_amortized_lwe_ciphertext_vector_32( stream: *mut ffi::c_void, gpu_index: u32, @@ -1433,7 +1452,7 @@ extern "C" { num_samples: u32, ); } -extern "C" { +unsafe extern "C" { pub fn cuda_programmable_bootstrap_amortized_lwe_ciphertext_vector_64( stream: *mut ffi::c_void, gpu_index: u32, @@ -1453,14 +1472,14 @@ extern "C" { num_samples: u32, ); } -extern "C" { +unsafe extern "C" { pub fn cleanup_cuda_programmable_bootstrap_amortized( stream: *mut ffi::c_void, gpu_index: u32, pbs_buffer: *mut *mut i8, ); } -extern "C" { +unsafe extern "C" { pub fn scratch_cuda_programmable_bootstrap_32( stream: *mut ffi::c_void, gpu_index: u32, @@ -1472,7 +1491,7 @@ extern "C" { allocate_gpu_memory: bool, ); } -extern "C" { +unsafe extern "C" { pub fn scratch_cuda_programmable_bootstrap_64( stream: *mut ffi::c_void, gpu_index: u32, @@ -1484,7 +1503,7 @@ extern "C" { allocate_gpu_memory: bool, ); } -extern "C" { +unsafe extern "C" { pub fn cuda_programmable_bootstrap_lwe_ciphertext_vector_32( stream: *mut ffi::c_void, gpu_index: u32, @@ -1506,7 +1525,7 @@ extern "C" { lut_stride: u32, ); } -extern "C" { +unsafe extern "C" { pub fn cuda_programmable_bootstrap_lwe_ciphertext_vector_64( stream: *mut ffi::c_void, gpu_index: u32, @@ -1528,14 +1547,14 @@ extern "C" { lut_stride: u32, ); } -extern "C" { +unsafe extern "C" { pub fn cleanup_cuda_programmable_bootstrap( stream: *mut ffi::c_void, gpu_index: u32, pbs_buffer: *mut *mut 
i8, ); } -extern "C" { +unsafe extern "C" { pub fn has_support_to_cuda_programmable_bootstrap_cg_multi_bit( glwe_dimension: u32, polynomial_size: u32, @@ -1543,7 +1562,7 @@ extern "C" { num_samples: u32, ) -> bool; } -extern "C" { +unsafe extern "C" { pub fn cuda_convert_lwe_multi_bit_programmable_bootstrap_key_64( stream: *mut ffi::c_void, gpu_index: u32, @@ -1556,7 +1575,7 @@ extern "C" { grouping_factor: u32, ); } -extern "C" { +unsafe extern "C" { pub fn scratch_cuda_multi_bit_programmable_bootstrap_64( stream: *mut ffi::c_void, gpu_index: u32, @@ -1568,7 +1587,7 @@ extern "C" { allocate_gpu_memory: bool, ); } -extern "C" { +unsafe extern "C" { pub fn cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector_64( stream: *mut ffi::c_void, gpu_index: u32, @@ -1591,7 +1610,7 @@ extern "C" { lut_stride: u32, ); } -extern "C" { +unsafe extern "C" { pub fn cleanup_cuda_multi_bit_programmable_bootstrap( stream: *mut ffi::c_void, gpu_index: u32, diff --git a/tfhe/src/core_crypto/gpu/mod.rs b/tfhe/src/core_crypto/gpu/mod.rs index 7ed226c3f3..7ad081f306 100644 --- a/tfhe/src/core_crypto/gpu/mod.rs +++ b/tfhe/src/core_crypto/gpu/mod.rs @@ -421,14 +421,41 @@ pub unsafe fn add_lwe_ciphertext_vector_async( lwe_dimension: LweDimension, num_samples: u32, ) { + let mut degrees_vec: Vec = Vec::with_capacity((num_samples as usize) + 1) + .iter() + .map(|_: &u64| 0) + .collect(); + let mut noise_levels_vec: Vec = Vec::with_capacity((num_samples as usize) + 1) + .iter() + .map(|_: &u64| 0) + .collect(); + let mut lwe_array_out_data = CudaRadixCiphertextFFI { + ptr: lwe_array_out.as_mut_c_ptr(0), + degrees: degrees_vec.as_mut_ptr(), + noise_levels: noise_levels_vec.as_mut_ptr(), + num_radix_blocks: num_samples, + lwe_dimension: lwe_dimension.0 as u32, + }; + let lwe_array_in_1_data = CudaRadixCiphertextFFI { + ptr: lwe_array_in_1.get_mut_c_ptr(0), + degrees: degrees_vec.as_mut_ptr(), + noise_levels: noise_levels_vec.as_mut_ptr(), + num_radix_blocks: num_samples, + lwe_dimension: lwe_dimension.0 as u32, + }; + let lwe_array_in_2_data = CudaRadixCiphertextFFI { + ptr: lwe_array_in_2.get_mut_c_ptr(0), + degrees: degrees_vec.as_mut_ptr(), + noise_levels: noise_levels_vec.as_mut_ptr(), + num_radix_blocks: num_samples, + lwe_dimension: lwe_dimension.0 as u32, + }; cuda_add_lwe_ciphertext_vector_64( streams.ptr[0], streams.gpu_indexes[0].0, - lwe_array_out.as_mut_c_ptr(0), - lwe_array_in_1.as_c_ptr(0), - lwe_array_in_2.as_c_ptr(0), - lwe_dimension.0 as u32, - num_samples, + &mut lwe_array_out_data, + &lwe_array_in_1_data, + &lwe_array_in_2_data, ); } @@ -445,14 +472,34 @@ pub unsafe fn add_lwe_ciphertext_vector_assign_async( lwe_dimension: LweDimension, num_samples: u32, ) { + let mut degrees_vec: Vec = Vec::with_capacity((num_samples as usize) + 1) + .iter() + .map(|_: &u64| 0) + .collect(); + let mut noise_levels_vec: Vec = Vec::with_capacity((num_samples as usize) + 1) + .iter() + .map(|_: &u64| 0) + .collect(); + let mut lwe_array_out_data = CudaRadixCiphertextFFI { + ptr: lwe_array_out.as_mut_c_ptr(0), + degrees: degrees_vec.as_mut_ptr(), + noise_levels: noise_levels_vec.as_mut_ptr(), + num_radix_blocks: num_samples, + lwe_dimension: lwe_dimension.0 as u32, + }; + let lwe_array_in_data = CudaRadixCiphertextFFI { + ptr: lwe_array_in.get_mut_c_ptr(0), + degrees: degrees_vec.as_mut_ptr(), + noise_levels: noise_levels_vec.as_mut_ptr(), + num_radix_blocks: num_samples, + lwe_dimension: lwe_dimension.0 as u32, + }; cuda_add_lwe_ciphertext_vector_64( streams.ptr[0], streams.gpu_indexes[0].0, - 
lwe_array_out.as_mut_c_ptr(0), - lwe_array_out.as_c_ptr(0), - lwe_array_in.as_c_ptr(0), - lwe_dimension.0 as u32, - num_samples, + &mut lwe_array_out_data, + &lwe_array_out_data, + &lwe_array_in_data, ); } diff --git a/tfhe/src/integer/gpu/ciphertext/info.rs b/tfhe/src/integer/gpu/ciphertext/info.rs index 0a6316a02c..613d8d61ed 100644 --- a/tfhe/src/integer/gpu/ciphertext/info.rs +++ b/tfhe/src/integer/gpu/ciphertext/info.rs @@ -72,23 +72,6 @@ impl CudaRadixCiphertextInfo { Some(decomposer) } - pub(crate) fn after_add(&self, other: &Self) -> Self { - Self { - blocks: self - .blocks - .iter() - .zip(&other.blocks) - .map(|(left, right)| CudaBlockInfo { - degree: left.degree + right.degree, - message_modulus: left.message_modulus, - carry_modulus: left.carry_modulus, - pbs_order: left.pbs_order, - noise_level: left.noise_level + right.noise_level, - }) - .collect(), - } - } - pub(crate) fn after_neg(&self) -> Self { let new_degrees_iter = NegatedDegreeIter::new( self.blocks diff --git a/tfhe/src/integer/gpu/mod.rs b/tfhe/src/integer/gpu/mod.rs index 6fc3dcac4b..542c77e8b1 100644 --- a/tfhe/src/integer/gpu/mod.rs +++ b/tfhe/src/integer/gpu/mod.rs @@ -15,7 +15,10 @@ use crate::shortint::{CarryModulus, MessageModulus}; pub use server_key::CudaServerKey; use std::cmp::min; +use crate::integer::gpu::ciphertext::boolean_value::CudaBooleanBlock; +use crate::integer::gpu::ciphertext::CudaRadixCiphertext; use crate::integer::server_key::radix_parallel::OutputFlag; +use crate::shortint::ciphertext::{Degree, NoiseLevel}; use tfhe_cuda_backend::bindings::*; use tfhe_cuda_backend::cuda_bind::*; @@ -57,6 +60,45 @@ pub enum ComparisonType { MIN = 7, } +fn prepare_cuda_radix_ciphertext_ffi(input: &CudaRadixCiphertext) -> CudaRadixCiphertextFFI { + let mut degrees_vec: Vec = input.info.blocks.iter().map(|b| b.degree.0).collect(); + let mut noise_levels_vec: Vec = + input.info.blocks.iter().map(|b| b.noise_level.0).collect(); + CudaRadixCiphertextFFI { + ptr: input.d_blocks.0.d_vec.get_mut_c_ptr(0), + degrees: degrees_vec.as_mut_ptr(), + noise_levels: noise_levels_vec.as_mut_ptr(), + num_radix_blocks: input.d_blocks.0.lwe_ciphertext_count.0 as u32, + lwe_dimension: input.d_blocks.0.lwe_dimension.0 as u32, + } +} + +fn prepare_cuda_boolean_block_ffi(input: &CudaBooleanBlock) -> CudaRadixCiphertextFFI { + let mut degrees_vec: Vec = input + .0 + .ciphertext + .info + .blocks + .iter() + .map(|b| b.degree.0) + .collect(); + let mut noise_levels_vec: Vec = input + .0 + .ciphertext + .info + .blocks + .iter() + .map(|b| b.noise_level.0) + .collect(); + CudaRadixCiphertextFFI { + ptr: input.0.ciphertext.d_blocks.0.d_vec.get_mut_c_ptr(0), + degrees: degrees_vec.as_mut_ptr(), + noise_levels: noise_levels_vec.as_mut_ptr(), + num_radix_blocks: input.0.ciphertext.d_blocks.0.lwe_ciphertext_count.0 as u32, + lwe_dimension: input.0.ciphertext.d_blocks.0.lwe_dimension.0 as u32, + } +} + pub fn gen_keys_gpu
<P>
(parameters_set: P, streams: &CudaStreams) -> (ClientKey, CudaServerKey) where P: TryInto, @@ -490,32 +532,40 @@ pub unsafe fn decompress_integer_radix_async( /// /// - [CudaStreams::synchronize] __must__ be called after this function as soon as synchronization /// is required -pub unsafe fn unchecked_add_integer_radix_assign_async( +pub unsafe fn unchecked_add_integer_radix_assign( streams: &CudaStreams, - radix_lwe_left: &mut CudaVec, - radix_lwe_right: &CudaVec, - lwe_dimension: LweDimension, - num_blocks: u32, + radix_lwe_left: &mut CudaRadixCiphertext, + radix_lwe_right: &CudaRadixCiphertext, ) { assert_eq!( streams.gpu_indexes[0], - radix_lwe_left.gpu_index(0), + radix_lwe_left.d_blocks.0.d_vec.gpu_index(0), "GPU error: all data should reside on the same GPU." ); assert_eq!( streams.gpu_indexes[0], - radix_lwe_right.gpu_index(0), + radix_lwe_right.d_blocks.0.d_vec.gpu_index(0), "GPU error: all data should reside on the same GPU." ); + let mut radix_lwe_left_data = prepare_cuda_radix_ciphertext_ffi(radix_lwe_left); + let radix_lwe_right_data = prepare_cuda_radix_ciphertext_ffi(radix_lwe_right); cuda_add_lwe_ciphertext_vector_64( streams.ptr[0], streams.gpu_indexes[0].0, - radix_lwe_left.as_mut_c_ptr(0), - radix_lwe_left.as_c_ptr(0), - radix_lwe_right.as_c_ptr(0), - lwe_dimension.0 as u32, - num_blocks, - ); + &mut radix_lwe_left_data, + &radix_lwe_left_data, + &radix_lwe_right_data, + ); + radix_lwe_left + .info + .blocks + .iter_mut() + .enumerate() + .for_each(|(i, b)| { + b.degree = Degree(*radix_lwe_left_data.degrees.wrapping_add(i)); + b.noise_level = NoiseLevel(*radix_lwe_left_data.noise_levels.wrapping_add(i)); + }); + streams.synchronize(); } #[allow(clippy::too_many_arguments)] @@ -2089,12 +2139,12 @@ pub unsafe fn unchecked_rotate_left_integer_radix_kb_assign_async< /// /// - [CudaStreams::synchronize] __must__ be called after this function as soon as synchronization /// is required -pub unsafe fn unchecked_cmux_integer_radix_kb_async( +pub unsafe fn unchecked_cmux_integer_radix_kb( streams: &CudaStreams, - radix_lwe_out: &mut CudaVec, - radix_lwe_condition: &CudaVec, - radix_lwe_true: &CudaVec, - radix_lwe_false: &CudaVec, + radix_lwe_out: &mut CudaRadixCiphertext, + radix_lwe_condition: &CudaBooleanBlock, + radix_lwe_true: &CudaRadixCiphertext, + radix_lwe_false: &CudaRadixCiphertext, bootstrapping_key: &CudaVec, keyswitch_key: &CudaVec, message_modulus: MessageModulus, @@ -2113,22 +2163,28 @@ pub unsafe fn unchecked_cmux_integer_radix_kb_async>() .as_ptr(), streams.len() as u32, - radix_lwe_out.as_mut_c_ptr(0), - radix_lwe_condition.as_c_ptr(0), - radix_lwe_true.as_c_ptr(0), - radix_lwe_false.as_c_ptr(0), + &mut radix_lwe_out_data, + &condition_data, + &radix_lwe_true_data, + &radix_lwe_false_data, mem_ptr, bootstrapping_key.ptr.as_ptr(), keyswitch_key.ptr.as_ptr(), - num_blocks, ); cleanup_cuda_integer_radix_cmux( streams.ptr.as_ptr(), @@ -2196,6 +2255,7 @@ pub unsafe fn unchecked_cmux_integer_radix_kb_async( +pub unsafe fn unchecked_signed_abs_radix_kb_assign( streams: &CudaStreams, - ct: &mut CudaVec, + ct: &mut CudaRadixCiphertext, bootstrapping_key: &CudaVec, keyswitch_key: &CudaVec, message_modulus: MessageModulus, @@ -3169,7 +3229,13 @@ pub unsafe fn unchecked_signed_abs_radix_kb_assign_async>() .as_ptr(), streams.len() as u32, - ct.as_mut_c_ptr(0), + &mut ct_data, mem_ptr, true, bootstrapping_key.ptr.as_ptr(), keyswitch_key.ptr.as_ptr(), - num_blocks, ); cleanup_cuda_integer_abs_inplace( streams.ptr.as_ptr(), @@ -3223,6 +3288,7 @@ pub unsafe fn 
unchecked_signed_abs_radix_kb_assign_async(&self, ct: &mut T, streams: &CudaStreams) + pub fn unchecked_abs_assign(&self, ct: &mut T, streams: &CudaStreams) where T: CudaIntegerRadixCiphertext, { let num_blocks = ct.as_ref().d_blocks.lwe_ciphertext_count().0 as u32; - match &self.bootstrapping_key { - CudaBootstrappingKey::Classic(d_bsk) => { - unchecked_signed_abs_radix_kb_assign_async( - streams, - &mut ct.as_mut().d_blocks.0.d_vec, - &d_bsk.d_vec, - &self.key_switching_key.d_vec, - self.message_modulus, - self.carry_modulus, - d_bsk.glwe_dimension, - d_bsk.polynomial_size, - self.key_switching_key - .input_key_lwe_size() - .to_lwe_dimension(), - self.key_switching_key - .output_key_lwe_size() - .to_lwe_dimension(), - self.key_switching_key.decomposition_level_count(), - self.key_switching_key.decomposition_base_log(), - d_bsk.decomp_level_count, - d_bsk.decomp_base_log, - num_blocks, - PBSType::Classical, - LweBskGroupingFactor(0), - ); - } - CudaBootstrappingKey::MultiBit(d_multibit_bsk) => { - unchecked_signed_abs_radix_kb_assign_async( - streams, - &mut ct.as_mut().d_blocks.0.d_vec, - &d_multibit_bsk.d_vec, - &self.key_switching_key.d_vec, - self.message_modulus, - self.carry_modulus, - d_multibit_bsk.glwe_dimension, - d_multibit_bsk.polynomial_size, - self.key_switching_key - .input_key_lwe_size() - .to_lwe_dimension(), - self.key_switching_key - .output_key_lwe_size() - .to_lwe_dimension(), - self.key_switching_key.decomposition_level_count(), - self.key_switching_key.decomposition_base_log(), - d_multibit_bsk.decomp_level_count, - d_multibit_bsk.decomp_base_log, - num_blocks, - PBSType::MultiBit, - d_multibit_bsk.grouping_factor, - ); - } - }; + unsafe { + match &self.bootstrapping_key { + CudaBootstrappingKey::Classic(d_bsk) => { + unchecked_signed_abs_radix_kb_assign( + streams, + ct.as_mut(), + &d_bsk.d_vec, + &self.key_switching_key.d_vec, + self.message_modulus, + self.carry_modulus, + d_bsk.glwe_dimension, + d_bsk.polynomial_size, + self.key_switching_key + .input_key_lwe_size() + .to_lwe_dimension(), + self.key_switching_key + .output_key_lwe_size() + .to_lwe_dimension(), + self.key_switching_key.decomposition_level_count(), + self.key_switching_key.decomposition_base_log(), + d_bsk.decomp_level_count, + d_bsk.decomp_base_log, + num_blocks, + PBSType::Classical, + LweBskGroupingFactor(0), + ); + } + CudaBootstrappingKey::MultiBit(d_multibit_bsk) => { + unchecked_signed_abs_radix_kb_assign( + streams, + ct.as_mut(), + &d_multibit_bsk.d_vec, + &self.key_switching_key.d_vec, + self.message_modulus, + self.carry_modulus, + d_multibit_bsk.glwe_dimension, + d_multibit_bsk.polynomial_size, + self.key_switching_key + .input_key_lwe_size() + .to_lwe_dimension(), + self.key_switching_key + .output_key_lwe_size() + .to_lwe_dimension(), + self.key_switching_key.decomposition_level_count(), + self.key_switching_key.decomposition_base_log(), + d_multibit_bsk.decomp_level_count, + d_multibit_bsk.decomp_base_log, + num_blocks, + PBSType::MultiBit, + d_multibit_bsk.grouping_factor, + ); + } + }; + } } pub fn unchecked_abs(&self, ct: &T, streams: &CudaStreams) -> T where T: CudaIntegerRadixCiphertext, { - let mut res = unsafe { ct.duplicate_async(streams) }; + let mut res = ct.duplicate(streams); if T::IS_SIGNED { - unsafe { self.unchecked_abs_assign_async(&mut res, streams) }; + self.unchecked_abs_assign(&mut res, streams); } - streams.synchronize(); res } @@ -130,14 +131,13 @@ impl CudaServerKey { where T: CudaIntegerRadixCiphertext, { - let mut res = unsafe { ct.duplicate_async(streams) 
}; + let mut res = ct.duplicate(streams); if !ct.block_carries_are_empty() { - unsafe { self.full_propagate_assign_async(&mut res, streams) }; + self.full_propagate_assign(&mut res, streams); }; if T::IS_SIGNED { - unsafe { self.unchecked_abs_assign_async(&mut res, streams) }; + self.unchecked_abs_assign(&mut res, streams); } - streams.synchronize(); res } } diff --git a/tfhe/src/integer/gpu/server_key/radix/add.rs b/tfhe/src/integer/gpu/server_key/radix/add.rs index 060fbee8c3..ffebbccd2c 100644 --- a/tfhe/src/integer/gpu/server_key/radix/add.rs +++ b/tfhe/src/integer/gpu/server_key/radix/add.rs @@ -7,7 +7,7 @@ use crate::integer::gpu::ciphertext::{ }; use crate::integer::gpu::server_key::{CudaBootstrappingKey, CudaServerKey}; use crate::integer::gpu::{ - unchecked_add_integer_radix_assign_async, + unchecked_add_integer_radix_assign, unchecked_partial_sum_ciphertexts_integer_radix_kb_assign_async, PBSType, }; use crate::integer::server_key::radix_parallel::OutputFlag; @@ -70,7 +70,7 @@ impl CudaServerKey { ct_right: &T, streams: &CudaStreams, ) -> T { - let mut result = unsafe { ct_left.duplicate_async(streams) }; + let mut result = ct_left.duplicate(streams); self.add_assign(&mut result, ct_right, streams); result } @@ -94,18 +94,18 @@ impl CudaServerKey { (true, true) => (ct_left, ct_right), (true, false) => { tmp_rhs = ct_right.duplicate_async(streams); - self.full_propagate_assign_async(&mut tmp_rhs, streams); + self.full_propagate_assign(&mut tmp_rhs, streams); (ct_left, &tmp_rhs) } (false, true) => { - self.full_propagate_assign_async(ct_left, streams); + self.full_propagate_assign(ct_left, streams); (ct_left, ct_right) } (false, false) => { tmp_rhs = ct_right.duplicate_async(streams); - self.full_propagate_assign_async(ct_left, streams); - self.full_propagate_assign_async(&mut tmp_rhs, streams); + self.full_propagate_assign(ct_left, streams); + self.full_propagate_assign(&mut tmp_rhs, streams); (ct_left, &tmp_rhs) } }; @@ -179,7 +179,7 @@ impl CudaServerKey { /// /// - `stream` __must__ be synchronized to guarantee computation has finished, and inputs must /// not be dropped until stream is synchronised - pub unsafe fn unchecked_add_assign_async( + pub fn unchecked_add_assign( &self, ct_left: &mut T, ct_right: &T, @@ -203,30 +203,9 @@ impl CudaServerKey { ciphertext_right.d_blocks.ciphertext_modulus() ); - let lwe_dimension = ciphertext_left.d_blocks.lwe_dimension(); - let lwe_ciphertext_count = ciphertext_left.d_blocks.lwe_ciphertext_count(); - - unchecked_add_integer_radix_assign_async( - streams, - &mut ciphertext_left.d_blocks.0.d_vec, - &ciphertext_right.d_blocks.0.d_vec, - lwe_dimension, - lwe_ciphertext_count.0 as u32, - ); - - ciphertext_left.info = ciphertext_left.info.after_add(&ciphertext_right.info); - } - - pub fn unchecked_add_assign( - &self, - ct_left: &mut T, - ct_right: &T, - streams: &CudaStreams, - ) { unsafe { - self.unchecked_add_assign_async(ct_left, ct_right, streams); + unchecked_add_integer_radix_assign(streams, ciphertext_left, ciphertext_right); } - streams.synchronize(); } /// # Safety @@ -417,7 +396,7 @@ impl CudaServerKey { .iter_mut() .filter(|ct| !ct.block_carries_are_empty()) .for_each(|ct| { - self.full_propagate_assign_async(&mut *ct, streams); + self.full_propagate_assign(&mut *ct, streams); }); Some(self.unchecked_sum_ciphertexts_async(&ciphertexts, streams)) @@ -477,14 +456,14 @@ impl CudaServerKey { (true, false) => { unsafe { tmp_rhs = ct_right.duplicate_async(stream); - self.full_propagate_assign_async(&mut tmp_rhs, stream); + 
self.full_propagate_assign(&mut tmp_rhs, stream); } (ct_left, &tmp_rhs) } (false, true) => { unsafe { tmp_lhs = ct_left.duplicate_async(stream); - self.full_propagate_assign_async(&mut tmp_lhs, stream); + self.full_propagate_assign(&mut tmp_lhs, stream); } (&tmp_lhs, ct_right) } @@ -493,8 +472,8 @@ impl CudaServerKey { tmp_lhs = ct_left.duplicate_async(stream); tmp_rhs = ct_right.duplicate_async(stream); - self.full_propagate_assign_async(&mut tmp_lhs, stream); - self.full_propagate_assign_async(&mut tmp_rhs, stream); + self.full_propagate_assign(&mut tmp_lhs, stream); + self.full_propagate_assign(&mut tmp_rhs, stream); } (&tmp_lhs, &tmp_rhs) @@ -664,14 +643,14 @@ impl CudaServerKey { (true, false) => { unsafe { tmp_rhs = ct_right.duplicate_async(stream); - self.full_propagate_assign_async(&mut tmp_rhs, stream); + self.full_propagate_assign(&mut tmp_rhs, stream); } (ct_left, &tmp_rhs) } (false, true) => { unsafe { tmp_lhs = ct_left.duplicate_async(stream); - self.full_propagate_assign_async(&mut tmp_lhs, stream); + self.full_propagate_assign(&mut tmp_lhs, stream); } (&tmp_lhs, ct_right) } @@ -680,8 +659,8 @@ impl CudaServerKey { tmp_lhs = ct_left.duplicate_async(stream); tmp_rhs = ct_right.duplicate_async(stream); - self.full_propagate_assign_async(&mut tmp_lhs, stream); - self.full_propagate_assign_async(&mut tmp_rhs, stream); + self.full_propagate_assign(&mut tmp_lhs, stream); + self.full_propagate_assign(&mut tmp_rhs, stream); } (&tmp_lhs, &tmp_rhs) diff --git a/tfhe/src/integer/gpu/server_key/radix/bitwise_op.rs b/tfhe/src/integer/gpu/server_key/radix/bitwise_op.rs index 0e77fda595..ea37d47dd2 100644 --- a/tfhe/src/integer/gpu/server_key/radix/bitwise_op.rs +++ b/tfhe/src/integer/gpu/server_key/radix/bitwise_op.rs @@ -464,18 +464,18 @@ impl CudaServerKey { (true, true) => (ct_left, ct_right), (true, false) => { tmp_rhs = ct_right.duplicate_async(streams); - self.full_propagate_assign_async(&mut tmp_rhs, streams); + self.full_propagate_assign(&mut tmp_rhs, streams); (ct_left, &tmp_rhs) } (false, true) => { - self.full_propagate_assign_async(ct_left, streams); + self.full_propagate_assign(ct_left, streams); (ct_left, ct_right) } (false, false) => { tmp_rhs = ct_right.duplicate_async(streams); - self.full_propagate_assign_async(ct_left, streams); - self.full_propagate_assign_async(&mut tmp_rhs, streams); + self.full_propagate_assign(ct_left, streams); + self.full_propagate_assign(&mut tmp_rhs, streams); (ct_left, &tmp_rhs) } } @@ -569,18 +569,18 @@ impl CudaServerKey { (true, true) => (ct_left, ct_right), (true, false) => { tmp_rhs = ct_right.duplicate_async(streams); - self.full_propagate_assign_async(&mut tmp_rhs, streams); + self.full_propagate_assign(&mut tmp_rhs, streams); (ct_left, &tmp_rhs) } (false, true) => { - self.full_propagate_assign_async(ct_left, streams); + self.full_propagate_assign(ct_left, streams); (ct_left, ct_right) } (false, false) => { tmp_rhs = ct_right.duplicate_async(streams); - self.full_propagate_assign_async(ct_left, streams); - self.full_propagate_assign_async(&mut tmp_rhs, streams); + self.full_propagate_assign(ct_left, streams); + self.full_propagate_assign(&mut tmp_rhs, streams); (ct_left, &tmp_rhs) } }; @@ -674,18 +674,18 @@ impl CudaServerKey { (true, true) => (ct_left, ct_right), (true, false) => { tmp_rhs = ct_right.duplicate_async(streams); - self.full_propagate_assign_async(&mut tmp_rhs, streams); + self.full_propagate_assign(&mut tmp_rhs, streams); (ct_left, &tmp_rhs) } (false, true) => { - self.full_propagate_assign_async(ct_left, streams); + 
self.full_propagate_assign(ct_left, streams); (ct_left, ct_right) } (false, false) => { tmp_rhs = ct_right.duplicate_async(streams); - self.full_propagate_assign_async(ct_left, streams); - self.full_propagate_assign_async(&mut tmp_rhs, streams); + self.full_propagate_assign(ct_left, streams); + self.full_propagate_assign(&mut tmp_rhs, streams); (ct_left, &tmp_rhs) } }; @@ -763,7 +763,7 @@ impl CudaServerKey { streams: &CudaStreams, ) { if !ct.block_carries_are_empty() { - self.full_propagate_assign_async(ct, streams); + self.full_propagate_assign(ct, streams); } self.unchecked_bitnot_assign_async(ct, streams); diff --git a/tfhe/src/integer/gpu/server_key/radix/cmux.rs b/tfhe/src/integer/gpu/server_key/radix/cmux.rs index 864a7914cd..421d66854d 100644 --- a/tfhe/src/integer/gpu/server_key/radix/cmux.rs +++ b/tfhe/src/integer/gpu/server_key/radix/cmux.rs @@ -3,14 +3,10 @@ use crate::core_crypto::prelude::LweBskGroupingFactor; use crate::integer::gpu::ciphertext::boolean_value::CudaBooleanBlock; use crate::integer::gpu::ciphertext::CudaIntegerRadixCiphertext; use crate::integer::gpu::server_key::CudaBootstrappingKey; -use crate::integer::gpu::{unchecked_cmux_integer_radix_kb_async, CudaServerKey, PBSType}; +use crate::integer::gpu::{unchecked_cmux_integer_radix_kb, CudaServerKey, PBSType}; impl CudaServerKey { - /// # Safety - /// - /// - `stream` __must__ be synchronized to guarantee computation has finished, and inputs must - /// not be dropped until stream is synchronised - pub unsafe fn unchecked_if_then_else_async( + pub fn unchecked_if_then_else( &self, condition: &CudaBooleanBlock, true_ct: &T, @@ -21,80 +17,70 @@ impl CudaServerKey { let mut result: T = self .create_trivial_zero_radix(true_ct.as_ref().d_blocks.lwe_ciphertext_count().0, stream); - match &self.bootstrapping_key { - CudaBootstrappingKey::Classic(d_bsk) => { - unchecked_cmux_integer_radix_kb_async( - stream, - &mut result.as_mut().d_blocks.0.d_vec, - &condition.as_ref().ciphertext.d_blocks.0.d_vec, - &true_ct.as_ref().d_blocks.0.d_vec, - &false_ct.as_ref().d_blocks.0.d_vec, - &d_bsk.d_vec, - &self.key_switching_key.d_vec, - self.message_modulus, - self.carry_modulus, - d_bsk.glwe_dimension, - d_bsk.polynomial_size, - self.key_switching_key - .input_key_lwe_size() - .to_lwe_dimension(), - self.key_switching_key - .output_key_lwe_size() - .to_lwe_dimension(), - self.key_switching_key.decomposition_level_count(), - self.key_switching_key.decomposition_base_log(), - d_bsk.decomp_level_count, - d_bsk.decomp_base_log, - lwe_ciphertext_count.0 as u32, - PBSType::Classical, - LweBskGroupingFactor(0), - ); - } - CudaBootstrappingKey::MultiBit(d_multibit_bsk) => { - unchecked_cmux_integer_radix_kb_async( - stream, - &mut result.as_mut().d_blocks.0.d_vec, - &condition.as_ref().ciphertext.d_blocks.0.d_vec, - &true_ct.as_ref().d_blocks.0.d_vec, - &false_ct.as_ref().d_blocks.0.d_vec, - &d_multibit_bsk.d_vec, - &self.key_switching_key.d_vec, - self.message_modulus, - self.carry_modulus, - d_multibit_bsk.glwe_dimension, - d_multibit_bsk.polynomial_size, - self.key_switching_key - .input_key_lwe_size() - .to_lwe_dimension(), - self.key_switching_key - .output_key_lwe_size() - .to_lwe_dimension(), - self.key_switching_key.decomposition_level_count(), - self.key_switching_key.decomposition_base_log(), - d_multibit_bsk.decomp_level_count, - d_multibit_bsk.decomp_base_log, - lwe_ciphertext_count.0 as u32, - PBSType::MultiBit, - d_multibit_bsk.grouping_factor, - ); + unsafe { + match &self.bootstrapping_key { + 
CudaBootstrappingKey::Classic(d_bsk) => { + unchecked_cmux_integer_radix_kb( + stream, + result.as_mut(), + condition, + true_ct.as_ref(), + false_ct.as_ref(), + &d_bsk.d_vec, + &self.key_switching_key.d_vec, + self.message_modulus, + self.carry_modulus, + d_bsk.glwe_dimension, + d_bsk.polynomial_size, + self.key_switching_key + .input_key_lwe_size() + .to_lwe_dimension(), + self.key_switching_key + .output_key_lwe_size() + .to_lwe_dimension(), + self.key_switching_key.decomposition_level_count(), + self.key_switching_key.decomposition_base_log(), + d_bsk.decomp_level_count, + d_bsk.decomp_base_log, + lwe_ciphertext_count.0 as u32, + PBSType::Classical, + LweBskGroupingFactor(0), + ); + } + CudaBootstrappingKey::MultiBit(d_multibit_bsk) => { + unchecked_cmux_integer_radix_kb( + stream, + result.as_mut(), + condition, + true_ct.as_ref(), + false_ct.as_ref(), + &d_multibit_bsk.d_vec, + &self.key_switching_key.d_vec, + self.message_modulus, + self.carry_modulus, + d_multibit_bsk.glwe_dimension, + d_multibit_bsk.polynomial_size, + self.key_switching_key + .input_key_lwe_size() + .to_lwe_dimension(), + self.key_switching_key + .output_key_lwe_size() + .to_lwe_dimension(), + self.key_switching_key.decomposition_level_count(), + self.key_switching_key.decomposition_base_log(), + d_multibit_bsk.decomp_level_count, + d_multibit_bsk.decomp_base_log, + lwe_ciphertext_count.0 as u32, + PBSType::MultiBit, + d_multibit_bsk.grouping_factor, + ); + } } } result.as_mut().info = true_ct.as_ref().info.after_if_then_else(); result } - pub fn unchecked_if_then_else( - &self, - condition: &CudaBooleanBlock, - true_ct: &T, - false_ct: &T, - stream: &CudaStreams, - ) -> T { - let result = - unsafe { self.unchecked_if_then_else_async(condition, true_ct, false_ct, stream) }; - stream.synchronize(); - result - } pub fn if_then_else( &self, @@ -106,26 +92,22 @@ impl CudaServerKey { let mut tmp_true_ct; let mut tmp_false_ct; - let result = unsafe { - let true_ct = if true_ct.block_carries_are_empty() { - true_ct - } else { - tmp_true_ct = true_ct.duplicate_async(stream); - self.full_propagate_assign_async(&mut tmp_true_ct, stream); - &tmp_true_ct - }; - - let false_ct = if false_ct.block_carries_are_empty() { - false_ct - } else { - tmp_false_ct = false_ct.duplicate_async(stream); - self.full_propagate_assign_async(&mut tmp_false_ct, stream); - &tmp_false_ct - }; + let true_ct = if true_ct.block_carries_are_empty() { + true_ct + } else { + tmp_true_ct = true_ct.duplicate(stream); + self.full_propagate_assign(&mut tmp_true_ct, stream); + &tmp_true_ct + }; - self.unchecked_if_then_else_async(condition, true_ct, false_ct, stream) + let false_ct = if false_ct.block_carries_are_empty() { + false_ct + } else { + tmp_false_ct = false_ct.duplicate(stream); + self.full_propagate_assign(&mut tmp_false_ct, stream); + &tmp_false_ct }; - stream.synchronize(); - result + + self.unchecked_if_then_else(condition, true_ct, false_ct, stream) } } diff --git a/tfhe/src/integer/gpu/server_key/radix/comparison.rs b/tfhe/src/integer/gpu/server_key/radix/comparison.rs index 26d728747d..41b8902d97 100644 --- a/tfhe/src/integer/gpu/server_key/radix/comparison.rs +++ b/tfhe/src/integer/gpu/server_key/radix/comparison.rs @@ -285,20 +285,20 @@ impl CudaServerKey { (true, true) => (ct_left, ct_right), (true, false) => { tmp_rhs = ct_right.duplicate_async(streams); - self.full_propagate_assign_async(&mut tmp_rhs, streams); + self.full_propagate_assign(&mut tmp_rhs, streams); (ct_left, &tmp_rhs) } (false, true) => { tmp_lhs = 
ct_left.duplicate_async(streams); - self.full_propagate_assign_async(&mut tmp_lhs, streams); + self.full_propagate_assign(&mut tmp_lhs, streams); (&tmp_lhs, ct_right) } (false, false) => { tmp_lhs = ct_left.duplicate_async(streams); tmp_rhs = ct_right.duplicate_async(streams); - self.full_propagate_assign_async(&mut tmp_lhs, streams); - self.full_propagate_assign_async(&mut tmp_rhs, streams); + self.full_propagate_assign(&mut tmp_lhs, streams); + self.full_propagate_assign(&mut tmp_rhs, streams); (&tmp_lhs, &tmp_rhs) } }; @@ -379,20 +379,20 @@ impl CudaServerKey { (true, true) => (ct_left, ct_right), (true, false) => { tmp_rhs = ct_right.duplicate_async(streams); - self.full_propagate_assign_async(&mut tmp_rhs, streams); + self.full_propagate_assign(&mut tmp_rhs, streams); (ct_left, &tmp_rhs) } (false, true) => { tmp_lhs = ct_left.duplicate_async(streams); - self.full_propagate_assign_async(&mut tmp_lhs, streams); + self.full_propagate_assign(&mut tmp_lhs, streams); (&tmp_lhs, ct_right) } (false, false) => { tmp_lhs = ct_left.duplicate_async(streams); tmp_rhs = ct_right.duplicate_async(streams); - self.full_propagate_assign_async(&mut tmp_lhs, streams); - self.full_propagate_assign_async(&mut tmp_rhs, streams); + self.full_propagate_assign(&mut tmp_lhs, streams); + self.full_propagate_assign(&mut tmp_rhs, streams); (&tmp_lhs, &tmp_rhs) } }; @@ -558,20 +558,20 @@ impl CudaServerKey { (true, true) => (ct_left, ct_right), (true, false) => { tmp_rhs = ct_right.duplicate_async(streams); - self.full_propagate_assign_async(&mut tmp_rhs, streams); + self.full_propagate_assign(&mut tmp_rhs, streams); (ct_left, &tmp_rhs) } (false, true) => { tmp_lhs = ct_left.duplicate_async(streams); - self.full_propagate_assign_async(&mut tmp_lhs, streams); + self.full_propagate_assign(&mut tmp_lhs, streams); (&tmp_lhs, ct_right) } (false, false) => { tmp_lhs = ct_left.duplicate_async(streams); tmp_rhs = ct_right.duplicate_async(streams); - self.full_propagate_assign_async(&mut tmp_lhs, streams); - self.full_propagate_assign_async(&mut tmp_rhs, streams); + self.full_propagate_assign(&mut tmp_lhs, streams); + self.full_propagate_assign(&mut tmp_rhs, streams); (&tmp_lhs, &tmp_rhs) } }; @@ -666,20 +666,20 @@ impl CudaServerKey { (true, true) => (ct_left, ct_right), (true, false) => { tmp_rhs = ct_right.duplicate_async(streams); - self.full_propagate_assign_async(&mut tmp_rhs, streams); + self.full_propagate_assign(&mut tmp_rhs, streams); (ct_left, &tmp_rhs) } (false, true) => { tmp_lhs = ct_left.duplicate_async(streams); - self.full_propagate_assign_async(&mut tmp_lhs, streams); + self.full_propagate_assign(&mut tmp_lhs, streams); (&tmp_lhs, ct_right) } (false, false) => { tmp_lhs = ct_left.duplicate_async(streams); tmp_rhs = ct_right.duplicate_async(streams); - self.full_propagate_assign_async(&mut tmp_lhs, streams); - self.full_propagate_assign_async(&mut tmp_rhs, streams); + self.full_propagate_assign(&mut tmp_lhs, streams); + self.full_propagate_assign(&mut tmp_rhs, streams); (&tmp_lhs, &tmp_rhs) } }; @@ -790,20 +790,20 @@ impl CudaServerKey { (true, true) => (ct_left, ct_right), (true, false) => { tmp_rhs = ct_right.duplicate_async(streams); - self.full_propagate_assign_async(&mut tmp_rhs, streams); + self.full_propagate_assign(&mut tmp_rhs, streams); (ct_left, &tmp_rhs) } (false, true) => { tmp_lhs = ct_left.duplicate_async(streams); - self.full_propagate_assign_async(&mut tmp_lhs, streams); + self.full_propagate_assign(&mut tmp_lhs, streams); (&tmp_lhs, ct_right) } (false, false) => { tmp_lhs = 
ct_left.duplicate_async(streams); tmp_rhs = ct_right.duplicate_async(streams); - self.full_propagate_assign_async(&mut tmp_lhs, streams); - self.full_propagate_assign_async(&mut tmp_rhs, streams); + self.full_propagate_assign(&mut tmp_lhs, streams); + self.full_propagate_assign(&mut tmp_rhs, streams); (&tmp_lhs, &tmp_rhs) } }; @@ -914,20 +914,20 @@ impl CudaServerKey { (true, true) => (ct_left, ct_right), (true, false) => { tmp_rhs = ct_right.duplicate_async(streams); - self.full_propagate_assign_async(&mut tmp_rhs, streams); + self.full_propagate_assign(&mut tmp_rhs, streams); (ct_left, &tmp_rhs) } (false, true) => { tmp_lhs = ct_left.duplicate_async(streams); - self.full_propagate_assign_async(&mut tmp_lhs, streams); + self.full_propagate_assign(&mut tmp_lhs, streams); (&tmp_lhs, ct_right) } (false, false) => { tmp_lhs = ct_left.duplicate_async(streams); tmp_rhs = ct_right.duplicate_async(streams); - self.full_propagate_assign_async(&mut tmp_lhs, streams); - self.full_propagate_assign_async(&mut tmp_rhs, streams); + self.full_propagate_assign(&mut tmp_lhs, streams); + self.full_propagate_assign(&mut tmp_rhs, streams); (&tmp_lhs, &tmp_rhs) } }; @@ -1161,20 +1161,20 @@ impl CudaServerKey { (true, true) => (ct_left, ct_right), (true, false) => { tmp_rhs = ct_right.duplicate_async(streams); - self.full_propagate_assign_async(&mut tmp_rhs, streams); + self.full_propagate_assign(&mut tmp_rhs, streams); (ct_left, &tmp_rhs) } (false, true) => { tmp_lhs = ct_left.duplicate_async(streams); - self.full_propagate_assign_async(&mut tmp_lhs, streams); + self.full_propagate_assign(&mut tmp_lhs, streams); (&tmp_lhs, ct_right) } (false, false) => { tmp_lhs = ct_left.duplicate_async(streams); tmp_rhs = ct_right.duplicate_async(streams); - self.full_propagate_assign_async(&mut tmp_lhs, streams); - self.full_propagate_assign_async(&mut tmp_rhs, streams); + self.full_propagate_assign(&mut tmp_lhs, streams); + self.full_propagate_assign(&mut tmp_rhs, streams); (&tmp_lhs, &tmp_rhs) } }; @@ -1208,20 +1208,20 @@ impl CudaServerKey { (true, true) => (ct_left, ct_right), (true, false) => { tmp_rhs = ct_right.duplicate_async(streams); - self.full_propagate_assign_async(&mut tmp_rhs, streams); + self.full_propagate_assign(&mut tmp_rhs, streams); (ct_left, &tmp_rhs) } (false, true) => { tmp_lhs = ct_left.duplicate_async(streams); - self.full_propagate_assign_async(&mut tmp_lhs, streams); + self.full_propagate_assign(&mut tmp_lhs, streams); (&tmp_lhs, ct_right) } (false, false) => { tmp_lhs = ct_left.duplicate_async(streams); tmp_rhs = ct_right.duplicate_async(streams); - self.full_propagate_assign_async(&mut tmp_lhs, streams); - self.full_propagate_assign_async(&mut tmp_rhs, streams); + self.full_propagate_assign(&mut tmp_lhs, streams); + self.full_propagate_assign(&mut tmp_rhs, streams); (&tmp_lhs, &tmp_rhs) } }; diff --git a/tfhe/src/integer/gpu/server_key/radix/div_mod.rs b/tfhe/src/integer/gpu/server_key/radix/div_mod.rs index de1ce80a8d..4a7994edcc 100644 --- a/tfhe/src/integer/gpu/server_key/radix/div_mod.rs +++ b/tfhe/src/integer/gpu/server_key/radix/div_mod.rs @@ -106,8 +106,8 @@ impl CudaServerKey { where T: CudaIntegerRadixCiphertext, { - let mut quotient = unsafe { numerator.duplicate_async(streams) }; - let mut remainder = unsafe { numerator.duplicate_async(streams) }; + let mut quotient = numerator.duplicate(streams); + let mut remainder = numerator.duplicate(streams); unsafe { self.unchecked_div_rem_assign_async( @@ -135,22 +135,20 @@ impl CudaServerKey { ) { (true, true) => (numerator, divisor), (true, 
false) => { - tmp_divisor = unsafe { divisor.duplicate_async(streams) }; - unsafe { self.full_propagate_assign_async(&mut tmp_divisor, streams) }; + tmp_divisor = divisor.duplicate(streams); + self.full_propagate_assign(&mut tmp_divisor, streams); (numerator, &tmp_divisor) } (false, true) => { - tmp_numerator = unsafe { numerator.duplicate_async(streams) }; - unsafe { self.full_propagate_assign_async(&mut tmp_numerator, streams) }; + tmp_numerator = numerator.duplicate(streams); + self.full_propagate_assign(&mut tmp_numerator, streams); (&tmp_numerator, divisor) } (false, false) => { - tmp_divisor = unsafe { divisor.duplicate_async(streams) }; - tmp_numerator = unsafe { numerator.duplicate_async(streams) }; - unsafe { - self.full_propagate_assign_async(&mut tmp_numerator, streams); - self.full_propagate_assign_async(&mut tmp_divisor, streams); - } + tmp_divisor = divisor.duplicate(streams); + tmp_numerator = numerator.duplicate(streams); + self.full_propagate_assign(&mut tmp_numerator, streams); + self.full_propagate_assign(&mut tmp_divisor, streams); (&tmp_numerator, &tmp_divisor) } }; @@ -177,22 +175,20 @@ impl CudaServerKey { ) { (true, true) => (numerator, divisor), (true, false) => { - tmp_divisor = unsafe { divisor.duplicate_async(streams) }; - unsafe { self.full_propagate_assign_async(&mut tmp_divisor, streams) }; + tmp_divisor = divisor.duplicate(streams); + self.full_propagate_assign(&mut tmp_divisor, streams); (numerator, &tmp_divisor) } (false, true) => { - tmp_numerator = unsafe { numerator.duplicate_async(streams) }; - unsafe { self.full_propagate_assign_async(&mut tmp_numerator, streams) }; + tmp_numerator = numerator.duplicate(streams); + self.full_propagate_assign(&mut tmp_numerator, streams); (&tmp_numerator, divisor) } (false, false) => { - tmp_divisor = unsafe { divisor.duplicate_async(streams) }; - tmp_numerator = unsafe { numerator.duplicate_async(streams) }; - unsafe { - self.full_propagate_assign_async(&mut tmp_numerator, streams); - self.full_propagate_assign_async(&mut tmp_divisor, streams); - } + tmp_divisor = divisor.duplicate(streams); + tmp_numerator = numerator.duplicate(streams); + self.full_propagate_assign(&mut tmp_numerator, streams); + self.full_propagate_assign(&mut tmp_divisor, streams); (&tmp_numerator, &tmp_divisor) } }; diff --git a/tfhe/src/integer/gpu/server_key/radix/ilog2.rs b/tfhe/src/integer/gpu/server_key/radix/ilog2.rs index 43186f2310..198673d95c 100644 --- a/tfhe/src/integer/gpu/server_key/radix/ilog2.rs +++ b/tfhe/src/integer/gpu/server_key/radix/ilog2.rs @@ -847,7 +847,7 @@ impl CudaServerKey { ct } else { tmp = ct.duplicate_async(streams); - self.full_propagate_assign_async(&mut tmp, streams); + self.full_propagate_assign(&mut tmp, streams); &tmp }; self.unchecked_trailing_zeros_async(ct, streams) @@ -920,7 +920,7 @@ impl CudaServerKey { ct } else { tmp = ct.duplicate_async(streams); - self.full_propagate_assign_async(&mut tmp, streams); + self.full_propagate_assign(&mut tmp, streams); &tmp }; self.unchecked_trailing_ones_async(ct, streams) @@ -993,7 +993,7 @@ impl CudaServerKey { ct } else { tmp = ct.duplicate_async(streams); - self.full_propagate_assign_async(&mut tmp, streams); + self.full_propagate_assign(&mut tmp, streams); &tmp }; self.unchecked_leading_zeros_async(ct, streams) @@ -1066,7 +1066,7 @@ impl CudaServerKey { ct } else { tmp = ct.duplicate_async(streams); - self.full_propagate_assign_async(&mut tmp, streams); + self.full_propagate_assign(&mut tmp, streams); &tmp }; self.unchecked_leading_ones_async(ct, streams) @@ 
-1132,7 +1132,7 @@ impl CudaServerKey { ct } else { tmp = ct.duplicate_async(streams); - self.full_propagate_assign_async(&mut tmp, streams); + self.full_propagate_assign(&mut tmp, streams); &tmp }; @@ -1207,7 +1207,7 @@ impl CudaServerKey { ct } else { tmp = ct.duplicate_async(streams); - self.full_propagate_assign_async(&mut tmp, streams); + self.full_propagate_assign(&mut tmp, streams); &tmp }; diff --git a/tfhe/src/integer/gpu/server_key/radix/mod.rs b/tfhe/src/integer/gpu/server_key/radix/mod.rs index 6c8031da2f..e4091abff7 100644 --- a/tfhe/src/integer/gpu/server_key/radix/mod.rs +++ b/tfhe/src/integer/gpu/server_key/radix/mod.rs @@ -379,59 +379,57 @@ impl CudaServerKey { carry_out } - /// # Safety - /// - /// - `streams` __must__ be synchronized to guarantee computation has finished, and inputs must - /// not be dropped until streams is synchronized - pub(crate) unsafe fn full_propagate_assign_async( + pub(crate) fn full_propagate_assign( &self, ct: &mut T, streams: &CudaStreams, ) { let ciphertext = ct.as_mut(); let num_blocks = ciphertext.d_blocks.lwe_ciphertext_count().0 as u32; - match &self.bootstrapping_key { - CudaBootstrappingKey::Classic(d_bsk) => { - full_propagate_assign_async( - streams, - &mut ciphertext.d_blocks.0.d_vec, - &d_bsk.d_vec, - &self.key_switching_key.d_vec, - d_bsk.input_lwe_dimension(), - d_bsk.glwe_dimension(), - d_bsk.polynomial_size(), - self.key_switching_key.decomposition_level_count(), - self.key_switching_key.decomposition_base_log(), - d_bsk.decomp_level_count(), - d_bsk.decomp_base_log(), - num_blocks, - ciphertext.info.blocks.first().unwrap().message_modulus, - ciphertext.info.blocks.first().unwrap().carry_modulus, - PBSType::Classical, - LweBskGroupingFactor(0), - ); - } - CudaBootstrappingKey::MultiBit(d_multibit_bsk) => { - full_propagate_assign_async( - streams, - &mut ciphertext.d_blocks.0.d_vec, - &d_multibit_bsk.d_vec, - &self.key_switching_key.d_vec, - d_multibit_bsk.input_lwe_dimension(), - d_multibit_bsk.glwe_dimension(), - d_multibit_bsk.polynomial_size(), - self.key_switching_key.decomposition_level_count(), - self.key_switching_key.decomposition_base_log(), - d_multibit_bsk.decomp_level_count(), - d_multibit_bsk.decomp_base_log(), - num_blocks, - ciphertext.info.blocks.first().unwrap().message_modulus, - ciphertext.info.blocks.first().unwrap().carry_modulus, - PBSType::MultiBit, - d_multibit_bsk.grouping_factor, - ); - } - }; + unsafe { + match &self.bootstrapping_key { + CudaBootstrappingKey::Classic(d_bsk) => { + full_propagate_assign_async( + streams, + &mut ciphertext.d_blocks.0.d_vec, + &d_bsk.d_vec, + &self.key_switching_key.d_vec, + d_bsk.input_lwe_dimension(), + d_bsk.glwe_dimension(), + d_bsk.polynomial_size(), + self.key_switching_key.decomposition_level_count(), + self.key_switching_key.decomposition_base_log(), + d_bsk.decomp_level_count(), + d_bsk.decomp_base_log(), + num_blocks, + ciphertext.info.blocks.first().unwrap().message_modulus, + ciphertext.info.blocks.first().unwrap().carry_modulus, + PBSType::Classical, + LweBskGroupingFactor(0), + ); + } + CudaBootstrappingKey::MultiBit(d_multibit_bsk) => { + full_propagate_assign_async( + streams, + &mut ciphertext.d_blocks.0.d_vec, + &d_multibit_bsk.d_vec, + &self.key_switching_key.d_vec, + d_multibit_bsk.input_lwe_dimension(), + d_multibit_bsk.glwe_dimension(), + d_multibit_bsk.polynomial_size(), + self.key_switching_key.decomposition_level_count(), + self.key_switching_key.decomposition_base_log(), + d_multibit_bsk.decomp_level_count(), + 
d_multibit_bsk.decomp_base_log(), + num_blocks, + ciphertext.info.blocks.first().unwrap().message_modulus, + ciphertext.info.blocks.first().unwrap().carry_modulus, + PBSType::MultiBit, + d_multibit_bsk.grouping_factor, + ); + } + }; + } ciphertext.info.blocks.iter_mut().for_each(|b| { b.degree = Degree::new(b.message_modulus.0 - 1); b.noise_level = if b.noise_level == NoiseLevel::ZERO { @@ -440,6 +438,7 @@ impl CudaServerKey { NoiseLevel::NOMINAL }; }); + streams.synchronize(); } /// Prepend trivial zero LSB blocks to an existing [`CudaUnsignedRadixCiphertext`] or @@ -1347,7 +1346,7 @@ impl CudaServerKey { T: CudaIntegerRadixCiphertext, { if !source.block_carries_are_empty() { - self.full_propagate_assign_async(&mut source, streams); + self.full_propagate_assign(&mut source, streams); } let current_num_blocks = source.as_ref().info.blocks.len(); if T::IS_SIGNED { @@ -1453,7 +1452,7 @@ impl CudaServerKey { T: CudaIntegerRadixCiphertext, { if !source.block_carries_are_empty() { - self.full_propagate_assign_async(&mut source, streams); + self.full_propagate_assign(&mut source, streams); } let current_num_blocks = source.as_ref().info.blocks.len(); diff --git a/tfhe/src/integer/gpu/server_key/radix/mul.rs b/tfhe/src/integer/gpu/server_key/radix/mul.rs index e9f568bbf0..ee27fbc370 100644 --- a/tfhe/src/integer/gpu/server_key/radix/mul.rs +++ b/tfhe/src/integer/gpu/server_key/radix/mul.rs @@ -214,18 +214,18 @@ impl CudaServerKey { (true, true) => (ct_left, ct_right), (true, false) => { tmp_rhs = ct_right.duplicate_async(streams); - self.full_propagate_assign_async(&mut tmp_rhs, streams); + self.full_propagate_assign(&mut tmp_rhs, streams); (ct_left, &tmp_rhs) } (false, true) => { - self.full_propagate_assign_async(ct_left, streams); + self.full_propagate_assign(ct_left, streams); (ct_left, ct_right) } (false, false) => { tmp_rhs = ct_right.duplicate_async(streams); - self.full_propagate_assign_async(ct_left, streams); - self.full_propagate_assign_async(&mut tmp_rhs, streams); + self.full_propagate_assign(ct_left, streams); + self.full_propagate_assign(&mut tmp_rhs, streams); (ct_left, &tmp_rhs) } }; diff --git a/tfhe/src/integer/gpu/server_key/radix/neg.rs b/tfhe/src/integer/gpu/server_key/radix/neg.rs index 6cd81e26ac..6e2d0d456e 100644 --- a/tfhe/src/integer/gpu/server_key/radix/neg.rs +++ b/tfhe/src/integer/gpu/server_key/radix/neg.rs @@ -142,7 +142,7 @@ impl CudaServerKey { ctxt } else { tmp_ctxt = ctxt.duplicate_async(streams); - self.full_propagate_assign_async(&mut tmp_ctxt, streams); + self.full_propagate_assign(&mut tmp_ctxt, streams); &mut tmp_ctxt }; diff --git a/tfhe/src/integer/gpu/server_key/radix/rotate.rs b/tfhe/src/integer/gpu/server_key/radix/rotate.rs index b7f6348454..042e06342c 100644 --- a/tfhe/src/integer/gpu/server_key/radix/rotate.rs +++ b/tfhe/src/integer/gpu/server_key/radix/rotate.rs @@ -271,20 +271,20 @@ impl CudaServerKey { (true, true) => (ct, rotate), (true, false) => { tmp_rhs = rotate.duplicate_async(streams); - self.full_propagate_assign_async(&mut tmp_rhs, streams); + self.full_propagate_assign(&mut tmp_rhs, streams); (ct, &tmp_rhs) } (false, true) => { tmp_lhs = ct.duplicate_async(streams); - self.full_propagate_assign_async(&mut tmp_lhs, streams); + self.full_propagate_assign(&mut tmp_lhs, streams); (&tmp_lhs, rotate) } (false, false) => { tmp_lhs = ct.duplicate_async(streams); tmp_rhs = rotate.duplicate_async(streams); - self.full_propagate_assign_async(&mut tmp_lhs, streams); - self.full_propagate_assign_async(&mut tmp_rhs, streams); + 
self.full_propagate_assign(&mut tmp_lhs, streams); + self.full_propagate_assign(&mut tmp_rhs, streams); (&tmp_lhs, &tmp_rhs) } }; @@ -316,20 +316,20 @@ impl CudaServerKey { (true, true) => (ct, rotate), (true, false) => { tmp_rhs = rotate.duplicate_async(streams); - self.full_propagate_assign_async(&mut tmp_rhs, streams); + self.full_propagate_assign(&mut tmp_rhs, streams); (ct, &tmp_rhs) } (false, true) => { tmp_lhs = ct.duplicate_async(streams); - self.full_propagate_assign_async(&mut tmp_lhs, streams); + self.full_propagate_assign(&mut tmp_lhs, streams); (&mut tmp_lhs, rotate) } (false, false) => { tmp_lhs = ct.duplicate_async(streams); tmp_rhs = rotate.duplicate_async(streams); - self.full_propagate_assign_async(&mut tmp_lhs, streams); - self.full_propagate_assign_async(&mut tmp_rhs, streams); + self.full_propagate_assign(&mut tmp_lhs, streams); + self.full_propagate_assign(&mut tmp_rhs, streams); (&mut tmp_lhs, &tmp_rhs) } }; @@ -425,20 +425,20 @@ impl CudaServerKey { (true, true) => (ct, rotate), (true, false) => { tmp_rhs = rotate.duplicate_async(streams); - self.full_propagate_assign_async(&mut tmp_rhs, streams); + self.full_propagate_assign(&mut tmp_rhs, streams); (ct, &tmp_rhs) } (false, true) => { tmp_lhs = ct.duplicate_async(streams); - self.full_propagate_assign_async(&mut tmp_lhs, streams); + self.full_propagate_assign(&mut tmp_lhs, streams); (&tmp_lhs, rotate) } (false, false) => { tmp_lhs = ct.duplicate_async(streams); tmp_rhs = rotate.duplicate_async(streams); - self.full_propagate_assign_async(&mut tmp_lhs, streams); - self.full_propagate_assign_async(&mut tmp_rhs, streams); + self.full_propagate_assign(&mut tmp_lhs, streams); + self.full_propagate_assign(&mut tmp_rhs, streams); (&tmp_lhs, &tmp_rhs) } }; @@ -470,20 +470,20 @@ impl CudaServerKey { (true, true) => (ct, rotate), (true, false) => { tmp_rhs = rotate.duplicate_async(streams); - self.full_propagate_assign_async(&mut tmp_rhs, streams); + self.full_propagate_assign(&mut tmp_rhs, streams); (ct, &tmp_rhs) } (false, true) => { tmp_lhs = ct.duplicate_async(streams); - self.full_propagate_assign_async(&mut tmp_lhs, streams); + self.full_propagate_assign(&mut tmp_lhs, streams); (&mut tmp_lhs, rotate) } (false, false) => { tmp_lhs = ct.duplicate_async(streams); tmp_rhs = rotate.duplicate_async(streams); - self.full_propagate_assign_async(&mut tmp_lhs, streams); - self.full_propagate_assign_async(&mut tmp_rhs, streams); + self.full_propagate_assign(&mut tmp_lhs, streams); + self.full_propagate_assign(&mut tmp_rhs, streams); (&mut tmp_lhs, &tmp_rhs) } }; diff --git a/tfhe/src/integer/gpu/server_key/radix/scalar_add.rs b/tfhe/src/integer/gpu/server_key/radix/scalar_add.rs index 597d0e2e55..0d0fb8fc5e 100644 --- a/tfhe/src/integer/gpu/server_key/radix/scalar_add.rs +++ b/tfhe/src/integer/gpu/server_key/radix/scalar_add.rs @@ -60,7 +60,7 @@ impl CudaServerKey { Scalar: DecomposableInto + CastInto, T: CudaIntegerRadixCiphertext, { - let mut result = unsafe { ct.duplicate_async(streams) }; + let mut result = ct.duplicate(streams); self.unchecked_scalar_add_assign(&mut result, scalar, streams); result } @@ -166,7 +166,7 @@ impl CudaServerKey { Scalar: DecomposableInto + CastInto, T: CudaIntegerRadixCiphertext, { - let mut result = unsafe { ct.duplicate_async(streams) }; + let mut result = ct.duplicate(streams); self.scalar_add_assign(&mut result, scalar, streams); result } @@ -185,7 +185,7 @@ impl CudaServerKey { T: CudaIntegerRadixCiphertext, { if !ct.block_carries_are_empty() { - self.full_propagate_assign_async(ct, streams); 
+ self.full_propagate_assign(ct, streams); }; self.unchecked_scalar_add_assign_async(ct, scalar, streams); @@ -213,9 +213,7 @@ impl CudaServerKey { Scalar: DecomposableInto + CastInto, { let mut result; - unsafe { - result = ct_left.duplicate_async(stream); - } + result = ct_left.duplicate(stream); let overflowed = self.unsigned_overflowing_scalar_add_assign(&mut result, scalar, stream); (result, overflowed) } @@ -229,10 +227,8 @@ impl CudaServerKey { where Scalar: DecomposableInto + CastInto, { - unsafe { - if !ct_left.block_carries_are_empty() { - self.full_propagate_assign_async(ct_left, stream); - } + if !ct_left.block_carries_are_empty() { + self.full_propagate_assign(ct_left, stream); } self.unchecked_unsigned_overflowing_scalar_add_assign(ct_left, scalar, stream) } @@ -247,9 +243,7 @@ impl CudaServerKey { Scalar: DecomposableInto + CastInto, { let mut result; - unsafe { - result = ct_left.duplicate_async(stream); - } + result = ct_left.duplicate(stream); let overflowed = self.unchecked_unsigned_overflowing_scalar_add_assign(&mut result, scalar, stream); (result, overflowed) @@ -334,11 +328,9 @@ impl CudaServerKey { Scalar: SignedNumeric + DecomposableInto + CastInto, { let mut tmp_lhs; - unsafe { - tmp_lhs = ct_left.duplicate_async(streams); - if !tmp_lhs.block_carries_are_empty() { - self.full_propagate_assign_async(&mut tmp_lhs, streams); - } + tmp_lhs = ct_left.duplicate(streams); + if !tmp_lhs.block_carries_are_empty() { + self.full_propagate_assign(&mut tmp_lhs, streams); } let trivial: CudaSignedRadixCiphertext = self.create_trivial_radix( diff --git a/tfhe/src/integer/gpu/server_key/radix/scalar_bitwise_op.rs b/tfhe/src/integer/gpu/server_key/radix/scalar_bitwise_op.rs index 337abdd928..4b86abcee5 100644 --- a/tfhe/src/integer/gpu/server_key/radix/scalar_bitwise_op.rs +++ b/tfhe/src/integer/gpu/server_key/radix/scalar_bitwise_op.rs @@ -193,7 +193,7 @@ impl CudaServerKey { T: CudaIntegerRadixCiphertext, { if !ct.block_carries_are_empty() { - self.full_propagate_assign_async(ct, streams); + self.full_propagate_assign(ct, streams); } self.unchecked_scalar_bitop_assign_async(ct, rhs, BitOpType::ScalarAnd, streams); ct.as_mut().info = ct.as_ref().info.after_scalar_bitand(rhs); @@ -234,7 +234,7 @@ impl CudaServerKey { T: CudaIntegerRadixCiphertext, { if !ct.block_carries_are_empty() { - self.full_propagate_assign_async(ct, streams); + self.full_propagate_assign(ct, streams); } self.unchecked_scalar_bitop_assign_async(ct, rhs, BitOpType::ScalarOr, streams); ct.as_mut().info = ct.as_ref().info.after_scalar_bitor(rhs); @@ -275,7 +275,7 @@ impl CudaServerKey { T: CudaIntegerRadixCiphertext, { if !ct.block_carries_are_empty() { - self.full_propagate_assign_async(ct, streams); + self.full_propagate_assign(ct, streams); } self.unchecked_scalar_bitop_assign_async(ct, rhs, BitOpType::ScalarXor, streams); ct.as_mut().info = ct.as_ref().info.after_scalar_bitxor(rhs); diff --git a/tfhe/src/integer/gpu/server_key/radix/scalar_comparison.rs b/tfhe/src/integer/gpu/server_key/radix/scalar_comparison.rs index 848f85cc26..f436b986b0 100644 --- a/tfhe/src/integer/gpu/server_key/radix/scalar_comparison.rs +++ b/tfhe/src/integer/gpu/server_key/radix/scalar_comparison.rs @@ -606,7 +606,7 @@ impl CudaServerKey { ct } else { tmp_lhs = ct.duplicate_async(streams); - self.full_propagate_assign_async(&mut tmp_lhs, streams); + self.full_propagate_assign(&mut tmp_lhs, streams); &tmp_lhs }; @@ -688,7 +688,7 @@ impl CudaServerKey { ct } else { tmp_lhs = ct.duplicate_async(streams); - 
self.full_propagate_assign_async(&mut tmp_lhs, streams); + self.full_propagate_assign(&mut tmp_lhs, streams); &tmp_lhs }; @@ -929,7 +929,7 @@ impl CudaServerKey { ct } else { tmp_lhs = ct.duplicate_async(streams); - self.full_propagate_assign_async(&mut tmp_lhs, streams); + self.full_propagate_assign(&mut tmp_lhs, streams); &tmp_lhs }; @@ -970,7 +970,7 @@ impl CudaServerKey { ct } else { tmp_lhs = ct.duplicate_async(streams); - self.full_propagate_assign_async(&mut tmp_lhs, streams); + self.full_propagate_assign(&mut tmp_lhs, streams); &tmp_lhs }; @@ -1011,7 +1011,7 @@ impl CudaServerKey { ct } else { tmp_lhs = ct.duplicate_async(streams); - self.full_propagate_assign_async(&mut tmp_lhs, streams); + self.full_propagate_assign(&mut tmp_lhs, streams); &tmp_lhs }; @@ -1051,7 +1051,7 @@ impl CudaServerKey { ct } else { tmp_lhs = ct.duplicate_async(streams); - self.full_propagate_assign_async(&mut tmp_lhs, streams); + self.full_propagate_assign(&mut tmp_lhs, streams); &tmp_lhs }; @@ -1156,7 +1156,7 @@ impl CudaServerKey { ct } else { tmp_lhs = ct.duplicate_async(streams); - self.full_propagate_assign_async(&mut tmp_lhs, streams); + self.full_propagate_assign(&mut tmp_lhs, streams); &tmp_lhs }; @@ -1192,7 +1192,7 @@ impl CudaServerKey { ct } else { tmp_lhs = ct.duplicate_async(streams); - self.full_propagate_assign_async(&mut tmp_lhs, streams); + self.full_propagate_assign(&mut tmp_lhs, streams); &tmp_lhs }; diff --git a/tfhe/src/integer/gpu/server_key/radix/scalar_div_mod.rs b/tfhe/src/integer/gpu/server_key/radix/scalar_div_mod.rs index fee26b4625..5524a1ccd2 100644 --- a/tfhe/src/integer/gpu/server_key/radix/scalar_div_mod.rs +++ b/tfhe/src/integer/gpu/server_key/radix/scalar_div_mod.rs @@ -296,7 +296,7 @@ impl CudaServerKey { numerator } else { tmp_numerator = numerator.duplicate_async(streams); - self.full_propagate_assign_async(&mut tmp_numerator, streams); + self.full_propagate_assign(&mut tmp_numerator, streams); &tmp_numerator }; @@ -425,7 +425,7 @@ impl CudaServerKey { numerator } else { tmp_numerator = numerator.duplicate_async(streams); - self.full_propagate_assign_async(&mut tmp_numerator, streams); + self.full_propagate_assign(&mut tmp_numerator, streams); &tmp_numerator }; @@ -536,7 +536,7 @@ impl CudaServerKey { numerator } else { tmp_numerator = numerator.duplicate_async(streams); - self.full_propagate_assign_async(&mut tmp_numerator, streams); + self.full_propagate_assign(&mut tmp_numerator, streams); &tmp_numerator }; @@ -776,7 +776,7 @@ impl CudaServerKey { numerator } else { tmp_numerator = numerator.duplicate_async(streams); - self.full_propagate_assign_async(&mut tmp_numerator, streams); + self.full_propagate_assign(&mut tmp_numerator, streams); &tmp_numerator }; @@ -894,7 +894,7 @@ impl CudaServerKey { numerator } else { tmp_numerator = numerator.duplicate_async(streams); - self.full_propagate_assign_async(&mut tmp_numerator, streams); + self.full_propagate_assign(&mut tmp_numerator, streams); &tmp_numerator }; @@ -1006,7 +1006,7 @@ impl CudaServerKey { numerator } else { tmp_numerator = numerator.duplicate_async(streams); - self.full_propagate_assign_async(&mut tmp_numerator, streams); + self.full_propagate_assign(&mut tmp_numerator, streams); &tmp_numerator }; diff --git a/tfhe/src/integer/gpu/server_key/radix/scalar_mul.rs b/tfhe/src/integer/gpu/server_key/radix/scalar_mul.rs index 7d0383d162..0a87134ee0 100644 --- a/tfhe/src/integer/gpu/server_key/radix/scalar_mul.rs +++ b/tfhe/src/integer/gpu/server_key/radix/scalar_mul.rs @@ -239,7 +239,7 @@ impl CudaServerKey { T: 
CudaIntegerRadixCiphertext, { if !ct.block_carries_are_empty() { - self.full_propagate_assign_async(ct, streams); + self.full_propagate_assign(ct, streams); }; self.unchecked_scalar_mul_assign_async(ct, scalar, streams); diff --git a/tfhe/src/integer/gpu/server_key/radix/scalar_rotate.rs b/tfhe/src/integer/gpu/server_key/radix/scalar_rotate.rs index 240e2144f4..1bddc7ec51 100644 --- a/tfhe/src/integer/gpu/server_key/radix/scalar_rotate.rs +++ b/tfhe/src/integer/gpu/server_key/radix/scalar_rotate.rs @@ -231,9 +231,7 @@ impl CudaServerKey { u32: CastFrom, { if !ct.block_carries_are_empty() { - unsafe { - self.full_propagate_assign_async(ct, stream); - } + self.full_propagate_assign(ct, stream); } unsafe { self.unchecked_scalar_rotate_left_assign_async(ct, n, stream) }; @@ -247,9 +245,7 @@ impl CudaServerKey { u32: CastFrom, { if !ct.block_carries_are_empty() { - unsafe { - self.full_propagate_assign_async(ct, stream); - } + self.full_propagate_assign(ct, stream); } unsafe { self.unchecked_scalar_rotate_right_assign_async(ct, n, stream) }; @@ -262,7 +258,7 @@ impl CudaServerKey { Scalar: CastFrom, u32: CastFrom, { - let mut result = unsafe { ct.duplicate_async(stream) }; + let mut result = ct.duplicate(stream); self.scalar_rotate_left_assign(&mut result, shift, stream); result } @@ -273,7 +269,7 @@ impl CudaServerKey { Scalar: CastFrom, u32: CastFrom, { - let mut result = unsafe { ct.duplicate_async(stream) }; + let mut result = ct.duplicate(stream); self.scalar_rotate_right_assign(&mut result, shift, stream); result } diff --git a/tfhe/src/integer/gpu/server_key/radix/scalar_shift.rs b/tfhe/src/integer/gpu/server_key/radix/scalar_shift.rs index a50a04e063..d27074f56d 100644 --- a/tfhe/src/integer/gpu/server_key/radix/scalar_shift.rs +++ b/tfhe/src/integer/gpu/server_key/radix/scalar_shift.rs @@ -371,7 +371,7 @@ impl CudaServerKey { T: CudaIntegerRadixCiphertext, { if !ct.block_carries_are_empty() { - self.full_propagate_assign_async(ct, streams); + self.full_propagate_assign(ct, streams); } self.unchecked_scalar_right_shift_assign_async(ct, shift, streams); @@ -459,7 +459,7 @@ impl CudaServerKey { T: CudaIntegerRadixCiphertext, { if !ct.block_carries_are_empty() { - self.full_propagate_assign_async(ct, streams); + self.full_propagate_assign(ct, streams); } self.unchecked_scalar_left_shift_assign_async(ct, shift, streams); @@ -542,11 +542,11 @@ impl CudaServerKey { u32: CastFrom, T: CudaIntegerRadixCiphertext, { - unsafe { - if !ct.block_carries_are_empty() { - self.full_propagate_assign_async(ct, streams); - } + if !ct.block_carries_are_empty() { + self.full_propagate_assign(ct, streams); + } + unsafe { self.unchecked_scalar_left_shift_assign_async(ct, shift, streams); }; streams.synchronize(); @@ -562,11 +562,11 @@ impl CudaServerKey { u32: CastFrom, T: CudaIntegerRadixCiphertext, { - unsafe { - if !ct.block_carries_are_empty() { - self.full_propagate_assign_async(ct, streams); - } + if !ct.block_carries_are_empty() { + self.full_propagate_assign(ct, streams); + } + unsafe { self.unchecked_scalar_right_shift_assign_async(ct, shift, streams); }; streams.synchronize(); diff --git a/tfhe/src/integer/gpu/server_key/radix/scalar_sub.rs b/tfhe/src/integer/gpu/server_key/radix/scalar_sub.rs index cefdf2dfde..9074d681c2 100644 --- a/tfhe/src/integer/gpu/server_key/radix/scalar_sub.rs +++ b/tfhe/src/integer/gpu/server_key/radix/scalar_sub.rs @@ -155,7 +155,7 @@ impl CudaServerKey { T: CudaIntegerRadixCiphertext, { if !ct.block_carries_are_empty() { - self.full_propagate_assign_async(ct, streams); + 
self.full_propagate_assign(ct, streams); }; self.unchecked_scalar_sub_assign_async(ct, scalar, streams); @@ -221,7 +221,7 @@ impl CudaServerKey { unsafe { tmp_lhs = ct_left.duplicate_async(streams); if !tmp_lhs.block_carries_are_empty() { - self.full_propagate_assign_async(&mut tmp_lhs, streams); + self.full_propagate_assign(&mut tmp_lhs, streams); } } diff --git a/tfhe/src/integer/gpu/server_key/radix/shift.rs b/tfhe/src/integer/gpu/server_key/radix/shift.rs index 1a7f58f69d..7115695a0d 100644 --- a/tfhe/src/integer/gpu/server_key/radix/shift.rs +++ b/tfhe/src/integer/gpu/server_key/radix/shift.rs @@ -267,20 +267,20 @@ impl CudaServerKey { (true, true) => (ct, shift), (true, false) => { tmp_rhs = shift.duplicate_async(streams); - self.full_propagate_assign_async(&mut tmp_rhs, streams); + self.full_propagate_assign(&mut tmp_rhs, streams); (ct, &tmp_rhs) } (false, true) => { tmp_lhs = ct.duplicate_async(streams); - self.full_propagate_assign_async(&mut tmp_lhs, streams); + self.full_propagate_assign(&mut tmp_lhs, streams); (&tmp_lhs, shift) } (false, false) => { tmp_lhs = ct.duplicate_async(streams); tmp_rhs = shift.duplicate_async(streams); - self.full_propagate_assign_async(&mut tmp_lhs, streams); - self.full_propagate_assign_async(&mut tmp_rhs, streams); + self.full_propagate_assign(&mut tmp_lhs, streams); + self.full_propagate_assign(&mut tmp_rhs, streams); (&tmp_lhs, &tmp_rhs) } }; @@ -312,20 +312,20 @@ impl CudaServerKey { (true, true) => (ct, shift), (true, false) => { tmp_rhs = shift.duplicate_async(streams); - self.full_propagate_assign_async(&mut tmp_rhs, streams); + self.full_propagate_assign(&mut tmp_rhs, streams); (ct, &tmp_rhs) } (false, true) => { tmp_lhs = ct.duplicate_async(streams); - self.full_propagate_assign_async(&mut tmp_lhs, streams); + self.full_propagate_assign(&mut tmp_lhs, streams); (&mut tmp_lhs, shift) } (false, false) => { tmp_lhs = ct.duplicate_async(streams); tmp_rhs = shift.duplicate_async(streams); - self.full_propagate_assign_async(&mut tmp_lhs, streams); - self.full_propagate_assign_async(&mut tmp_rhs, streams); + self.full_propagate_assign(&mut tmp_lhs, streams); + self.full_propagate_assign(&mut tmp_rhs, streams); (&mut tmp_lhs, &tmp_rhs) } }; @@ -420,20 +420,20 @@ impl CudaServerKey { (true, true) => (ct, shift), (true, false) => { tmp_rhs = shift.duplicate_async(streams); - self.full_propagate_assign_async(&mut tmp_rhs, streams); + self.full_propagate_assign(&mut tmp_rhs, streams); (ct, &tmp_rhs) } (false, true) => { tmp_lhs = ct.duplicate_async(streams); - self.full_propagate_assign_async(&mut tmp_lhs, streams); + self.full_propagate_assign(&mut tmp_lhs, streams); (&tmp_lhs, shift) } (false, false) => { tmp_lhs = ct.duplicate_async(streams); tmp_rhs = shift.duplicate_async(streams); - self.full_propagate_assign_async(&mut tmp_lhs, streams); - self.full_propagate_assign_async(&mut tmp_rhs, streams); + self.full_propagate_assign(&mut tmp_lhs, streams); + self.full_propagate_assign(&mut tmp_rhs, streams); (&tmp_lhs, &tmp_rhs) } }; @@ -465,20 +465,20 @@ impl CudaServerKey { (true, true) => (ct, shift), (true, false) => { tmp_rhs = shift.duplicate_async(streams); - self.full_propagate_assign_async(&mut tmp_rhs, streams); + self.full_propagate_assign(&mut tmp_rhs, streams); (ct, &tmp_rhs) } (false, true) => { tmp_lhs = ct.duplicate_async(streams); - self.full_propagate_assign_async(&mut tmp_lhs, streams); + self.full_propagate_assign(&mut tmp_lhs, streams); (&mut tmp_lhs, shift) } (false, false) => { tmp_lhs = ct.duplicate_async(streams); tmp_rhs = 
shift.duplicate_async(streams); - self.full_propagate_assign_async(&mut tmp_lhs, streams); - self.full_propagate_assign_async(&mut tmp_rhs, streams); + self.full_propagate_assign(&mut tmp_lhs, streams); + self.full_propagate_assign(&mut tmp_rhs, streams); (&mut tmp_lhs, &tmp_rhs) } }; diff --git a/tfhe/src/integer/gpu/server_key/radix/sub.rs b/tfhe/src/integer/gpu/server_key/radix/sub.rs index 6e2bbfa990..ce6120b116 100644 --- a/tfhe/src/integer/gpu/server_key/radix/sub.rs +++ b/tfhe/src/integer/gpu/server_key/radix/sub.rs @@ -94,7 +94,7 @@ impl CudaServerKey { streams: &CudaStreams, ) { let neg = self.unchecked_neg_async(ct_right, streams); - self.unchecked_add_assign_async(ct_left, &neg, streams); + self.unchecked_add_assign(ct_left, &neg, streams); } /// Computes homomorphically a subtraction between two ciphertexts encrypting integer values. @@ -257,18 +257,18 @@ impl CudaServerKey { (true, true) => (ct_left, ct_right), (true, false) => { tmp_rhs = ct_right.duplicate_async(streams); - self.full_propagate_assign_async(&mut tmp_rhs, streams); + self.full_propagate_assign(&mut tmp_rhs, streams); (ct_left, &tmp_rhs) } (false, true) => { - self.full_propagate_assign_async(ct_left, streams); + self.full_propagate_assign(ct_left, streams); (ct_left, ct_right) } (false, false) => { tmp_rhs = ct_right.duplicate_async(streams); - self.full_propagate_assign_async(ct_left, streams); - self.full_propagate_assign_async(&mut tmp_rhs, streams); + self.full_propagate_assign(ct_left, streams); + self.full_propagate_assign(&mut tmp_rhs, streams); (ct_left, &tmp_rhs) } }; @@ -299,14 +299,14 @@ impl CudaServerKey { (true, false) => { unsafe { tmp_rhs = ct_right.duplicate_async(stream); - self.full_propagate_assign_async(&mut tmp_rhs, stream); + self.full_propagate_assign(&mut tmp_rhs, stream); } (ct_left, &tmp_rhs) } (false, true) => { unsafe { tmp_lhs = ct_left.duplicate_async(stream); - self.full_propagate_assign_async(&mut tmp_lhs, stream); + self.full_propagate_assign(&mut tmp_lhs, stream); } (&tmp_lhs, ct_right) } @@ -315,8 +315,8 @@ impl CudaServerKey { tmp_lhs = ct_left.duplicate_async(stream); tmp_rhs = ct_right.duplicate_async(stream); - self.full_propagate_assign_async(&mut tmp_lhs, stream); - self.full_propagate_assign_async(&mut tmp_rhs, stream); + self.full_propagate_assign(&mut tmp_lhs, stream); + self.full_propagate_assign(&mut tmp_rhs, stream); } (&tmp_lhs, &tmp_rhs) @@ -521,14 +521,14 @@ impl CudaServerKey { (true, false) => { unsafe { tmp_rhs = ct_right.duplicate_async(stream); - self.full_propagate_assign_async(&mut tmp_rhs, stream); + self.full_propagate_assign(&mut tmp_rhs, stream); } (ct_left, &tmp_rhs) } (false, true) => { unsafe { tmp_lhs = ct_left.duplicate_async(stream); - self.full_propagate_assign_async(&mut tmp_lhs, stream); + self.full_propagate_assign(&mut tmp_lhs, stream); } (&tmp_lhs, ct_right) } @@ -537,8 +537,8 @@ impl CudaServerKey { tmp_lhs = ct_left.duplicate_async(stream); tmp_rhs = ct_right.duplicate_async(stream); - self.full_propagate_assign_async(&mut tmp_lhs, stream); - self.full_propagate_assign_async(&mut tmp_rhs, stream); + self.full_propagate_assign(&mut tmp_lhs, stream); + self.full_propagate_assign(&mut tmp_rhs, stream); } (&tmp_lhs, &tmp_rhs) diff --git a/tfhe/src/integer/gpu/server_key/radix/vector_comparisons.rs b/tfhe/src/integer/gpu/server_key/radix/vector_comparisons.rs index a0433ebbe6..3feac0d287 100644 --- a/tfhe/src/integer/gpu/server_key/radix/vector_comparisons.rs +++ b/tfhe/src/integer/gpu/server_key/radix/vector_comparisons.rs @@ -258,11 
+258,9 @@ impl CudaServerKey { let lhs = if lhs.iter().any(|ct| !ct.block_carries_are_empty()) { // Need a way to parallelize this step for ct in lhs.iter() { - let mut temp_ct = unsafe { ct.duplicate_async(streams) }; + let mut temp_ct = ct.duplicate(streams); if !temp_ct.block_carries_are_empty() { - unsafe { - self.full_propagate_assign_async(&mut temp_ct, streams); - } + self.full_propagate_assign(&mut temp_ct, streams); } tmp_lhs.push(temp_ct); } @@ -275,11 +273,9 @@ impl CudaServerKey { let rhs = if rhs.iter().any(|ct| !ct.block_carries_are_empty()) { // Need a way to parallelize this step for ct in rhs.iter() { - let mut temp_ct = unsafe { ct.duplicate_async(streams) }; + let mut temp_ct = ct.duplicate(streams); if !temp_ct.block_carries_are_empty() { - unsafe { - self.full_propagate_assign_async(&mut temp_ct, streams); - } + self.full_propagate_assign(&mut temp_ct, streams); } tmp_rhs.push(temp_ct); } @@ -413,11 +409,9 @@ impl CudaServerKey { let lhs = if lhs.iter().any(|ct| !ct.block_carries_are_empty()) { // Need a way to parallelize this step for ct in lhs.iter() { - let mut temp_ct = unsafe { ct.duplicate_async(streams) }; + let mut temp_ct = ct.duplicate(streams); if !temp_ct.block_carries_are_empty() { - unsafe { - self.full_propagate_assign_async(&mut temp_ct, streams); - } + self.full_propagate_assign(&mut temp_ct, streams); } tmp_lhs.push(temp_ct); } @@ -430,11 +424,9 @@ impl CudaServerKey { let rhs = if rhs.iter().any(|ct| !ct.block_carries_are_empty()) { // Need a way to parallelize this step for ct in rhs.iter() { - let mut temp_ct = unsafe { ct.duplicate_async(streams) }; + let mut temp_ct = ct.duplicate(streams); if !temp_ct.block_carries_are_empty() { - unsafe { - self.full_propagate_assign_async(&mut temp_ct, streams); - } + self.full_propagate_assign(&mut temp_ct, streams); } tmp_rhs.push(temp_ct); } diff --git a/tfhe/src/integer/gpu/server_key/radix/vector_find.rs b/tfhe/src/integer/gpu/server_key/radix/vector_find.rs index 56ff9642b3..7bbb3b8be8 100644 --- a/tfhe/src/integer/gpu/server_key/radix/vector_find.rs +++ b/tfhe/src/integer/gpu/server_key/radix/vector_find.rs @@ -250,9 +250,7 @@ impl CudaServerKey { self.unchecked_match_value(ct, matches, streams) } else { let mut clone = ct.duplicate(streams); - unsafe { - self.full_propagate_assign_async(&mut clone, streams); - } + self.full_propagate_assign(&mut clone, streams); self.unchecked_match_value(&clone, matches, streams) } } @@ -366,9 +364,7 @@ impl CudaServerKey { self.unchecked_match_value_or(ct, matches, or_value, streams) } else { let mut clone = ct.duplicate(streams); - unsafe { - self.full_propagate_assign_async(&mut clone, streams); - } + self.full_propagate_assign(&mut clone, streams); self.unchecked_match_value_or(&clone, matches, or_value, streams) } } @@ -446,11 +442,9 @@ impl CudaServerKey { let cts = if cts.iter().any(|ct| !ct.block_carries_are_empty()) { // Need a way to parallelize this step for ct in cts.iter() { - let mut temp_ct = unsafe { ct.duplicate_async(streams) }; + let mut temp_ct = ct.duplicate(streams); if !temp_ct.block_carries_are_empty() { - unsafe { - self.full_propagate_assign_async(&mut temp_ct, streams); - } + self.full_propagate_assign(&mut temp_ct, streams); } tmp_cts.push(temp_ct); } @@ -464,9 +458,7 @@ impl CudaServerKey { value } else { tmp_value = value.duplicate(streams); - unsafe { - self.full_propagate_assign_async(&mut tmp_value, streams); - } + self.full_propagate_assign(&mut tmp_value, streams); &tmp_value }; @@ -550,11 +542,9 @@ impl CudaServerKey { let cts = if 
cts.iter().any(|ct| !ct.block_carries_are_empty()) { // Need a way to parallelize this step for ct in cts.iter() { - let mut temp_ct = unsafe { ct.duplicate_async(streams) }; + let mut temp_ct = ct.duplicate(streams); if !temp_ct.block_carries_are_empty() { - unsafe { - self.full_propagate_assign_async(&mut temp_ct, streams); - } + self.full_propagate_assign(&mut temp_ct, streams); } tmp_cts.push(temp_ct); } @@ -642,9 +632,7 @@ impl CudaServerKey { ct } else { tmp_ct = ct.duplicate(streams); - unsafe { - self.full_propagate_assign_async(&mut tmp_ct, streams); - } + self.full_propagate_assign(&mut tmp_ct, streams); &tmp_ct }; self.unchecked_is_in_clears(ct, clears, streams) @@ -744,9 +732,7 @@ impl CudaServerKey { ct } else { tmp_ct = ct.duplicate(streams); - unsafe { - self.full_propagate_assign_async(&mut tmp_ct, streams); - } + self.full_propagate_assign(&mut tmp_ct, streams); streams.synchronize(); &tmp_ct }; @@ -873,9 +859,7 @@ impl CudaServerKey { ct } else { tmp_ct = ct.duplicate(streams); - unsafe { - self.full_propagate_assign_async(&mut tmp_ct, streams); - } + self.full_propagate_assign(&mut tmp_ct, streams); streams.synchronize(); &tmp_ct }; @@ -978,11 +962,9 @@ impl CudaServerKey { let cts = if cts.iter().any(|ct| !ct.block_carries_are_empty()) { // Need a way to parallelize this step for ct in cts.iter() { - let mut temp_ct = unsafe { ct.duplicate_async(streams) }; + let mut temp_ct = ct.duplicate(streams); if !temp_ct.block_carries_are_empty() { - unsafe { - self.full_propagate_assign_async(&mut temp_ct, streams); - } + self.full_propagate_assign(&mut temp_ct, streams); } tmp_cts.push(temp_ct); } @@ -996,9 +978,7 @@ impl CudaServerKey { value } else { tmp_value = value.duplicate(streams); - unsafe { - self.full_propagate_assign_async(&mut tmp_value, streams); - } + self.full_propagate_assign(&mut tmp_value, streams); &tmp_value }; self.unchecked_index_of(cts, value, streams) @@ -1100,11 +1080,9 @@ impl CudaServerKey { let cts = if cts.iter().any(|ct| !ct.block_carries_are_empty()) { // Need a way to parallelize this step for ct in cts.iter() { - let mut temp_ct = unsafe { ct.duplicate_async(streams) }; + let mut temp_ct = ct.duplicate(streams); if !temp_ct.block_carries_are_empty() { - unsafe { - self.full_propagate_assign_async(&mut temp_ct, streams); - } + self.full_propagate_assign(&mut temp_ct, streams); } tmp_cts.push(temp_ct); } @@ -1231,11 +1209,9 @@ impl CudaServerKey { let cts = if cts.iter().any(|ct| !ct.block_carries_are_empty()) { // Need a way to parallelize this step for ct in cts.iter() { - let mut temp_ct = unsafe { ct.duplicate_async(streams) }; + let mut temp_ct = ct.duplicate(streams); if !temp_ct.block_carries_are_empty() { - unsafe { - self.full_propagate_assign_async(&mut temp_ct, streams); - } + self.full_propagate_assign(&mut temp_ct, streams); } tmp_cts.push(temp_ct); } @@ -1365,11 +1341,9 @@ impl CudaServerKey { let cts = if cts.iter().any(|ct| !ct.block_carries_are_empty()) { // Need a way to parallelize this step for ct in cts.iter() { - let mut temp_ct = unsafe { ct.duplicate_async(streams) }; + let mut temp_ct = ct.duplicate(streams); if !temp_ct.block_carries_are_empty() { - unsafe { - self.full_propagate_assign_async(&mut temp_ct, streams); - } + self.full_propagate_assign(&mut temp_ct, streams); } tmp_cts.push(temp_ct); } @@ -1383,9 +1357,7 @@ impl CudaServerKey { value } else { tmp_value = value.duplicate(streams); - unsafe { - self.full_propagate_assign_async(&mut tmp_value, streams); - } + self.full_propagate_assign(&mut tmp_value, streams); 
&tmp_value }; self.unchecked_first_index_of(cts, value, streams) @@ -1640,7 +1612,7 @@ impl CudaServerKey { streams, ); } - let mut temp = unsafe { aggregated_vector.duplicate_async(streams) }; + let mut temp = aggregated_vector.duplicate(streams); let mut aggregated_mut_slice = aggregated_vector .as_mut() .d_blocks @@ -1712,13 +1684,11 @@ impl CudaServerKey { let last_chunk_size = one_hot_vector.len() - (num_chunks - 1) * chunk_size; for ct_idx in 0..last_chunk_size { let one_hot_idx = (num_chunks - 1) * chunk_size + ct_idx; - unsafe { - self.unchecked_add_assign_async( - &mut aggregated_vector, - &one_hot_vector[one_hot_idx], - streams, - ); - } + self.unchecked_add_assign( + &mut aggregated_vector, + &one_hot_vector[one_hot_idx], + streams, + ); } let message_extract_lut = @@ -1745,7 +1715,7 @@ impl CudaServerKey { .as_mut_slice(0..lwe_size * num_ct_blocks, 0) .unwrap(); unsafe { - let mut temp = aggregated_vector.duplicate_async(streams); + let mut temp = aggregated_vector.duplicate(streams); let aggregated_slice = temp .as_mut() .d_blocks @@ -1805,7 +1775,7 @@ impl CudaServerKey { } } unsafe { - let mut temp = aggregated_vector.duplicate_async(streams); + let mut temp = aggregated_vector.duplicate(streams); let aggregated_slice = temp .as_mut() .d_blocks @@ -1941,7 +1911,7 @@ impl CudaServerKey { let mut first_true: T = unsafe { self.create_trivial_zero_radix_async(num_ct_blocks, streams) }; - let mut clone_ct = unsafe { values.duplicate_async(streams) }; + let mut clone_ct = values.duplicate(streams); let mut slice_in = clone_ct .as_mut() .d_blocks @@ -2022,7 +1992,7 @@ impl CudaServerKey { } }); - let cloned_ct = unsafe { first_true.duplicate_async(streams) }; + let cloned_ct = first_true.duplicate(streams); let slice_in_final = cloned_ct .as_ref() .d_blocks diff --git a/tfhe/src/integer/server_key/radix_parallel/tests_long_run/mod.rs b/tfhe/src/integer/server_key/radix_parallel/tests_long_run/mod.rs index 93a63d6699..0790444f32 100644 --- a/tfhe/src/integer/server_key/radix_parallel/tests_long_run/mod.rs +++ b/tfhe/src/integer/server_key/radix_parallel/tests_long_run/mod.rs @@ -3,4 +3,4 @@ pub(crate) mod test_random_op_sequence; pub(crate) mod test_signed_erc20; pub(crate) mod test_signed_random_op_sequence; pub(crate) const NB_CTXT_LONG_RUN: usize = 32; -pub(crate) const NB_TESTS_LONG_RUN: usize = 20000; +pub(crate) const NB_TESTS_LONG_RUN: usize = 5; diff --git a/tfhe/src/shortint/ciphertext/common.rs b/tfhe/src/shortint/ciphertext/common.rs index 2ae8b23f07..74f04e3e34 100644 --- a/tfhe/src/shortint/ciphertext/common.rs +++ b/tfhe/src/shortint/ciphertext/common.rs @@ -65,7 +65,7 @@ impl MaxNoiseLevel { Debug, PartialEq, Eq, PartialOrd, Ord, Copy, Clone, Serialize, Deserialize, Versionize, )] #[versionize(NoiseLevelVersions)] -pub struct NoiseLevel(u64); +pub struct NoiseLevel(pub(crate) u64); impl NoiseLevel { pub const NOMINAL: Self = Self(1); @@ -148,7 +148,7 @@ impl MaxDegree { Debug, PartialEq, Eq, PartialOrd, Ord, Copy, Clone, Serialize, Deserialize, Versionize, )] #[versionize(DegreeVersions)] -pub struct Degree(pub(super) u64); +pub struct Degree(pub(crate) u64); impl Degree { pub fn new(degree: u64) -> Self {
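
Note on the calling convention the hunks above converge on: the `unsafe fn full_propagate_assign_async` (whose safety contract required the caller to keep inputs alive and to synchronize the streams) is replaced by the safe `full_propagate_assign`, which wraps the asynchronous launch in an internal `unsafe` block and calls `streams.synchronize()` before returning; call sites drop their `unsafe` blocks, and `duplicate_async` is swapped for the safe `duplicate` where the copy no longer needs to stay asynchronous. A minimal sketch of the resulting call-site pattern, assuming crate-internal code with a `CudaServerKey` value `sks`, a ciphertext `ct: T` with `T: CudaIntegerRadixCiphertext`, and `streams: &CudaStreams` (the variable names are illustrative only, and `full_propagate_assign` is `pub(crate)`, so this only compiles inside the `tfhe::integer::gpu` module):

    // Previous pattern: the caller upheld the safety contract and synchronized.
    // let mut tmp = unsafe { ct.duplicate_async(streams) };
    // unsafe { sks.full_propagate_assign_async(&mut tmp, streams) };
    // streams.synchronize();

    // New pattern: the safe wrapper synchronizes internally before returning.
    let mut tmp = ct.duplicate(streams);
    if !tmp.block_carries_are_empty() {
        sks.full_propagate_assign(&mut tmp, streams);
    }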