From 9c29d8d216449b3fc5ced63ee22513c898f205c9 Mon Sep 17 00:00:00 2001 From: Pedro Alves Date: Wed, 17 Jul 2024 17:14:54 -0300 Subject: [PATCH] feat(gpu): implement CompressedCudaCiphertextList, and public functional packing keyswitch --- .../tfhe-cuda-backend/cuda/include/integer.h | 59 ++++ .../cuda/include/keyswitch.h | 14 + .../tfhe-cuda-backend/cuda/src/CMakeLists.txt | 14 - .../cuda/src/crypto/keyswitch.cu | 56 +++- .../cuda/src/crypto/keyswitch.cuh | 183 ++++++++++- .../cuda/src/crypto/torus.cuh | 31 +- .../src/integer/compression/compression.cu | 50 +++ .../src/integer/compression/compression.cuh | 65 ++++ .../cuda/src/linearalgebra/multiplication.cu | 24 +- .../cuda/src/linearalgebra/multiplication.cuh | 44 ++- .../pbs/programmable_bootstrap_amortized.cuh | 8 +- .../pbs/programmable_bootstrap_cg_classic.cuh | 6 +- .../programmable_bootstrap_cg_multibit.cuh | 4 +- .../pbs/programmable_bootstrap_classic.cuh | 9 +- .../pbs/programmable_bootstrap_multibit.cuh | 8 +- .../programmable_bootstrap_tbc_classic.cuh | 9 +- .../programmable_bootstrap_tbc_multibit.cuh | 4 +- .../cuda/src/polynomial/polynomial_math.cuh | 44 ++- backends/tfhe-cuda-backend/src/cuda_bind.rs | 101 +++++- .../gpu/algorithms/lwe_packing_keyswitch.rs | 62 ++++ tfhe/src/core_crypto/gpu/algorithms/mod.rs | 6 +- .../algorithms/test/lwe_packing_keyswitch.rs | 239 ++++++++++++++ .../core_crypto/gpu/algorithms/test/mod.rs | 9 + .../gpu/entities/lwe_ciphertext_list.rs | 63 +++- .../gpu/entities/lwe_packing_keyswitch_key.rs | 87 +++++ tfhe/src/core_crypto/gpu/entities/mod.rs | 1 + tfhe/src/core_crypto/gpu/mod.rs | 66 ++++ tfhe/src/integer/client_key/radix.rs | 32 ++ .../ciphertext/compressed_ciphertext_list.rs | 196 ++++++++++++ tfhe/src/integer/gpu/ciphertext/mod.rs | 1 + tfhe/src/integer/gpu/list_compression/mod.rs | 1 + .../gpu/list_compression/server_keys.rs | 300 ++++++++++++++++++ tfhe/src/integer/gpu/mod.rs | 161 +++++++++- 33 files changed, 1873 insertions(+), 84 deletions(-) create mode 100644 backends/tfhe-cuda-backend/cuda/src/integer/compression/compression.cu create mode 100644 backends/tfhe-cuda-backend/cuda/src/integer/compression/compression.cuh create mode 100644 tfhe/src/core_crypto/gpu/algorithms/lwe_packing_keyswitch.rs create mode 100644 tfhe/src/core_crypto/gpu/algorithms/test/lwe_packing_keyswitch.rs create mode 100644 tfhe/src/core_crypto/gpu/entities/lwe_packing_keyswitch_key.rs create mode 100644 tfhe/src/integer/gpu/ciphertext/compressed_ciphertext_list.rs create mode 100644 tfhe/src/integer/gpu/list_compression/mod.rs create mode 100644 tfhe/src/integer/gpu/list_compression/server_keys.rs diff --git a/backends/tfhe-cuda-backend/cuda/include/integer.h b/backends/tfhe-cuda-backend/cuda/include/integer.h index df4eb39df0..77515a641f 100644 --- a/backends/tfhe-cuda-backend/cuda/include/integer.h +++ b/backends/tfhe-cuda-backend/cuda/include/integer.h @@ -36,6 +36,11 @@ enum COMPARISON_TYPE { MIN = 7, }; +enum COMPRESSION_MODE { + COMPRESS = 0, + DECOMPRESS = 1, +}; + enum CMP_ORDERING { IS_INFERIOR = 0, IS_EQUAL = 1, IS_SUPERIOR = 2 }; enum SIGNED_OPERATION { ADDITION = 1, SUBTRACTION = -1 }; @@ -203,6 +208,29 @@ void cuda_scalar_comparison_integer_radix_ciphertext_kb_64( void cleanup_cuda_integer_comparison(void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, int8_t **mem_ptr_void); +void scratch_cuda_compression_integer_radix_ciphertext_64( + void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, int8_t **mem_ptr, + uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t lwe_dimension, + 
uint32_t ks_level, uint32_t ks_base_log, uint32_t pbs_level, + uint32_t pbs_base_log, uint32_t grouping_factor, uint32_t num_lwes, + uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type, + uint32_t lwe_per_glwe, uint32_t storage_log_modulus, COMPRESSION_MODE mode, + bool allocate_gpu_memory); + +void cuda_compression_compress_integer_radix_ciphertext_64( + void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, + void *glwe_array_out, void *lwe_array_in, void **fp_ksk, uint32_t num_lwes, + int8_t *mem_ptr); + +void cuda_compression_decompress_integer_radix_ciphertext_64( + void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, + void *glwe_array_out, void *lwe_array_in, void **fp_ksk, uint32_t num_lwes, + int8_t *mem_ptr); + +void cleanup_cuda_compression_integer_radix_ciphertext_64( + void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, + int8_t **mem_ptr_void); + void scratch_cuda_integer_radix_bitop_kb_64( void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, int8_t **mem_ptr, uint32_t glwe_dimension, uint32_t polynomial_size, @@ -792,6 +820,37 @@ template struct int_radix_lut { } }; +template struct int_compression { + COMPRESSION_MODE mode; + int_radix_params params; + uint32_t storage_log_modulus; + uint32_t lwe_per_glwe; + + Torus *tmp_lwe_shifted; + + int_compression(cudaStream_t *streams, uint32_t *gpu_indexes, + uint32_t gpu_count, int_radix_params params, + uint32_t num_radix_blocks, uint32_t lwe_per_glwe, + uint32_t storage_log_modulus, COMPRESSION_MODE mode, + bool allocate_gpu_memory) { + this->mode = mode; + this->params = params; + this->lwe_per_glwe = lwe_per_glwe; + this->storage_log_modulus = storage_log_modulus; + + if (allocate_gpu_memory) { + int glwe_accumulator_size = + (params.glwe_dimension + 1) * params.polynomial_size; + tmp_lwe_shifted = (Torus *)cuda_malloc_async( + num_radix_blocks * (params.big_lwe_dimension + 1), streams[0], + gpu_indexes[0]); + } + } + void release(cudaStream_t *streams, uint32_t *gpu_indexes, + uint32_t gpu_count) { + cuda_drop_async(tmp_lwe_shifted, streams[0], gpu_indexes[0]); + } +}; template struct int_bit_extract_luts_buffer { int_radix_params params; int_radix_lut *lut; diff --git a/backends/tfhe-cuda-backend/cuda/include/keyswitch.h b/backends/tfhe-cuda-backend/cuda/include/keyswitch.h index 924ec131f9..08edc2b8e5 100644 --- a/backends/tfhe-cuda-backend/cuda/include/keyswitch.h +++ b/backends/tfhe-cuda-backend/cuda/include/keyswitch.h @@ -16,6 +16,20 @@ void cuda_keyswitch_lwe_ciphertext_vector_64( void *lwe_output_indexes, void *lwe_array_in, void *lwe_input_indexes, void *ksk, uint32_t lwe_dimension_in, uint32_t lwe_dimension_out, uint32_t base_log, uint32_t level_count, uint32_t num_samples); + +void cuda_fp_keyswitch_lwe_to_glwe_64(void *v_stream, uint32_t gpu_index, + void *glwe_array_out, void *lwe_array_in, + void *fp_ksk_array, + uint32_t input_lwe_dimension, + uint32_t output_glwe_dimension, + uint32_t output_polynomial_size, + uint32_t base_log, uint32_t level_count); + +void cuda_fp_keyswitch_lwe_list_to_glwe_64( + void *stream, uint32_t gpu_index, void *glwe_array_out, void *lwe_array_in, + void *fp_ksk_array, uint32_t input_lwe_dimension, + uint32_t output_glwe_dimension, uint32_t output_polynomial_size, + uint32_t base_log, uint32_t level_count, uint32_t num_lwes); } #endif // CNCRT_KS_H_ diff --git a/backends/tfhe-cuda-backend/cuda/src/CMakeLists.txt b/backends/tfhe-cuda-backend/cuda/src/CMakeLists.txt index 3190501fc5..5fec699fbc 100644 --- 
a/backends/tfhe-cuda-backend/cuda/src/CMakeLists.txt
+++ b/backends/tfhe-cuda-backend/cuda/src/CMakeLists.txt
@@ -1,17 +1,3 @@
-set(SOURCES
-    ${CMAKE_SOURCE_DIR}/${INCLUDE_DIR}/bit_extraction.h
-    ${CMAKE_SOURCE_DIR}/${INCLUDE_DIR}/bitwise_ops.h
-    ${CMAKE_SOURCE_DIR}/${INCLUDE_DIR}/bootstrap.h
-    ${CMAKE_SOURCE_DIR}/${INCLUDE_DIR}/bootstrap_multibit.h
-    ${CMAKE_SOURCE_DIR}/${INCLUDE_DIR}/ciphertext.h
-    ${CMAKE_SOURCE_DIR}/${INCLUDE_DIR}/circuit_bootstrap.h
-    ${CMAKE_SOURCE_DIR}/${INCLUDE_DIR}/device.h
-    ${CMAKE_SOURCE_DIR}/${INCLUDE_DIR}/integer.h
-    ${CMAKE_SOURCE_DIR}/${INCLUDE_DIR}/keyswitch.h
-    ${CMAKE_SOURCE_DIR}/${INCLUDE_DIR}/linear_algebra.h
-    ${CMAKE_SOURCE_DIR}/${INCLUDE_DIR}/shifts.h
-    ${CMAKE_SOURCE_DIR}/${INCLUDE_DIR}/vertical_packing.h
-    ${CMAKE_SOURCE_DIR}/${INCLUDE_DIR}/helper_multi_gpu.h)
 file(GLOB_RECURSE SOURCES "*.cu")
 add_library(tfhe_cuda_backend STATIC ${SOURCES})
 set_target_properties(tfhe_cuda_backend PROPERTIES CUDA_SEPARABLE_COMPILATION ON
                                                    CUDA_RESOLVE_DEVICE_SYMBOLS ON)
diff --git a/backends/tfhe-cuda-backend/cuda/src/crypto/keyswitch.cu b/backends/tfhe-cuda-backend/cuda/src/crypto/keyswitch.cu
index 524a1fa45e..5d890a6a6e 100644
--- a/backends/tfhe-cuda-backend/cuda/src/crypto/keyswitch.cu
+++ b/backends/tfhe-cuda-backend/cuda/src/crypto/keyswitch.cu
@@ -10,7 +10,7 @@ void cuda_keyswitch_lwe_ciphertext_vector_32(
     void *lwe_output_indexes, void *lwe_array_in, void *lwe_input_indexes,
     void *ksk, uint32_t lwe_dimension_in, uint32_t lwe_dimension_out,
     uint32_t base_log, uint32_t level_count, uint32_t num_samples) {
-  cuda_keyswitch_lwe_ciphertext_vector(
+  host_keyswitch_lwe_ciphertext_vector(
      static_cast<cudaStream_t>(stream), gpu_index,
      static_cast<uint32_t *>(lwe_array_out),
      static_cast<uint32_t *>(lwe_output_indexes),
@@ -40,7 +40,7 @@ void cuda_keyswitch_lwe_ciphertext_vector_64(
     void *lwe_output_indexes, void *lwe_array_in, void *lwe_input_indexes,
     void *ksk, uint32_t lwe_dimension_in, uint32_t lwe_dimension_out,
     uint32_t base_log, uint32_t level_count, uint32_t num_samples) {
-  cuda_keyswitch_lwe_ciphertext_vector(
+  host_keyswitch_lwe_ciphertext_vector(
      static_cast<cudaStream_t>(stream), gpu_index,
      static_cast<uint64_t *>(lwe_array_out),
      static_cast<uint64_t *>(lwe_output_indexes),
@@ -48,3 +48,55 @@ void cuda_keyswitch_lwe_ciphertext_vector_64(
      static_cast<uint64_t *>(lwe_input_indexes), static_cast<uint64_t *>(ksk),
      lwe_dimension_in, lwe_dimension_out, base_log, level_count, num_samples);
 }
+
+/* Perform a functional packing keyswitch on a 64-bit input LWE ciphertext.
+ *
+ * - `stream` is a void pointer to the Cuda stream to be used in the kernel
+ * launch
+ * - `gpu_index` is the index of the GPU to be used in the kernel launch
+ * - `glwe_array_out`: output keyswitched GLWE ciphertext
+ * - `lwe_array_in`: input LWE ciphertext, containing `input_lwe_dimension`
+ * mask values + 1 body value
+ * - `fp_ksk_array`: the functional packing keyswitch key to be used in the
+ * operation
+ * - `input_lwe_dimension`: the dimension of the input LWE ciphertext
+ * - `output_glwe_dimension`: the dimension of the output GLWE ciphertext
+ * - `output_polynomial_size`: the polynomial size of the output GLWE
+ * ciphertext
+ * - `base_log`: the log of the base used in the decomposition (should be the
+ * one used to create the fp_ksk)
+ * - `level_count`: the number of levels used in the decomposition (should be
+ * the one used to create the fp_ksk)
+ *
+ * This function calls a wrapper to a device kernel that performs the
+ * functional packing keyswitch.
+ */
+void cuda_fp_keyswitch_lwe_to_glwe_64(void *stream, uint32_t gpu_index,
+                                      void *glwe_array_out, void *lwe_array_in,
+                                      void *fp_ksk_array,
+                                      uint32_t input_lwe_dimension,
+                                      uint32_t output_glwe_dimension,
+                                      uint32_t output_polynomial_size,
+                                      uint32_t base_log, uint32_t level_count) {
+
+  host_fp_keyswitch_lwe_to_glwe(static_cast<cudaStream_t>(stream), gpu_index,
+                                static_cast<uint64_t *>(glwe_array_out),
+                                static_cast<uint64_t *>(lwe_array_in),
+                                static_cast<uint64_t *>(fp_ksk_array),
+                                input_lwe_dimension, output_glwe_dimension,
+                                output_polynomial_size, base_log, level_count);
+}
+
+void cuda_fp_keyswitch_lwe_list_to_glwe_64(
+    void *stream, uint32_t gpu_index, void *glwe_array_out, void *lwe_array_in,
+    void *fp_ksk_array, uint32_t input_lwe_dimension,
+    uint32_t output_glwe_dimension, uint32_t output_polynomial_size,
+    uint32_t base_log, uint32_t level_count, uint32_t num_lwes) {
+
+  host_fp_keyswitch_lwe_list_to_glwe(
+      static_cast<cudaStream_t>(stream), gpu_index,
+      static_cast<uint64_t *>(glwe_array_out),
+      static_cast<uint64_t *>(lwe_array_in),
+      static_cast<uint64_t *>(fp_ksk_array), input_lwe_dimension,
+      output_glwe_dimension, output_polynomial_size, base_log, level_count,
+      num_lwes);
+}
diff --git a/backends/tfhe-cuda-backend/cuda/src/crypto/keyswitch.cuh b/backends/tfhe-cuda-backend/cuda/src/crypto/keyswitch.cuh
index db78104a98..53f4d667b0 100644
--- a/backends/tfhe-cuda-backend/cuda/src/crypto/keyswitch.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/crypto/keyswitch.cuh
@@ -7,6 +7,7 @@
 #include "polynomial/functions.cuh"
 #include "polynomial/polynomial_math.cuh"
 #include "torus.cuh"
+#include "utils/helper.cuh"
 #include "utils/kernel_dimensions.cuh"
 #include
 #include
@@ -98,7 +99,7 @@ keyswitch(Torus *lwe_array_out, const Torus *__restrict__ lwe_output_indexes,
 }
 
 template <typename Torus>
-__host__ void cuda_keyswitch_lwe_ciphertext_vector(
+__host__ void host_keyswitch_lwe_ciphertext_vector(
     cudaStream_t stream, uint32_t gpu_index, Torus *lwe_array_out,
     Torus *lwe_output_indexes, Torus *lwe_array_in, Torus *lwe_input_indexes,
     Torus *ksk, uint32_t lwe_dimension_in, uint32_t lwe_dimension_out,
@@ -146,7 +147,7 @@ void execute_keyswitch_async(cudaStream_t *streams, uint32_t *gpu_indexes,
         GET_VARIANT_ELEMENT(lwe_input_indexes, i);
 
     // Compute Keyswitch
-    cuda_keyswitch_lwe_ciphertext_vector(
+    host_keyswitch_lwe_ciphertext_vector(
         streams[i], gpu_indexes[i], current_lwe_array_out,
         current_lwe_output_indexes, current_lwe_array_in,
         current_lwe_input_indexes, ksks[i], lwe_dimension_in, lwe_dimension_out,
@@ -154,4 +155,182 @@ void execute_keyswitch_async(cudaStream_t *streams, uint32_t *gpu_indexes,
   }
 }
 
+// chunk_count = glwe_size * polynomial_size / threads.
+// each thread is responsible for processing only lwe_size multiplications
+template <typename Torus>
+__device__ void keyswitch_lwe_ciphertext_into_glwe_ciphertext(
+    Torus *glwe_out, Torus *lwe_in, Torus *fp_ksk, uint32_t lwe_dimension_in,
+    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log,
+    uint32_t level_count) {
+
+  extern __shared__ int8_t sharedmem[];
+
+  // result accumulator, shared memory is used because of frequent access
+  Torus *local_glwe_chunk = (Torus *)sharedmem;
+
+  const int tid = threadIdx.x + blockIdx.x * blockDim.x;
+  const int shmem_index = threadIdx.x;
+  // the output_glwe is split in chunks and each x-block takes one of them
+  size_t chunk_id = blockIdx.x;
+  size_t coef_per_block = blockDim.x;
+
+  // dimensions
+  size_t glwe_size = (glwe_dimension + 1);
+  size_t lwe_size = (lwe_dimension_in + 1);
+  size_t ksk_size = lwe_size * level_count * glwe_size * polynomial_size;
+  // number of coefficients inside fp-ksk block for each lwe_input coefficient
+  size_t ksk_block_size = glwe_size * polynomial_size * level_count;
+
+  // initialize accumulator to 0
+  local_glwe_chunk[shmem_index] =
+      SEL(0, lwe_in[lwe_dimension_in], tid == glwe_dimension * polynomial_size);
+
+  // Iterate through all lwe elements
+  for (int i = 0; i < lwe_dimension_in; i++) {
+    // Round and prepare decomposition
+    Torus a_i = round_to_closest_multiple(lwe_in[i], base_log, level_count);
+
+    Torus state = a_i >> (sizeof(Torus) * 8 - base_log * level_count);
+    Torus mod_b_mask = (1ll << base_log) - 1ll;
+
+    // block of key for current lwe coefficient (cur_input_lwe[i])
+    auto ksk_block = &fp_ksk[i * ksk_block_size];
+    for (int j = 0; j < level_count; j++) {
+      auto ksk_glwe = &ksk_block[j * glwe_size * polynomial_size];
+      // Iterate through each level and multiply by the ksk piece
+      auto ksk_glwe_chunk = &ksk_glwe[chunk_id * coef_per_block];
+      Torus decomposed = decompose_one(state, mod_b_mask, base_log);
+      local_glwe_chunk[shmem_index] -= decomposed * ksk_glwe_chunk[shmem_index];
+    }
+  }
+
+  // Persist
+  glwe_out[tid] = local_glwe_chunk[shmem_index];
+}
+
+// public functional packing keyswitch
+//
+// chunk_count = glwe_size * polynomial_size / threads.
+template <typename Torus>
+__global__ void
+fp_keyswitch_lwe_to_glwe(Torus *glwe_out, Torus *lwe_in, Torus *fp_ksk,
+                         uint32_t lwe_dimension_in, uint32_t glwe_dimension,
+                         uint32_t polynomial_size, uint32_t base_log,
+                         uint32_t level_count) {
+
+  keyswitch_lwe_ciphertext_into_glwe_ciphertext(
+      glwe_out, lwe_in, fp_ksk, lwe_dimension_in, glwe_dimension,
+      polynomial_size, base_log, level_count);
+}
+
+template <typename Torus>
+__host__ void
+host_fp_keyswitch_lwe_to_glwe(cudaStream_t stream, uint32_t gpu_index,
+                              Torus *glwe_array_out, Torus *lwe_array_in,
+                              Torus *fp_ksk_array, uint32_t lwe_dimension_in,
+                              uint32_t glwe_dimension, uint32_t polynomial_size,
+                              uint32_t base_log, uint32_t level_count) {
+  cudaSetDevice(gpu_index);
+  int glwe_accumulator_size = (glwe_dimension + 1) * polynomial_size;
+
+  int num_threads = 0, num_blocks = 0;
+  getNumBlocksAndThreads(glwe_accumulator_size, 512, num_blocks, num_threads);
+  int shared_mem = sizeof(Torus) * num_threads;
+  fp_keyswitch_lwe_to_glwe<<<num_blocks, num_threads, shared_mem, stream>>>(
+      glwe_array_out, lwe_array_in, fp_ksk_array, lwe_dimension_in,
+      glwe_dimension, polynomial_size, base_log, level_count);
+  check_cuda_error(cudaGetLastError());
+}
+
+// public functional packing keyswitch
+//
+// blockIdx.y - input
+// chunk_count = glwe_size * polynomial_size / threads.
+template <typename Torus>
+__global__ void
+fp_keyswitch_lwe_list_to_glwe(Torus *glwe_array_out, Torus *lwe_array_in,
+                              Torus *fp_ksk, uint32_t lwe_dimension_in,
+                              uint32_t glwe_dimension, uint32_t polynomial_size,
+                              uint32_t base_log, uint32_t level_count,
+                              uint32_t num_samples, Torus *d_mem) {
+  int glwe_accumulator_size = (glwe_dimension + 1) * polynomial_size;
+
+  const int input_id = threadIdx.y + blockIdx.y * blockDim.y;
+  const int degree = input_id;
+
+  // Select an input
+  auto input_d_mem = d_mem + input_id * glwe_accumulator_size;
+
+  // KS LWE to GLWE
+  auto ks_glwe_out = input_d_mem;
+  auto lwe_in = lwe_array_in + (lwe_dimension_in + 1) * input_id;
+  keyswitch_lwe_ciphertext_into_glwe_ciphertext(
+      ks_glwe_out, lwe_in, fp_ksk, lwe_dimension_in, glwe_dimension,
+      polynomial_size, base_log, level_count);
+
+  // P * X^degree
+  auto glwe_out =
+      glwe_array_out + (glwe_dimension + 1) * polynomial_size * input_id;
+  polynomial_accumulate_monic_monomial_mul_batch(
+      glwe_out, ks_glwe_out, degree, polynomial_size, glwe_dimension + 1, true);
+}
+
+template <typename Torus>
+__global__ void
+accumulate_glwes(Torus *glwe_out, Torus *glwe_array_in, uint32_t glwe_dimension,
+                 uint32_t polynomial_size, uint32_t num_samples) {
+  const int tid = threadIdx.x + blockIdx.x * blockDim.x;
+
+  extern __shared__ int8_t sharedmem[];
+  const int shmem_index = threadIdx.x;
+
+  Torus *glwe_acc = (Torus *)sharedmem;
+  glwe_acc[shmem_index] = glwe_array_in[tid];
+
+  // Accumulate
+  for (int i = 1; i < num_samples; i++) {
+    auto glwe_in = glwe_array_in + i * (glwe_dimension + 1) * polynomial_size;
+    glwe_acc[shmem_index] += glwe_in[tid];
+  }
+
+  glwe_out[tid] = glwe_acc[shmem_index];
+}
+
+template <typename Torus>
+__host__ void host_fp_keyswitch_lwe_list_to_glwe(
+    cudaStream_t stream, uint32_t gpu_index, Torus *glwe_out,
+    Torus *lwe_array_in, Torus *fp_ksk_array, uint32_t lwe_dimension_in,
+    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log,
+    uint32_t level_count, uint32_t num_samples) {
+  cudaSetDevice(gpu_index);
+  int glwe_accumulator_size = (glwe_dimension + 1) * polynomial_size;
+
+  int num_blocks = 0, num_threads_x = 0;
+  getNumBlocksAndThreads(glwe_accumulator_size, 512, num_blocks, num_threads_x);
+
+  auto shared_mem = sizeof(Torus) * num_threads_x;
+  dim3 grid(num_blocks, num_samples);
+  dim3 threads(num_threads_x);
+
+  auto d_mem = (Torus *)cuda_malloc_async(
+      num_samples * glwe_accumulator_size * sizeof(Torus), stream, gpu_index);
+  auto d_tmp_glwe_array_out = (Torus *)cuda_malloc_async(
+      num_samples * glwe_accumulator_size * sizeof(Torus), stream, gpu_index);
+
+  // individually keyswitch each lwe
+  fp_keyswitch_lwe_list_to_glwe<<<grid, threads, shared_mem, stream>>>(
+      d_tmp_glwe_array_out, lwe_array_in, fp_ksk_array, lwe_dimension_in,
+      glwe_dimension, polynomial_size, base_log, level_count, num_samples,
+      d_mem);
+  check_cuda_error(cudaGetLastError());
+  // accumulate to a single glwe
+  accumulate_glwes<<<num_blocks, num_threads_x, shared_mem, stream>>>(
+      glwe_out, d_tmp_glwe_array_out, glwe_dimension, polynomial_size,
+      num_samples);
+  check_cuda_error(cudaGetLastError());
+
+  cuda_drop_async(d_mem, stream, gpu_index);
+  cuda_drop_async(d_tmp_glwe_array_out, stream, gpu_index);
+}
+
 #endif
diff --git a/backends/tfhe-cuda-backend/cuda/src/crypto/torus.cuh b/backends/tfhe-cuda-backend/cuda/src/crypto/torus.cuh
index 9fc4ad1d8a..e5ba7e6b88 100644
--- a/backends/tfhe-cuda-backend/cuda/src/crypto/torus.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/crypto/torus.cuh
@@ -3,6 +3,7 @@
 #include "device.h"
 #include "types/int128.cuh"
+#include "utils/kernel_dimensions.cuh"
 #include
 
 template <typename T>
@@ -40,8 +41,8 @@ __device__ inline T round_to_closest_multiple(T x, uint32_t base_log,
 }
 
 template <typename T>
-__device__ __forceinline__ void modulus_switch(T input, T &output,
-                                               uint32_t log_modulus) {
+__device__ __forceinline__ void apply_modulus_switch(T input, T &output,
+                                                     uint32_t log_modulus) {
   constexpr uint32_t BITS = sizeof(T) * 8;
 
   output = input + (((T)1) << (BITS - log_modulus - 1));
@@ -49,10 +50,32 @@ __device__ __forceinline__ void modulus_switch(T input, T &output,
 }
 
 template <typename T>
-__device__ __forceinline__ T modulus_switch(T input, uint32_t log_modulus) {
+__device__ __forceinline__ T apply_modulus_switch(T input,
+                                                  uint32_t log_modulus) {
   T output;
-  modulus_switch(input, output, log_modulus);
+  apply_modulus_switch(input, output, log_modulus);
   return output;
 }
 
+template <typename Torus>
+__global__ void apply_modulus_switch_inplace(Torus *array, int size,
+                                             uint32_t log_modulus) {
+  const int tid = threadIdx.x + blockIdx.x * blockDim.x;
+  if (tid < size) {
+    array[tid] = apply_modulus_switch(array[tid], log_modulus);
+  }
+}
+
+template <typename Torus>
+__host__ void modulus_switch(cudaStream_t stream, uint32_t gpu_index,
+                             Torus *array, int size, uint32_t log_modulus) {
+
+  int num_threads = 0, num_blocks = 0;
+  getNumBlocksAndThreads(size, 1024, num_blocks, num_threads);
+
+  apply_modulus_switch_inplace<<<num_blocks, num_threads, 0, stream>>>(
+      array, size, log_modulus);
+  check_cuda_error(cudaGetLastError());
+}
+
 #endif // CNCRT_TORUS_H
diff --git a/backends/tfhe-cuda-backend/cuda/src/integer/compression/compression.cu b/backends/tfhe-cuda-backend/cuda/src/integer/compression/compression.cu
new file mode 100644
index 0000000000..d0c2dd69d9
--- /dev/null
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/compression/compression.cu
@@ -0,0 +1,50 @@
+#include "compression.cuh"
+
+void scratch_cuda_compression_integer_radix_ciphertext_64(
+    void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, int8_t **mem_ptr,
+    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t lwe_dimension,
+    uint32_t ks_level, uint32_t ks_base_log, uint32_t pbs_level,
+    uint32_t pbs_base_log, uint32_t grouping_factor, uint32_t num_lwes,
+    uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
+    uint32_t lwe_per_glwe, uint32_t storage_log_modulus, COMPRESSION_MODE mode,
+    bool allocate_gpu_memory) {
+
+  int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
+                          glwe_dimension * polynomial_size, lwe_dimension,
+                          ks_level, ks_base_log, pbs_level, pbs_base_log,
+                          grouping_factor, message_modulus, carry_modulus);
+
+  scratch_cuda_compression_integer_radix_ciphertext_64(
+      (cudaStream_t *)(streams), gpu_indexes, gpu_count,
+      (int_compression<uint64_t> **)mem_ptr, num_lwes, params, lwe_per_glwe,
+      storage_log_modulus, mode, allocate_gpu_memory);
+}
+void cuda_compression_compress_integer_radix_ciphertext_64(
+    void **streams, uint32_t *gpu_indexes, uint32_t gpu_count,
+    void *glwe_array_out, void *lwe_array_in, void **fp_ksk, uint32_t num_lwes,
+    int8_t *mem_ptr) {
+
+  host_integer_compression_compress(
+      (cudaStream_t *)(streams), gpu_indexes, gpu_count,
+      static_cast<uint64_t *>(glwe_array_out),
+      static_cast<uint64_t *>(lwe_array_in), (uint64_t **)(fp_ksk), num_lwes,
+      (int_compression<uint64_t> *)mem_ptr);
+}
+void cuda_compression_decompress_integer_radix_ciphertext_64(
+    void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, void *lwe_out,
+    void *glwe_array_in, void **bsks, int8_t *mem_ptr) {
+
+  host_integer_compression_decompress(
+      (cudaStream_t *)(streams), gpu_indexes, gpu_count,
+      static_cast<uint64_t *>(lwe_out), static_cast<uint64_t *>(glwe_array_in),
+      bsks, (int_compression<uint64_t> *)mem_ptr);
+}
+
+void 
cleanup_cuda_compression_integer_radix_ciphertext_64(
+    void **streams, uint32_t *gpu_indexes, uint32_t gpu_count,
+    int8_t **mem_ptr_void) {
+
+  int_compression<uint64_t> *mem_ptr =
+      (int_compression<uint64_t> *)(*mem_ptr_void);
+  mem_ptr->release((cudaStream_t *)(streams), gpu_indexes, gpu_count);
+}
diff --git a/backends/tfhe-cuda-backend/cuda/src/integer/compression/compression.cuh b/backends/tfhe-cuda-backend/cuda/src/integer/compression/compression.cuh
new file mode 100644
index 0000000000..a7a85b31ed
--- /dev/null
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/compression/compression.cuh
@@ -0,0 +1,65 @@
+#ifndef CUDA_INTEGER_COMPRESSION_CUH
+#define CUDA_INTEGER_COMPRESSION_CUH
+
+#include "crypto/keyswitch.cuh"
+#include "device.h"
+#include "integer.h"
+#include "integer/integer.cuh"
+#include "linearalgebra/multiplication.cuh"
+#include "polynomial/functions.cuh"
+#include "utils/kernel_dimensions.cuh"
+
+template <typename Torus>
+__host__ void host_integer_compression_compress(
+    cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
+    Torus *glwe_array_out, Torus *lwe_array_in, Torus **fp_ksk,
+    uint32_t num_lwes, int_compression<Torus> *mem_ptr) {
+  auto params = mem_ptr->params;
+
+  // Shift
+  auto lwe_shifted = mem_ptr->tmp_lwe_shifted;
+  host_cleartext_multiplication(streams[0], gpu_indexes[0], lwe_shifted,
+                                lwe_array_in, (uint64_t)params.message_modulus,
+                                params.big_lwe_dimension, num_lwes);
+
+  uint32_t lwe_in_size = params.big_lwe_dimension + 1;
+  uint32_t glwe_out_size = (params.glwe_dimension + 1) * params.polynomial_size;
+  uint32_t num_glwes = num_lwes / mem_ptr->lwe_per_glwe;
+
+  // Keyswitch LWEs to GLWE
+  for (int i = 0; i < num_glwes; i++) {
+    auto lwe_subset = lwe_shifted + i * lwe_in_size;
+    auto glwe_out = glwe_array_out + i * glwe_out_size;
+
+    host_fp_keyswitch_lwe_list_to_glwe(
+        streams[0], gpu_indexes[0], glwe_out, lwe_subset, fp_ksk[0],
+        params.big_lwe_dimension, params.glwe_dimension, params.polynomial_size,
+        params.ks_base_log, params.ks_level, mem_ptr->lwe_per_glwe);
+  }
+
+  // Modulus switch
+  int num_blocks = 0, num_threads = 0;
+  getNumBlocksAndThreads(glwe_out_size, 512, num_blocks, num_threads);
+  apply_modulus_switch_inplace<<<num_blocks, num_threads, 0, streams[0]>>>(
+      glwe_array_out, num_glwes * glwe_out_size, mem_ptr->storage_log_modulus);
+}
+
+template <typename Torus>
+__host__ void host_integer_compression_decompress(
+    cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
+    Torus *lwe_out, Torus *glwe_array_in, void **bsks,
+    int_compression<Torus> *mem_ptr) {}
+
+template <typename Torus>
+__host__ void scratch_cuda_compression_integer_radix_ciphertext_64(
+    cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
+    int_compression<Torus> **mem_ptr, uint32_t num_lwes,
+    int_radix_params params, uint32_t lwe_per_glwe,
+    uint32_t storage_log_modulus, COMPRESSION_MODE mode,
+    bool allocate_gpu_memory) {
+
+  *mem_ptr = new int_compression<Torus>(
+      streams, gpu_indexes, gpu_count, params, num_lwes, lwe_per_glwe,
+      storage_log_modulus, mode, allocate_gpu_memory);
+}
+#endif
diff --git a/backends/tfhe-cuda-backend/cuda/src/linearalgebra/multiplication.cu b/backends/tfhe-cuda-backend/cuda/src/linearalgebra/multiplication.cu
index 2a936d7cdf..a64c15378d 100644
--- a/backends/tfhe-cuda-backend/cuda/src/linearalgebra/multiplication.cu
+++ b/backends/tfhe-cuda-backend/cuda/src/linearalgebra/multiplication.cu
@@ -9,12 +9,12 @@ void cuda_mult_lwe_ciphertext_vector_cleartext_vector_32(
     void *cleartext_array_in, uint32_t input_lwe_dimension,
    uint32_t input_lwe_ciphertext_count) {
 
-  host_cleartext_multiplication(static_cast<cudaStream_t>(stream), gpu_index,
-                                
static_cast(lwe_array_out), - static_cast(lwe_array_in), - static_cast(cleartext_array_in), - input_lwe_dimension, - input_lwe_ciphertext_count); + host_cleartext_vec_multiplication( + static_cast(stream), gpu_index, + static_cast(lwe_array_out), + static_cast(lwe_array_in), + static_cast(cleartext_array_in), input_lwe_dimension, + input_lwe_ciphertext_count); } /* * Perform the multiplication of a u64 input LWE ciphertext vector with a u64 @@ -49,10 +49,10 @@ void cuda_mult_lwe_ciphertext_vector_cleartext_vector_64( void *cleartext_array_in, uint32_t input_lwe_dimension, uint32_t input_lwe_ciphertext_count) { - host_cleartext_multiplication(static_cast(stream), gpu_index, - static_cast(lwe_array_out), - static_cast(lwe_array_in), - static_cast(cleartext_array_in), - input_lwe_dimension, - input_lwe_ciphertext_count); + host_cleartext_vec_multiplication( + static_cast(stream), gpu_index, + static_cast(lwe_array_out), + static_cast(lwe_array_in), + static_cast(cleartext_array_in), input_lwe_dimension, + input_lwe_ciphertext_count); } diff --git a/backends/tfhe-cuda-backend/cuda/src/linearalgebra/multiplication.cuh b/backends/tfhe-cuda-backend/cuda/src/linearalgebra/multiplication.cuh index 9fd9be9419..c02e9236f1 100644 --- a/backends/tfhe-cuda-backend/cuda/src/linearalgebra/multiplication.cuh +++ b/backends/tfhe-cuda-backend/cuda/src/linearalgebra/multiplication.cuh @@ -13,9 +13,47 @@ #include #include +template +__global__ void cleartext_vec_multiplication(T *output, T *lwe_input, + T *cleartext_input, + uint32_t input_lwe_dimension, + uint32_t num_entries) { + + int tid = threadIdx.x; + int index = blockIdx.x * blockDim.x + tid; + if (index < num_entries) { + int cleartext_index = index / (input_lwe_dimension + 1); + // Here we take advantage of the wrapping behaviour of uint + output[index] = lwe_input[index] * cleartext_input[cleartext_index]; + } +} + +template +__host__ void +host_cleartext_vec_multiplication(cudaStream_t stream, uint32_t gpu_index, + T *output, T *lwe_input, T *cleartext_input, + uint32_t input_lwe_dimension, + uint32_t input_lwe_ciphertext_count) { + + cudaSetDevice(gpu_index); + // lwe_size includes the presence of the body + // whereas lwe_dimension is the number of elements in the mask + int lwe_size = input_lwe_dimension + 1; + // Create a 1-dimensional grid of threads + int num_blocks = 0, num_threads = 0; + int num_entries = input_lwe_ciphertext_count * lwe_size; + getNumBlocksAndThreads(num_entries, 512, num_blocks, num_threads); + dim3 grid(num_blocks, 1, 1); + dim3 thds(num_threads, 1, 1); + + cleartext_vec_multiplication<<>>( + output, lwe_input, cleartext_input, input_lwe_dimension, num_entries); + check_cuda_error(cudaGetLastError()); +} + template __global__ void -cleartext_multiplication(T *output, T *lwe_input, T *cleartext_input, +cleartext_multiplication(T *output, T *lwe_input, T cleartext_input, uint32_t input_lwe_dimension, uint32_t num_entries) { int tid = threadIdx.x; @@ -23,14 +61,14 @@ cleartext_multiplication(T *output, T *lwe_input, T *cleartext_input, if (index < num_entries) { int cleartext_index = index / (input_lwe_dimension + 1); // Here we take advantage of the wrapping behaviour of uint - output[index] = lwe_input[index] * cleartext_input[cleartext_index]; + output[index] = lwe_input[index] * cleartext_input; } } template __host__ void host_cleartext_multiplication(cudaStream_t stream, uint32_t gpu_index, - T *output, T *lwe_input, T *cleartext_input, + T *output, T *lwe_input, T cleartext_input, uint32_t input_lwe_dimension, 
uint32_t input_lwe_ciphertext_count) { diff --git a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_amortized.cuh b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_amortized.cuh index 668883e044..b1ef7f087b 100644 --- a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_amortized.cuh +++ b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_amortized.cuh @@ -89,8 +89,8 @@ __global__ void device_programmable_bootstrap_amortized( // Put "b", the body, in [0, 2N[ Torus b_hat = 0; - modulus_switch(block_lwe_array_in[lwe_dimension], b_hat, - params::log2_degree + 1); + apply_modulus_switch(block_lwe_array_in[lwe_dimension], b_hat, + params::log2_degree + 1); divide_by_monomial_negacyclic_inplace( @@ -105,8 +105,8 @@ __global__ void device_programmable_bootstrap_amortized( // Put "a" in [0, 2N[ instead of Zq Torus a_hat = 0; - modulus_switch(block_lwe_array_in[iteration], a_hat, - params::log2_degree + 1); + apply_modulus_switch(block_lwe_array_in[iteration], a_hat, + params::log2_degree + 1); // Perform ACC * (X^ä - 1) multiply_by_monomial_negacyclic_and_sub_polynomial< diff --git a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_cg_classic.cuh b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_cg_classic.cuh index 3b0eecce7b..c47febd6a8 100644 --- a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_cg_classic.cuh +++ b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_cg_classic.cuh @@ -93,8 +93,8 @@ __global__ void device_programmable_bootstrap_cg( // Put "b" in [0, 2N[ Torus b_hat = 0; - modulus_switch(block_lwe_array_in[lwe_dimension], b_hat, - params::log2_degree + 1); + apply_modulus_switch(block_lwe_array_in[lwe_dimension], b_hat, + params::log2_degree + 1); divide_by_monomial_negacyclic_inplace( @@ -106,7 +106,7 @@ __global__ void device_programmable_bootstrap_cg( // Put "a" in [0, 2N[ Torus a_hat = 0; - modulus_switch(block_lwe_array_in[i], a_hat, params::log2_degree + 1); + apply_modulus_switch(block_lwe_array_in[i], a_hat, params::log2_degree + 1); // Perform ACC * (X^ä - 1) multiply_by_monomial_negacyclic_and_sub_polynomial< diff --git a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_cg_multibit.cuh b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_cg_multibit.cuh index d8e17c936f..27d398cf99 100644 --- a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_cg_multibit.cuh +++ b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_cg_multibit.cuh @@ -79,8 +79,8 @@ __global__ void device_multi_bit_programmable_bootstrap_cg_accumulate( if (lwe_offset == 0) { // Put "b" in [0, 2N[ Torus b_hat = 0; - modulus_switch(block_lwe_array_in[lwe_dimension], b_hat, - params::log2_degree + 1); + apply_modulus_switch(block_lwe_array_in[lwe_dimension], b_hat, + params::log2_degree + 1); divide_by_monomial_negacyclic_inplace( diff --git a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_classic.cuh b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_classic.cuh index 8a60c6b778..4c315192d4 100644 --- a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_classic.cuh +++ b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_classic.cuh @@ -74,8 +74,8 @@ __global__ void device_programmable_bootstrap_step_one( // First iteration // Put "b" in [0, 2N[ Torus b_hat = 0; - modulus_switch(block_lwe_array_in[lwe_dimension], b_hat, - params::log2_degree + 1); + 
apply_modulus_switch(block_lwe_array_in[lwe_dimension], b_hat, + params::log2_degree + 1); // The y-dimension is used to select the element of the GLWE this block will // compute divide_by_monomial_negacyclic_inplace @@ -102,7 +102,7 @@ __global__ void device_multi_bit_programmable_bootstrap_keybundle( synchronize_threads_in_block(); // Multiply by the bsk element - polynomial_product_accumulate_by_monomial( + polynomial_accumulate_monic_monomial_mul( accumulator, bsk_poly, monomial_degree, false); } @@ -203,8 +203,8 @@ __global__ void device_multi_bit_programmable_bootstrap_accumulate_step_one( // Initializes the accumulator with the body of LWE // Put "b" in [0, 2N[ Torus b_hat = 0; - modulus_switch(block_lwe_array_in[lwe_dimension], b_hat, - params::log2_degree + 1); + apply_modulus_switch(block_lwe_array_in[lwe_dimension], b_hat, + params::log2_degree + 1); divide_by_monomial_negacyclic_inplace( diff --git a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_tbc_classic.cuh b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_tbc_classic.cuh index b19cde44d9..abd35ee8fe 100644 --- a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_tbc_classic.cuh +++ b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_tbc_classic.cuh @@ -96,8 +96,8 @@ __global__ void device_programmable_bootstrap_tbc( // Put "b" in [0, 2N[ Torus b_hat = 0; - modulus_switch(block_lwe_array_in[lwe_dimension], b_hat, - params::log2_degree + 1); + apply_modulus_switch(block_lwe_array_in[lwe_dimension], b_hat, + params::log2_degree + 1); divide_by_monomial_negacyclic_inplace( @@ -109,8 +109,9 @@ __global__ void device_programmable_bootstrap_tbc( // Put "a" in [0, 2N[ Torus a_hat = 0; - modulus_switch(block_lwe_array_in[i], a_hat, - params::log2_degree + 1); // 2 * params::log2_degree + 1); + apply_modulus_switch(block_lwe_array_in[i], a_hat, + params::log2_degree + + 1); // 2 * params::log2_degree + 1); // Perform ACC * (X^ä - 1) multiply_by_monomial_negacyclic_and_sub_polynomial< diff --git a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_tbc_multibit.cuh b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_tbc_multibit.cuh index fd143e494e..49087e92a6 100644 --- a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_tbc_multibit.cuh +++ b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_tbc_multibit.cuh @@ -86,8 +86,8 @@ __global__ void device_multi_bit_programmable_bootstrap_tbc_accumulate( if (lwe_offset == 0) { // Put "b" in [0, 2N[ Torus b_hat = 0; - modulus_switch(block_lwe_array_in[lwe_dimension], b_hat, - params::log2_degree + 1); + apply_modulus_switch(block_lwe_array_in[lwe_dimension], b_hat, + params::log2_degree + 1); divide_by_monomial_negacyclic_inplace( diff --git a/backends/tfhe-cuda-backend/cuda/src/polynomial/polynomial_math.cuh b/backends/tfhe-cuda-backend/cuda/src/polynomial/polynomial_math.cuh index a6d7901f20..8e3b2d9c94 100644 --- a/backends/tfhe-cuda-backend/cuda/src/polynomial/polynomial_math.cuh +++ b/backends/tfhe-cuda-backend/cuda/src/polynomial/polynomial_math.cuh @@ -55,13 +55,14 @@ __device__ void polynomial_product_accumulate_in_fourier_domain( } } -// If init_accumulator is set, assumes that result was not initialized and does -// that with the outcome of first * second +// This method expects to work with polynomial_size / params::opt threads in the +// x-block If init_accumulator is set, assumes that result was not initialized +// and does that with the outcome of first * second template 
__device__ void -polynomial_product_accumulate_by_monomial(T *result, const T *__restrict__ poly, - uint64_t monomial_degree, - bool init_accumulator = false) { +polynomial_accumulate_monic_monomial_mul(T *result, const T *__restrict__ poly, + uint64_t monomial_degree, + bool init_accumulator = false) { // monomial_degree \in [0, 2 * params::degree) int full_cycles_count = monomial_degree / params::degree; int remainder_degrees = monomial_degree % params::degree; @@ -82,4 +83,37 @@ polynomial_product_accumulate_by_monomial(T *result, const T *__restrict__ poly, } } +// This method expects to work with num_poly * polynomial_size threads in the +// grid +template +__device__ void polynomial_accumulate_monic_monomial_mul_batch( + T *result_array, T *poly_array, uint64_t monomial_degree, + uint32_t polynomial_size, uint32_t num_poly, + bool init_accumulator = false) { + // monomial_degree \in [0, 2 * params::degree) + int full_cycles_count = monomial_degree / polynomial_size; + int remainder_degrees = monomial_degree % polynomial_size; + + auto tid = threadIdx.x + blockIdx.x * blockDim.x; + int pos = tid % polynomial_size; + + // Select a input + auto poly = poly_array + (tid / polynomial_size) * polynomial_size; + auto result = result_array + (tid / polynomial_size) * polynomial_size; + + // Calculate the rotation + T element = poly[pos]; + int new_pos = (pos + monomial_degree) % polynomial_size; + + // Calculate the new coefficient + T x = SEL(element, -element, full_cycles_count % 2); // monomial coefficient + x = SEL(-x, x, new_pos >= remainder_degrees); + + // Write result + if (init_accumulator) + result[new_pos] = x; + else + result[new_pos] += x; +} + #endif // CNCRT_POLYNOMIAL_MATH_H diff --git a/backends/tfhe-cuda-backend/src/cuda_bind.rs b/backends/tfhe-cuda-backend/src/cuda_bind.rs index a0f0d317d5..2ced0e87e1 100644 --- a/backends/tfhe-cuda-backend/src/cuda_bind.rs +++ b/backends/tfhe-cuda-backend/src/cuda_bind.rs @@ -342,6 +342,33 @@ extern "C" { num_samples: u32, ); + pub fn cuda_fp_keyswitch_lwe_to_glwe_64( + stream: *mut c_void, + gpu_index: u32, + glwe_array_out: *mut c_void, + lwe_array_in: *const c_void, + fp_ksk_array: *const c_void, + input_lwe_dimension: u32, + output_glwe_dimension: u32, + polynomial_size: u32, + base_log: u32, + level_count: u32, + ); + + pub fn cuda_fp_keyswitch_lwe_list_to_glwe_64( + stream: *mut c_void, + gpu_index: u32, + glwe_array_out: *mut c_void, + lwe_array_in: *const c_void, + fp_ksk_array: *const c_void, + input_lwe_dimension: u32, + output_glwe_dimension: u32, + polynomial_size: u32, + base_log: u32, + level_count: u32, + num_lwes: u32, + ); + /// Perform the negation of a u64 input LWE ciphertext vector. 
/// - `v_stream` is a void pointer to the Cuda stream to be used in the kernel launch /// - `gpu_index` is the index of the GPU to be used in the kernel launch @@ -516,6 +543,69 @@ extern "C" { mem_ptr: *mut *mut i8, ); + pub fn scratch_cuda_compression_integer_radix_ciphertext_64( + streams: *const *mut c_void, + gpu_indexes: *const u32, + gpu_count: u32, + mem_ptr: *mut *mut i8, + glwe_dimension: u32, + polynomial_size: u32, + lwe_dimension: u32, + ks_level: u32, + ks_base_log: u32, + pbs_level: u32, + pbs_base_log: u32, + grouping_factor: u32, + num_lwes: u32, + message_modulus: u32, + carry_modulus: u32, + pbs_type: u32, + lwe_per_glwe: u32, + storage_log_modulus: u32, + mode: u32, + allocate_gpu_memory: bool, + ); + + pub fn cuda_compression_compress_integer_radix_ciphertext_64( + streams: *const *mut c_void, + gpu_indexes: *const u32, + gpu_count: u32, + glwe_array_out: *mut c_void, + lwe_array_in: *const c_void, + fp_ksk: *const *mut c_void, + num_lwes: u32, + mem_ptr: *mut i8, + ); + + pub fn cuda_compression_decompress_integer_radix_ciphertext_64( + streams: *const *mut c_void, + gpu_indexes: *const u32, + gpu_count: u32, + lwe_out: *mut c_void, + glwe_array_in: *const c_void, + bsks: *const *mut c_void, + mem_ptr: *mut i8, + ); + + pub fn cleanup_cuda_compression_integer_radix_ciphertext_64( + streams: *const *mut c_void, + gpu_indexes: *const u32, + gpu_count: u32, + mem_ptr: *mut *mut i8, + ); + + pub fn cuda_glwe_sample_extract_64( + streams: *const *mut c_void, + gpu_indexes: *const u32, + gpu_count: u32, + lwe_array_out: *mut c_void, + glwe_in: *const c_void, + nth_array: *const u32, + num_samples: u32, + glwe_dimension: u32, + polynomial_size: u32, + ); + pub fn cuda_scalar_addition_integer_radix_ciphertext_64_inplace( streams: *const *mut c_void, gpu_indexes: *const u32, @@ -628,17 +718,6 @@ extern "C" { gpu_count: u32, mem_ptr: *mut *mut i8, ); - pub fn cuda_glwe_sample_extract_64( - streams: *const *mut c_void, - gpu_indexes: *const u32, - gpu_count: u32, - lwe_array_out: *mut c_void, - glwe_in: *const c_void, - nth_array: *const u32, - num_samples: u32, - glwe_dimension: u32, - polynomial_size: u32, - ); pub fn scratch_cuda_integer_radix_comparison_kb_64( streams: *const *mut c_void, diff --git a/tfhe/src/core_crypto/gpu/algorithms/lwe_packing_keyswitch.rs b/tfhe/src/core_crypto/gpu/algorithms/lwe_packing_keyswitch.rs new file mode 100644 index 0000000000..5b4565cf1e --- /dev/null +++ b/tfhe/src/core_crypto/gpu/algorithms/lwe_packing_keyswitch.rs @@ -0,0 +1,62 @@ +use crate::core_crypto::gpu::glwe_ciphertext::CudaGlweCiphertext; +use crate::core_crypto::gpu::lwe_ciphertext::CudaLweCiphertext; +use crate::core_crypto::gpu::lwe_ciphertext_list::CudaLweCiphertextList; +use crate::core_crypto::gpu::lwe_packing_keyswitch_key::CudaLwePackingKeyswitchKey; +use crate::core_crypto::gpu::{packing_keyswitch_async, packing_keyswitch_list_async, CudaStreams}; +use crate::core_crypto::prelude::{CastInto, UnsignedTorus}; + +pub fn cuda_keyswitch_lwe_ciphertext_into_glwe_ciphertext( + lwe_pksk: &CudaLwePackingKeyswitchKey, + input_lwe_ciphertext: &CudaLweCiphertext, + output_glwe_ciphertext: &mut CudaGlweCiphertext, + streams: &CudaStreams, +) where + // CastInto required for PBS modulus switch which returns a usize + Scalar: UnsignedTorus + CastInto, +{ + let input_lwe_dimension = input_lwe_ciphertext.lwe_dimension(); + let output_glwe_dimension = output_glwe_ciphertext.glwe_dimension(); + let output_polynomial_size = output_glwe_ciphertext.polynomial_size(); + + unsafe { + 
packing_keyswitch_async( + streams, + &mut output_glwe_ciphertext.0.d_vec, + &input_lwe_ciphertext.0.d_vec, + input_lwe_dimension, + output_glwe_dimension, + output_polynomial_size, + &lwe_pksk.d_vec, + lwe_pksk.decomposition_base_log(), + lwe_pksk.decomposition_level_count(), + ); + } +} +pub fn cuda_keyswitch_lwe_ciphertext_list_into_glwe_ciphertext( + lwe_pksk: &CudaLwePackingKeyswitchKey, + input_lwe_ciphertext_list: &CudaLweCiphertextList, + output_glwe_ciphertext: &mut CudaGlweCiphertext, + streams: &CudaStreams, +) where + // CastInto required for PBS modulus switch which returns a usize + Scalar: UnsignedTorus + CastInto, +{ + let input_lwe_dimension = input_lwe_ciphertext_list.lwe_dimension(); + let output_glwe_dimension = output_glwe_ciphertext.glwe_dimension(); + let output_polynomial_size = output_glwe_ciphertext.polynomial_size(); + + unsafe { + packing_keyswitch_list_async( + streams, + &mut output_glwe_ciphertext.0.d_vec, + &input_lwe_ciphertext_list.0.d_vec, + input_lwe_dimension, + output_glwe_dimension, + output_polynomial_size, + &lwe_pksk.d_vec, + lwe_pksk.decomposition_base_log(), + lwe_pksk.decomposition_level_count(), + input_lwe_ciphertext_list.lwe_ciphertext_count(), + ); + } +} diff --git a/tfhe/src/core_crypto/gpu/algorithms/mod.rs b/tfhe/src/core_crypto/gpu/algorithms/mod.rs index ee6ddd2b1d..489bbb1565 100644 --- a/tfhe/src/core_crypto/gpu/algorithms/mod.rs +++ b/tfhe/src/core_crypto/gpu/algorithms/mod.rs @@ -1,13 +1,15 @@ +pub mod glwe_sample_extraction; +pub mod lwe_keyswitch; pub mod lwe_linear_algebra; pub mod lwe_multi_bit_programmable_bootstrapping; +pub mod lwe_packing_keyswitch; pub mod lwe_programmable_bootstrapping; -pub mod glwe_sample_extraction; -mod lwe_keyswitch; #[cfg(test)] mod test; pub use lwe_keyswitch::*; pub use lwe_linear_algebra::*; pub use lwe_multi_bit_programmable_bootstrapping::*; +pub use lwe_packing_keyswitch::*; pub use lwe_programmable_bootstrapping::*; diff --git a/tfhe/src/core_crypto/gpu/algorithms/test/lwe_packing_keyswitch.rs b/tfhe/src/core_crypto/gpu/algorithms/test/lwe_packing_keyswitch.rs new file mode 100644 index 0000000000..ec5f309f69 --- /dev/null +++ b/tfhe/src/core_crypto/gpu/algorithms/test/lwe_packing_keyswitch.rs @@ -0,0 +1,239 @@ +use super::*; +use crate::core_crypto::gpu::algorithms::lwe_packing_keyswitch::cuda_keyswitch_lwe_ciphertext_list_into_glwe_ciphertext; +use crate::core_crypto::gpu::glwe_ciphertext::CudaGlweCiphertext; +use crate::core_crypto::gpu::lwe_ciphertext::CudaLweCiphertext; +use crate::core_crypto::gpu::lwe_ciphertext_list::CudaLweCiphertextList; +use crate::core_crypto::gpu::{cuda_keyswitch_lwe_ciphertext_into_glwe_ciphertext, CudaStreams}; +use serde::de::DeserializeOwned; +use serde::Serialize; + +const NB_TESTS: usize = 10; +fn generate_keys( + params: PackingKeySwitchTestParams, + streams: &CudaStreams, + rsc: &mut TestResources, +) -> CudaPackingKeySwitchKeys { + let lwe_sk = allocate_and_generate_new_binary_lwe_secret_key( + params.lwe_dimension, + &mut rsc.secret_random_generator, + ); + + let glwe_sk = allocate_and_generate_new_binary_glwe_secret_key( + params.glwe_dimension, + params.polynomial_size, + &mut rsc.secret_random_generator, + ); + + let pksk = allocate_and_generate_new_lwe_packing_keyswitch_key( + &lwe_sk, + &glwe_sk, + params.pbs_base_log, + params.pbs_level, + params.glwe_noise_distribution, + params.ciphertext_modulus, + &mut rsc.encryption_random_generator, + ); + + assert!(check_encrypted_content_respects_mod( + &pksk, + params.ciphertext_modulus + )); + + let 
cuda_pksk = CudaLwePackingKeyswitchKey::from_lwe_packing_keyswitch_key(&pksk, &streams); + + CudaPackingKeySwitchKeys { + lwe_sk, + glwe_sk, + pksk: cuda_pksk, + h_pksk: pksk, + } +} + +fn lwe_encrypt_pks_to_glwe_decrypt_custom_mod(params: P) +where + Scalar: UnsignedTorus + CastInto + Serialize + DeserializeOwned, + P: Into>, + PackingKeySwitchTestParams: KeyCacheAccess>, +{ + let params = params.into(); + + let lwe_noise_distribution = params.lwe_noise_distribution; + let ciphertext_modulus = params.ciphertext_modulus; + let message_modulus_log = params.message_modulus_log; + let encoding_with_padding = get_encoding_with_padding(ciphertext_modulus); + + let mut rsc = TestResources::new(); + + let msg_modulus = Scalar::ONE.shl(message_modulus_log.0); + let mut msg = msg_modulus; + let delta: Scalar = encoding_with_padding / msg_modulus; + + let gpu_index = 0; + let stream = CudaStreams::new_single_gpu(gpu_index); + + while msg != Scalar::ZERO { + msg = msg.wrapping_sub(Scalar::ONE); + for _ in 0..NB_TESTS { + let keys = generate_keys(params, &stream, &mut rsc); + let (h_pksk, pksk, lwe_sk, glwe_sk) = + (keys.h_pksk, keys.pksk, keys.lwe_sk, keys.glwe_sk); + + let plaintext = Plaintext(msg * delta); + + let input_lwe = allocate_and_encrypt_new_lwe_ciphertext( + &lwe_sk, + plaintext, + lwe_noise_distribution, + ciphertext_modulus, + &mut rsc.encryption_random_generator, + ); + + let d_input_lwe = CudaLweCiphertext::from_lwe_ciphertext(&input_lwe, &stream); + + assert!(check_encrypted_content_respects_mod( + &input_lwe, + ciphertext_modulus + )); + + let mut d_output_glwe = CudaGlweCiphertext::new( + glwe_sk.glwe_dimension(), + glwe_sk.polynomial_size(), + ciphertext_modulus, + &stream, + ); + + cuda_keyswitch_lwe_ciphertext_into_glwe_ciphertext( + &pksk, + &d_input_lwe, + &mut d_output_glwe, + &stream, + ); + + let output_glwe = d_output_glwe.to_glwe_ciphertext(&stream); + + let mut decrypted_plaintext_list = PlaintextList::new( + Scalar::ZERO, + PlaintextCount(output_glwe.polynomial_size().0), + ); + + decrypt_glwe_ciphertext(&glwe_sk, &output_glwe, &mut decrypted_plaintext_list); + + let decoded = round_decode(*decrypted_plaintext_list.get(0).0, delta) % msg_modulus; + + assert_eq!(msg, decoded); + } + + // In coverage, we break after one while loop iteration, changing message values does not + // yield higher coverage + #[cfg(tarpaulin)] + break; + } +} + +fn lwe_list_encrypt_pks_to_glwe_decrypt_custom_mod(params: P) +where + Scalar: UnsignedTorus + CastInto + Serialize + DeserializeOwned, + P: Into>, + PackingKeySwitchTestParams: KeyCacheAccess>, +{ + let params = params.into(); + + let lwe_noise_distribution = params.lwe_noise_distribution; + let ciphertext_modulus = params.ciphertext_modulus; + let message_modulus_log = params.message_modulus_log; + let encoding_with_padding = get_encoding_with_padding(ciphertext_modulus); + + let mut rsc = TestResources::new(); + + let msg_modulus = Scalar::ONE.shl(message_modulus_log.0); + let mut msg = msg_modulus; + let delta: Scalar = encoding_with_padding / msg_modulus; + + let gpu_index = 0; + let stream = CudaStreams::new_single_gpu(gpu_index); + + // while msg != Scalar::ZERO { + msg = msg.wrapping_sub(Scalar::ONE); + for _ in 0..NB_TESTS { + let keys = generate_keys(params, &stream, &mut rsc); + let (h_pksk, pksk, lwe_sk, glwe_sk) = (keys.h_pksk, keys.pksk, keys.lwe_sk, keys.glwe_sk); + + let mut input_lwe_list = LweCiphertextList::new( + Scalar::ZERO, + lwe_sk.lwe_dimension().to_lwe_size(), + LweCiphertextCount(glwe_sk.polynomial_size().0), 
+ ciphertext_modulus, + ); + + let mut input_plaintext_list = + PlaintextList::new(msg * delta, PlaintextCount(glwe_sk.polynomial_size().0)); + + encrypt_lwe_ciphertext_list( + &lwe_sk, + &mut input_lwe_list, + &input_plaintext_list, + lwe_noise_distribution, + &mut rsc.encryption_random_generator, + ); + + let d_input_lwe_list = + CudaLweCiphertextList::from_lwe_ciphertext_list(&input_lwe_list, &stream); + + assert!(check_encrypted_content_respects_mod( + &input_lwe_list, + ciphertext_modulus + )); + + let mut d_output_glwe = CudaGlweCiphertext::new( + glwe_sk.glwe_dimension(), + glwe_sk.polynomial_size(), + ciphertext_modulus, + &stream, + ); + + cuda_keyswitch_lwe_ciphertext_list_into_glwe_ciphertext( + &pksk, + &d_input_lwe_list, + &mut d_output_glwe, + &stream, + ); + + let mut reference_output_glwe = GlweCiphertext::new( + Scalar::ZERO, + glwe_sk.glwe_dimension().to_glwe_size(), + glwe_sk.polynomial_size(), + ciphertext_modulus, + ); + + keyswitch_lwe_ciphertext_list_and_pack_in_glwe_ciphertext( + &h_pksk, + &input_lwe_list, + &mut reference_output_glwe, + ); + + let output_glwe = d_output_glwe.to_glwe_ciphertext(&stream); + + let mut decrypted_plaintext_list = PlaintextList::new( + Scalar::ZERO, + PlaintextCount(output_glwe.polynomial_size().0), + ); + + decrypt_glwe_ciphertext(&glwe_sk, &output_glwe, &mut decrypted_plaintext_list); + + decrypted_plaintext_list + .iter_mut() + .for_each(|x| *x.0 = round_decode(*x.0, delta) % msg_modulus); + input_plaintext_list.iter_mut().for_each(|x| *x.0 /= delta); + + assert_eq!(decrypted_plaintext_list, input_plaintext_list); + } + + // In coverage, we break after one while loop iteration, changing message values does not + // yield higher coverage + #[cfg(tarpaulin)] + break; + // } +} + +create_gpu_parametrized_test!(lwe_encrypt_pks_to_glwe_decrypt_custom_mod); +create_gpu_parametrized_test!(lwe_list_encrypt_pks_to_glwe_decrypt_custom_mod); diff --git a/tfhe/src/core_crypto/gpu/algorithms/test/mod.rs b/tfhe/src/core_crypto/gpu/algorithms/test/mod.rs index 59ba20af18..7235d3e1ac 100644 --- a/tfhe/src/core_crypto/gpu/algorithms/test/mod.rs +++ b/tfhe/src/core_crypto/gpu/algorithms/test/mod.rs @@ -4,8 +4,16 @@ mod glwe_sample_extraction; mod lwe_keyswitch; mod lwe_linear_algebra; mod lwe_multi_bit_programmable_bootstrapping; +mod lwe_packing_keyswitch; mod lwe_programmable_bootstrapping; +pub struct CudaPackingKeySwitchKeys { + pub lwe_sk: LweSecretKey>, + pub glwe_sk: GlweSecretKey>, + pub pksk: CudaLwePackingKeyswitchKey, + pub h_pksk: LwePackingKeyswitchKeyOwned, +} + // Macro to generate tests for all parameter sets macro_rules! create_gpu_parametrized_test{ ($name:ident { $($param:ident),* }) => { @@ -47,4 +55,5 @@ macro_rules! 
create_gpu_multi_bit_parametrized_test{ }; } +use crate::core_crypto::gpu::lwe_packing_keyswitch_key::CudaLwePackingKeyswitchKey; use {create_gpu_multi_bit_parametrized_test, create_gpu_parametrized_test}; diff --git a/tfhe/src/core_crypto/gpu/entities/lwe_ciphertext_list.rs b/tfhe/src/core_crypto/gpu/entities/lwe_ciphertext_list.rs index 9defa579f6..9fc0fad460 100644 --- a/tfhe/src/core_crypto/gpu/entities/lwe_ciphertext_list.rs +++ b/tfhe/src/core_crypto/gpu/entities/lwe_ciphertext_list.rs @@ -4,6 +4,7 @@ use crate::core_crypto::prelude::{ CiphertextModulus, Container, LweCiphertext, LweCiphertextCount, LweCiphertextList, LweDimension, LweSize, UnsignedInteger, }; +use std::cmp::min; use tfhe_cuda_backend::cuda_bind::cuda_memcpy_async_gpu_to_gpu; /// A structure representing a vector of LWE ciphertexts with 64 bits of precision on the GPU. @@ -229,11 +230,7 @@ impl CudaLweCiphertextList { let ciphertext_modulus = self.ciphertext_modulus(); // Copy to the GPU - let mut d_vec = CudaVec::new(self.0.d_vec.len(), streams, 0); - unsafe { - d_vec.copy_from_gpu_async(&self.0.d_vec, streams, 0); - } - streams.synchronize(); + let d_vec = unsafe { self.0.d_vec.duplicate(streams, 0) }; let cuda_lwe_list = CudaLweList { d_vec, @@ -244,6 +241,62 @@ impl CudaLweCiphertextList { Self(cuda_lwe_list) } + // Retrieve [start,end) lwe ciphertexts in the list + pub fn retrieve_subset( + &self, + start: usize, + end: usize, + streams: &CudaStreams, + gpu_index: u32, + ) -> Option { + if end < start { + None + } else { + let lwe_dimension = self.lwe_dimension(); + let lwe_ciphertext_count = + LweCiphertextCount(min(end - start, self.lwe_ciphertext_count().0)); + let ciphertext_modulus = self.ciphertext_modulus(); + + // Copy to the GPU + let d_vec = unsafe { + let mut d_vec = CudaVec::new_async( + lwe_ciphertext_count.0 * lwe_dimension.to_lwe_size().0, + streams, + gpu_index, + ); + // Todo: We might use copy_src_range_gpu_to_gpu_async here + let src_ptr = self + .0 + .d_vec + .as_c_ptr(gpu_index) + .add(start * lwe_dimension.to_lwe_size().0 * std::mem::size_of::()); + let size = lwe_ciphertext_count.0 + * lwe_dimension.to_lwe_size().0 + * std::mem::size_of::(); + cuda_memcpy_async_gpu_to_gpu( + d_vec.as_mut_c_ptr(gpu_index), + src_ptr, + size as u64, + streams.ptr[gpu_index as usize], + streams.gpu_indexes[gpu_index as usize], + ); + + d_vec + }; + + streams.synchronize(); + + let cuda_lwe_list = CudaLweList { + d_vec, + lwe_ciphertext_count, + lwe_dimension, + ciphertext_modulus, + }; + + Some(CudaLweCiphertextList { 0: cuda_lwe_list }) + } + } + pub(crate) fn lwe_dimension(&self) -> LweDimension { self.0.lwe_dimension } diff --git a/tfhe/src/core_crypto/gpu/entities/lwe_packing_keyswitch_key.rs b/tfhe/src/core_crypto/gpu/entities/lwe_packing_keyswitch_key.rs new file mode 100644 index 0000000000..19de6d7a18 --- /dev/null +++ b/tfhe/src/core_crypto/gpu/entities/lwe_packing_keyswitch_key.rs @@ -0,0 +1,87 @@ +use crate::core_crypto::gpu::vec::CudaVec; +use crate::core_crypto::gpu::{convert_lwe_keyswitch_key_async, CudaStreams}; +use crate::core_crypto::prelude::{ + lwe_packing_keyswitch_key_input_key_element_encrypted_size, CiphertextModulus, + DecompositionBaseLog, DecompositionLevelCount, GlweSize, LweDimension, + LwePackingKeyswitchKeyOwned, PolynomialSize, UnsignedInteger, +}; + +/// A keyswitching key allowing to keyswitch [`an LWE ciphertext`](super::LweCiphertext) to +/// [`a GLWE ciphertext`](super::GlweCiphertext) allowing to pack several LWE ciphertexts into a +/// GLWE ciphertext. 
+#[derive(Debug)]
+pub struct CudaLwePackingKeyswitchKey<T: UnsignedInteger> {
+    pub(crate) d_vec: CudaVec<T>,
+    decomp_base_log: DecompositionBaseLog,
+    decomp_level_count: DecompositionLevelCount,
+    output_glwe_size: GlweSize,
+    output_polynomial_size: PolynomialSize,
+    ciphertext_modulus: CiphertextModulus<T>,
+}
+
+impl<T: UnsignedInteger> CudaLwePackingKeyswitchKey<T> {
+    pub fn from_lwe_packing_keyswitch_key(
+        h_ksk: &LwePackingKeyswitchKeyOwned<T>,
+        streams: &CudaStreams,
+    ) -> Self {
+        let decomp_base_log = h_ksk.decomposition_base_log();
+        let decomp_level_count = h_ksk.decomposition_level_count();
+        let input_lwe_size = h_ksk.input_key_lwe_dimension().to_lwe_size();
+        let output_glwe_size = h_ksk.output_key_glwe_dimension().to_glwe_size();
+        let output_polynomial_size = h_ksk.output_polynomial_size();
+        let ciphertext_modulus = h_ksk.ciphertext_modulus();
+
+        // Allocate memory
+        let mut d_vec = CudaVec::<T>::new_multi_gpu(
+            input_lwe_size.to_lwe_dimension().0
+                * lwe_packing_keyswitch_key_input_key_element_encrypted_size(
+                    decomp_level_count,
+                    output_glwe_size,
+                    output_polynomial_size,
+                ),
+            streams,
+        );
+
+        unsafe {
+            convert_lwe_keyswitch_key_async(streams, &mut d_vec, h_ksk.as_ref());
+        }
+
+        streams.synchronize();
+
+        Self {
+            d_vec,
+            decomp_base_log,
+            decomp_level_count,
+            output_glwe_size,
+            output_polynomial_size,
+            ciphertext_modulus,
+        }
+    }
+
+    pub(crate) fn decomposition_base_log(&self) -> DecompositionBaseLog {
+        self.decomp_base_log
+    }
+    pub(crate) fn decomposition_level_count(&self) -> DecompositionLevelCount {
+        self.decomp_level_count
+    }
+
+    pub(crate) fn output_glwe_size(&self) -> GlweSize {
+        self.output_glwe_size
+    }
+    pub(crate) fn ciphertext_modulus(&self) -> CiphertextModulus<T> {
+        self.ciphertext_modulus
+    }
+    pub(crate) fn output_polynomial_size(&self) -> PolynomialSize {
+        self.output_polynomial_size
+    }
+    pub fn input_key_lwe_dimension(&self) -> LweDimension {
+        LweDimension(
+            self.d_vec.len
+                / lwe_packing_keyswitch_key_input_key_element_encrypted_size(
+                    self.decomp_level_count,
+                    self.output_glwe_size,
+                    self.output_polynomial_size,
+                ),
+        )
+    }
+}
diff --git a/tfhe/src/core_crypto/gpu/entities/mod.rs b/tfhe/src/core_crypto/gpu/entities/mod.rs
index 38a3e27bab..f76f8b5bde 100644
--- a/tfhe/src/core_crypto/gpu/entities/mod.rs
+++ b/tfhe/src/core_crypto/gpu/entities/mod.rs
@@ -5,3 +5,4 @@ pub mod lwe_ciphertext;
 pub mod lwe_ciphertext_list;
 pub mod lwe_keyswitch_key;
 pub mod lwe_multi_bit_bootstrap_key;
+pub mod lwe_packing_keyswitch_key;
diff --git a/tfhe/src/core_crypto/gpu/mod.rs b/tfhe/src/core_crypto/gpu/mod.rs
index 22edcfba73..a6e57c11c8 100644
--- a/tfhe/src/core_crypto/gpu/mod.rs
+++ b/tfhe/src/core_crypto/gpu/mod.rs
@@ -275,6 +275,72 @@ pub unsafe fn convert_lwe_keyswitch_key_async(
     dest.copy_from_cpu_multi_gpu_async(src, streams);
 }
 
+/// Discarding packing keyswitch on a single LWE ciphertext
+///
+/// # Safety
+///
+/// [CudaStreams::synchronize] __must__ be called as soon as synchronization is
+/// required
+#[allow(clippy::too_many_arguments)]
+pub unsafe fn packing_keyswitch_async<T: UnsignedInteger>(
+    streams: &CudaStreams,
+    glwe_array_out: &mut CudaVec<T>,
+    lwe_array_in: &CudaVec<T>,
+    input_lwe_dimension: LweDimension,
+    output_glwe_dimension: GlweDimension,
+    output_polynomial_size: PolynomialSize,
+    fp_keyswitch_key: &CudaVec<T>,
+    base_log: DecompositionBaseLog,
+    l_gadget: DecompositionLevelCount,
+) {
+    cuda_fp_keyswitch_lwe_to_glwe_64(
+        streams.ptr[0],
+        streams.gpu_indexes[0],
+        glwe_array_out.as_mut_c_ptr(0),
+        lwe_array_in.as_c_ptr(0),
+        fp_keyswitch_key.as_c_ptr(0),
+        input_lwe_dimension.0
as u32, + output_glwe_dimension.0 as u32, + output_polynomial_size.0 as u32, + base_log.0 as u32, + l_gadget.0 as u32, + ); +} + +/// Discarding packing keyswitch on a vector of LWE ciphertexts +/// +/// # Safety +/// +/// [CudaStreams::synchronize] __must__ be called as soon as synchronization is +/// required +#[allow(clippy::too_many_arguments)] +pub unsafe fn packing_keyswitch_list_async( + streams: &CudaStreams, + glwe_array_out: &mut CudaVec, + lwe_array_in: &CudaVec, + input_lwe_dimension: LweDimension, + output_glwe_dimension: GlweDimension, + output_polynomial_size: PolynomialSize, + fp_keyswitch_key: &CudaVec, + base_log: DecompositionBaseLog, + l_gadget: DecompositionLevelCount, + num_lwes: LweCiphertextCount, +) { + cuda_fp_keyswitch_lwe_list_to_glwe_64( + streams.ptr[0], + streams.gpu_indexes[0], + glwe_array_out.as_mut_c_ptr(0), + lwe_array_in.as_c_ptr(0), + fp_keyswitch_key.as_c_ptr(0), + input_lwe_dimension.0 as u32, + output_glwe_dimension.0 as u32, + output_polynomial_size.0 as u32, + base_log.0 as u32, + l_gadget.0 as u32, + num_lwes.0 as u32, + ); +} + /// Convert programmable bootstrap key /// /// # Safety diff --git a/tfhe/src/integer/client_key/radix.rs b/tfhe/src/integer/client_key/radix.rs index 36333ddbc2..56d6493a2b 100644 --- a/tfhe/src/integer/client_key/radix.rs +++ b/tfhe/src/integer/client_key/radix.rs @@ -1,10 +1,16 @@ //! Definition of the client key for radix decomposition use super::{ClientKey, RecomposableSignedInteger, SecretEncryptionKeyView}; +#[cfg(feature = "gpu")] +use crate::core_crypto::gpu::CudaStreams; use crate::core_crypto::prelude::{SignedNumeric, UnsignedNumeric}; use crate::integer::backward_compatibility::client_key::RadixClientKeyVersions; use crate::integer::block_decomposition::{DecomposableInto, RecomposableFrom}; use crate::integer::ciphertext::{RadixCiphertext, SignedRadixCiphertext}; +#[cfg(feature = "gpu")] +use crate::integer::gpu::list_compression::server_keys::{ + CudaCompressionKey, CudaDecompressionKey, +}; use crate::integer::BooleanBlock; use crate::shortint::{Ciphertext as ShortintCiphertext, PBSParameters as ShortintParameters}; use serde::{Deserialize, Serialize}; @@ -131,6 +137,32 @@ impl RadixClientKey { pub fn num_blocks(&self) -> usize { self.num_blocks } + + pub fn new_compression_private_key( + &self, + params: CompressionParameters, + ) -> CompressionPrivateKeys { + self.key.key.new_compression_private_key(params) + } + + pub fn new_compression_decompression_keys( + &self, + private_compression_key: &CompressionPrivateKeys, + ) -> (CompressionKey, DecompressionKey) { + self.key + .key + .new_compression_decompression_keys(private_compression_key) + } + #[cfg(feature = "gpu")] + pub fn new_cuda_compression_decompression_keys( + &self, + private_compression_key: &CompressionPrivateKeys, + streams: &CudaStreams, + ) -> (CudaCompressionKey, CudaDecompressionKey) { + self.key + .key + .new_cuda_compression_decompression_keys(private_compression_key, streams) + } } impl From<(ClientKey, usize)> for RadixClientKey { diff --git a/tfhe/src/integer/gpu/ciphertext/compressed_ciphertext_list.rs b/tfhe/src/integer/gpu/ciphertext/compressed_ciphertext_list.rs new file mode 100644 index 0000000000..3d2fa6bd20 --- /dev/null +++ b/tfhe/src/integer/gpu/ciphertext/compressed_ciphertext_list.rs @@ -0,0 +1,196 @@ +use crate::core_crypto::gpu::glwe_ciphertext_list::CudaGlweCiphertextList; +use crate::core_crypto::gpu::lwe_ciphertext_list::CudaLweCiphertextList; +use crate::core_crypto::gpu::vec::CudaVec; +use 
crate::core_crypto::gpu::CudaStreams; +use crate::core_crypto::prelude::LweCiphertextCount; +use crate::integer::ciphertext::DataKind; +use crate::integer::gpu::ciphertext::info::{CudaBlockInfo, CudaRadixCiphertextInfo}; +use crate::integer::gpu::ciphertext::{ + CudaIntegerRadixCiphertext, CudaRadixCiphertext, CudaSignedRadixCiphertext, + CudaUnsignedRadixCiphertext, +}; +use crate::integer::gpu::list_compression::server_keys::{ + CudaCompressionKey, CudaDecompressionKey, +}; +use itertools::Itertools; +use tfhe_cuda_backend::cuda_bind::cuda_memcpy_async_gpu_to_gpu; + +pub struct CudaCompressedCiphertextList { + pub(crate) packed_list: (CudaGlweCiphertextList, Vec), + info: Vec, +} +impl CudaCompressedCiphertextList { + pub fn len(&self) -> usize { + self.info.len() + } + + pub fn is_empty(&self) -> bool { + self.info.len() == 0 + } + + pub fn blocks_of( + &self, + index: usize, + decomp_key: &CudaDecompressionKey, + streams: &CudaStreams, + gpu_index: u32, + ) -> Option<(CudaRadixCiphertext, DataKind)> { + let preceding_infos = self.info.get(..index)?; + let current_info = self.info.get(index).copied()?; + + let start_block_index: usize = preceding_infos + .iter() + .copied() + .map(DataKind::num_blocks) + .sum(); + + let end_block_index = start_block_index + current_info.num_blocks(); + + Some(( + decomp_key.unpack( + &self.packed_list, + start_block_index, + end_block_index, + streams, + gpu_index, + ), + current_info, + )) + } +} + +pub trait CudaCompressible { + fn compress_into( + self, + messages: &mut Vec, + streams: &CudaStreams, + ) -> DataKind; +} + +// Todo: Can we combine these two impl using CudaIntegerRadixCiphertext? +impl CudaCompressible for CudaSignedRadixCiphertext { + fn compress_into( + self, + messages: &mut Vec, + streams: &CudaStreams, + ) -> DataKind { + let x = self.ciphertext.duplicate(streams); + + let copy = x.duplicate(streams); + messages.push(copy); + + let num_blocks = x.d_blocks.lwe_ciphertext_count().0; + DataKind::Signed(num_blocks) + } +} +impl CudaCompressible for CudaUnsignedRadixCiphertext { + fn compress_into( + self, + messages: &mut Vec, + streams: &CudaStreams, + ) -> DataKind { + let x = self.ciphertext.duplicate(streams); + + let copy = x.duplicate(streams); + messages.push(copy); + + let num_blocks = x.d_blocks.lwe_ciphertext_count().0; + + DataKind::Unsigned(num_blocks) + } +} + +pub struct CompressedCudaCiphertextListBuilder { + pub(crate) ciphertexts: Vec, + pub(crate) info: Vec, +} + +impl CompressedCudaCiphertextListBuilder { + #[allow(clippy::new_without_default)] + pub fn new() -> Self { + Self { + ciphertexts: vec![], + info: vec![], + } + } + + /// ```rust + /// use tfhe::CompressedCiphertextListBuilder;use tfhe::core_crypto::gpu::CudaStreams;use tfhe::integer::gpu::ciphertext::{CudaSignedRadixCiphertext, CudaUnsignedRadixCiphertext}; + /// use tfhe::integer::gpu::gen_keys_radix_gpu;use tfhe::integer::{IntegerCiphertext, RadixCiphertext, RadixClientKey, SignedRadixCiphertext}; + /// use tfhe::integer::gpu::ciphertext::compressed_ciphertext_list::CompressedCudaCiphertextListBuilder; + /// use tfhe::shortint::parameters::list_compression::COMP_PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M64; + /// use tfhe::shortint::prelude::PARAM_MESSAGE_2_CARRY_2_KS_PBS; + /// use tfhe::integer::gpu::list_compression::server_keys::*; + /// + /// let gpu_index = 0; + /// let mut streams = CudaStreams::new_single_gpu(gpu_index); + /// + /// // Generate the client key and the server key: + /// let num_blocks = 4; + /// let (cks, sks) = 
gen_keys_radix_gpu(PARAM_MESSAGE_2_CARRY_2_KS_PBS, num_blocks, &mut streams); /// + /// + /// let private_compression_key = + /// cks.new_compression_private_key(COMP_PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M64); + /// + /// let (cuda_compression_key, cuda_decompression_key) = + /// cks.new_cuda_compression_decompression_keys(&private_compression_key, &streams); + /// + /// let ct1 = cks.encrypt(3_u32); + /// let ct2 = cks.encrypt(2_u32); + /// let ct3 = cks.encrypt_signed(-2); + /// // Copy to GPU + /// let d_ct1 = CudaUnsignedRadixCiphertext::from_radix_ciphertext(&ct1, &mut streams); + /// let d_ct2 = CudaUnsignedRadixCiphertext::from_radix_ciphertext(&ct2, &mut streams); + /// let d_ct3 = CudaSignedRadixCiphertext::from_signed_radix_ciphertext(&ct3, &mut streams); + /// + /// let compressed = CompressedCudaCiphertextListBuilder::new() + /// .push(d_ct1, &streams) + /// .push(d_ct2, &streams).push(d_ct3, &streams). + /// build(&cuda_compression_key, &streams,0); + /// + /// let a = CudaUnsignedRadixCiphertext {ciphertext: compressed.blocks_of(0, &cuda_decompression_key, + /// &streams, 0).unwrap().0}; + /// + /// let result = a.to_radix_ciphertext(&streams); + /// let decrypted: u32 = cks.decrypt(&result); + /// assert_eq!(decrypted, 3_u32); + /// + /// let b = CudaUnsignedRadixCiphertext {ciphertext: compressed.blocks_of(1, + /// &cuda_decompression_key, &streams, 0).unwrap().0}; + /// + /// let result = b.to_radix_ciphertext(&streams); + /// let decrypted: u32 = + /// cks.decrypt(&result); + /// assert_eq!(decrypted, 2_u32); + /// + /// let c = CudaSignedRadixCiphertext {ciphertext: compressed.blocks_of(2, + /// &cuda_decompression_key, &streams, 0).unwrap().0}; + /// + /// let result = c.to_signed_radix_ciphertext(&streams); + /// let decrypted: i32 = + /// cks.decrypt_signed(&result); + /// assert_eq!(decrypted, -2); + pub fn push(&mut self, data: T, streams: &CudaStreams) -> &mut Self { + let kind = data.compress_into(&mut self.ciphertexts, streams); + + if kind.num_blocks() != 0 { + self.info.push(kind); + } + + self + } + + pub fn build( + &self, + comp_key: &CudaCompressionKey, + streams: &CudaStreams, + gpu_index: u32, + ) -> CudaCompressedCiphertextList { + let packed_list = + comp_key.compress_ciphertexts_into_list(&self.ciphertexts, streams, gpu_index); + CudaCompressedCiphertextList { + packed_list: packed_list, + info: self.info.clone(), + } + } +} diff --git a/tfhe/src/integer/gpu/ciphertext/mod.rs b/tfhe/src/integer/gpu/ciphertext/mod.rs index 7f0408eddb..51e702d0c8 100644 --- a/tfhe/src/integer/gpu/ciphertext/mod.rs +++ b/tfhe/src/integer/gpu/ciphertext/mod.rs @@ -1,4 +1,5 @@ pub mod boolean_value; +pub mod compressed_ciphertext_list; pub mod info; use crate::core_crypto::gpu::lwe_ciphertext_list::CudaLweCiphertextList; diff --git a/tfhe/src/integer/gpu/list_compression/mod.rs b/tfhe/src/integer/gpu/list_compression/mod.rs new file mode 100644 index 0000000000..7cee096af1 --- /dev/null +++ b/tfhe/src/integer/gpu/list_compression/mod.rs @@ -0,0 +1 @@ +pub mod server_keys; diff --git a/tfhe/src/integer/gpu/list_compression/server_keys.rs b/tfhe/src/integer/gpu/list_compression/server_keys.rs new file mode 100644 index 0000000000..d21a4f162c --- /dev/null +++ b/tfhe/src/integer/gpu/list_compression/server_keys.rs @@ -0,0 +1,300 @@ +use crate::core_crypto::entities::LweCiphertext; +use crate::core_crypto::gpu::entities::lwe_packing_keyswitch_key::CudaLwePackingKeyswitchKey; +use crate::core_crypto::gpu::glwe_ciphertext_list::CudaGlweCiphertextList; +use 
crate::core_crypto::gpu::lwe_bootstrap_key::CudaLweBootstrapKey; +use crate::core_crypto::gpu::lwe_ciphertext_list::CudaLweCiphertextList; +use crate::core_crypto::gpu::vec::CudaVec; +use crate::core_crypto::gpu::CudaStreams; +use crate::core_crypto::prelude::{ + allocate_and_generate_new_lwe_packing_keyswitch_key, + par_allocate_and_generate_new_lwe_bootstrap_key, CiphertextModulusLog, GlweCiphertextCount, + LweBootstrapKeyOwned, LweCiphertextCount, LweCiphertextList, LweSize, +}; +use crate::integer::ciphertext::DataKind; +use crate::integer::gpu::ciphertext::compressed_ciphertext_list::CudaCompressedCiphertextList; +use crate::integer::gpu::ciphertext::info::{CudaBlockInfo, CudaRadixCiphertextInfo}; +use crate::integer::gpu::ciphertext::CudaRadixCiphertext; +use crate::integer::gpu::server_key::CudaBootstrappingKey; +use crate::integer::gpu::{ + cuda_memcpy_async_gpu_to_gpu, unchecked_compression_compress_integer_radix_async, PBSType, +}; +use crate::shortint::client_key::ClientKey; +use crate::shortint::engine::ShortintEngine; +use crate::shortint::list_compression::{CompressionKey, CompressionPrivateKeys}; +use crate::shortint::{ + CiphertextModulus, ClassicPBSParameters, EncryptionKeyChoice, PBSParameters, +}; +use itertools::Itertools; + +#[derive(Debug)] +pub struct CudaCompressionKey { + pub packing_key_switching_key: CudaLwePackingKeyswitchKey, + pub lwe_per_glwe: LweCiphertextCount, + pub storage_log_modulus: CiphertextModulusLog, +} + +pub struct CudaDecompressionKey { + pub blind_rotate_key: CudaBootstrappingKey, + pub lwe_per_glwe: LweCiphertextCount, + pub ciphertext_modulus: CiphertextModulus, //TODO: Remove this +} + +impl CudaCompressionKey { + pub fn from_compression_key(compression_key: &CompressionKey, streams: &CudaStreams) -> Self { + CudaCompressionKey { + packing_key_switching_key: CudaLwePackingKeyswitchKey::from_lwe_packing_keyswitch_key( + &compression_key.packing_key_switching_key, + streams, + ), + lwe_per_glwe: compression_key.lwe_per_glwe, + storage_log_modulus: compression_key.storage_log_modulus, + } + } + + fn flatten( + &self, + vec_ciphertexts: &Vec, + streams: &CudaStreams, + gpu_index: u32, + ) -> (CudaLweCiphertextList, Vec) { + let first_ct = &vec_ciphertexts.first().unwrap().d_blocks; + + // We assume all ciphertexts will have the same lwe dimension + let lwe_dimension = first_ct.lwe_dimension(); + let ciphertext_modulus = first_ct.ciphertext_modulus(); + + // Compute total number of lwe ciphertexts we will be handling + let total_num_blocks: usize = vec_ciphertexts + .iter() + .map(|x| x.d_blocks.lwe_ciphertext_count().0) + .sum(); + + let lwe_ciphertext_count = LweCiphertextCount(total_num_blocks); + + let d_vec = unsafe { + let mut d_vec = CudaVec::new_async( + lwe_dimension.to_lwe_size().0 * lwe_ciphertext_count.0, + streams, + gpu_index, + ); + let mut offset: usize = 0; + for ciphertext in vec_ciphertexts { + // Todo: We might use copy_self_range_gpu_to_gpu_async here + let dest_ptr = d_vec + .as_mut_c_ptr(gpu_index) + .add(offset * std::mem::size_of::()); + let size = ciphertext.d_blocks.0.d_vec.len * std::mem::size_of::(); + cuda_memcpy_async_gpu_to_gpu( + dest_ptr, + ciphertext.d_blocks.0.d_vec.as_c_ptr(gpu_index), + size as u64, + streams.ptr[gpu_index as usize], + streams.gpu_indexes[gpu_index as usize], + ); + + offset += ciphertext.d_blocks.0.d_vec.len; + } + + streams.synchronize(); + d_vec + }; + + let flattened_ciphertexts = + CudaLweCiphertextList::from_cuda_vec(d_vec, lwe_ciphertext_count, ciphertext_modulus); + + let info = 
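+            // Keep the per-block metadata (message and carry modulus, etc.) next to the packed
+            // data: `unpack` clones it back into the `CudaRadixCiphertextInfo` of the
+            // decompressed ciphertext.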
vec_ciphertexts + .iter() + .flat_map(|x| x.info.blocks.clone()) + .collect_vec(); + + (flattened_ciphertexts, info) + } + + pub fn compress_ciphertexts_into_list( + &self, + ciphertexts: &Vec, + streams: &CudaStreams, + gpu_index: u32, + ) -> (CudaGlweCiphertextList, Vec) { + let lwe_pksk = &self.packing_key_switching_key; + + let polynomial_size = lwe_pksk.output_polynomial_size(); + let ciphertext_modulus = lwe_pksk.ciphertext_modulus(); + let glwe_size = lwe_pksk.output_glwe_size(); + let lwe_size = lwe_pksk.input_key_lwe_dimension().to_lwe_size(); + println!("compress lwe_size: {:?}", lwe_size); + println!("compress polynomial_size: {:?}", polynomial_size); + + let first_ct_info = ciphertexts.first().unwrap().info.blocks.first().unwrap(); + let message_modulus = first_ct_info.message_modulus; + let carry_modulus = first_ct_info.carry_modulus; + + let num_lwes: usize = ciphertexts + .iter() + .map(|x| x.d_blocks.lwe_ciphertext_count().0) + .sum(); + + let mut output_glwe = CudaGlweCiphertextList::new( + glwe_size.to_glwe_dimension(), + polynomial_size, + GlweCiphertextCount(ciphertexts.len()), + ciphertext_modulus, + streams, + ); + + let (input_lwes, info) = self.flatten(ciphertexts, streams, gpu_index); + + unsafe { + unchecked_compression_compress_integer_radix_async( + streams, + &mut output_glwe.0.d_vec, + &input_lwes.0.d_vec, + &self.packing_key_switching_key.d_vec, + message_modulus, + carry_modulus, + glwe_size.to_glwe_dimension(), + polynomial_size, + lwe_size.to_lwe_dimension(), + lwe_pksk.decomposition_base_log(), + lwe_pksk.decomposition_level_count(), + self.lwe_per_glwe.0 as u32, + self.storage_log_modulus.0 as u32, + num_lwes as u32, + ); + } + + (output_glwe, info) + } +} + +impl CudaDecompressionKey { + pub fn unpack( + &self, + packed_list: &(CudaGlweCiphertextList, Vec), + start_block_index: usize, + end_block_index: usize, + streams: &CudaStreams, + gpu_index: u32, + ) -> CudaRadixCiphertext { + let glwe_dimension = packed_list.0.glwe_dimension(); + let polynomial_size = packed_list.0.polynomial_size(); + let lwe_ciphertext_count = LweCiphertextCount(end_block_index - start_block_index); + //let lwe_size = self.blind_rotate_key.output_lwe_dimension().to_lwe_size(); + let lwe_size = LweSize(0); + println!("decompress lwe_size: {:?}", lwe_size); + println!( + "decompress lwe_ciphertext_count: {:?}", + lwe_ciphertext_count + ); + println!("decompress polynomial_size: {:?}", polynomial_size); + let output_lwe = CudaLweCiphertextList::new( + lwe_size.to_lwe_dimension(), + lwe_ciphertext_count, + self.ciphertext_modulus, + streams, + ); + unsafe { + // unchecked_compression_compress_integer_radix_async(streams, + // &mut output_glwe.0.d_vec, + // &input_lwes.0.d_vec, + // + // &self.packing_key_switching_key.d_vec, + // message_modulus, + // carry_modulus, + // glwe_size.to_glwe_dimension(), + // polynomial_size, + // lwe_size.to_lwe_dimension(), + // lwe_pksk.decomposition_base_log(), + // lwe_pksk.decomposition_level_count(), + // self.lwe_per_glwe.0 as u32, + // self.storage_log_modulus.0 as u32, + // num_lwes as u32, ); + } + + CudaRadixCiphertext { + d_blocks: output_lwe, + info: CudaRadixCiphertextInfo { + blocks: packed_list.1.clone(), + }, + } + } +} + +impl ClientKey { + pub fn new_cuda_compression_decompression_keys( + &self, + private_compression_key: &CompressionPrivateKeys, + streams: &CudaStreams, + ) -> (CudaCompressionKey, CudaDecompressionKey) { + let params = &private_compression_key.params; + let cks_params: ClassicPBSParameters = match 
self.parameters.pbs_parameters().unwrap() { + PBSParameters::PBS(a) => a, + PBSParameters::MultiBitPBS(_) => { + panic!("Compression is currently not compatible with Multi Bit PBS") + } + }; + + assert_eq!( + cks_params.encryption_key_choice, + EncryptionKeyChoice::Big, + "Compression is only compatible with ciphertext in post PBS dimension" + ); + + let packing_key_switching_key = ShortintEngine::with_thread_local_mut(|engine| { + allocate_and_generate_new_lwe_packing_keyswitch_key( + &self.large_lwe_secret_key(), + &private_compression_key.post_packing_ks_key, + params.packing_ks_base_log, + params.packing_ks_level, + params.packing_ks_key_noise_distribution, + self.parameters.ciphertext_modulus(), + &mut engine.encryption_generator, + ) + }); + + assert!( + private_compression_key.params.storage_log_modulus.0 + <= cks_params + .polynomial_size + .to_blind_rotation_input_modulus_log() + .0, + "Compression parameters say to store more bits than useful" + ); + + let glwe_compression_key = CompressionKey { + packing_key_switching_key, + lwe_per_glwe: params.lwe_per_glwe, + storage_log_modulus: private_compression_key.params.storage_log_modulus, + }; + + let mut engine = ShortintEngine::new(); + let h_bootstrap_key: LweBootstrapKeyOwned = + par_allocate_and_generate_new_lwe_bootstrap_key( + &private_compression_key + .post_packing_ks_key + .as_lwe_secret_key(), + &self.glwe_secret_key, + private_compression_key.params.br_base_log, + private_compression_key.params.br_level, + self.parameters.glwe_noise_distribution(), + self.parameters.ciphertext_modulus(), + &mut engine.encryption_generator, + ); + + let d_bootstrap_key = + CudaLweBootstrapKey::from_lwe_bootstrap_key(&h_bootstrap_key, streams); + + let blind_rotate_key = CudaBootstrappingKey::Classic(d_bootstrap_key); + + let cuda_glwe_decompression_key = CudaDecompressionKey { + blind_rotate_key, + lwe_per_glwe: params.lwe_per_glwe, + ciphertext_modulus: self.parameters.ciphertext_modulus(), + }; + + ( + CudaCompressionKey::from_compression_key(&glwe_compression_key, streams), + cuda_glwe_decompression_key, + ) + } +} diff --git a/tfhe/src/integer/gpu/mod.rs b/tfhe/src/integer/gpu/mod.rs index 654ddce257..4406b72a15 100644 --- a/tfhe/src/integer/gpu/mod.rs +++ b/tfhe/src/integer/gpu/mod.rs @@ -1,6 +1,6 @@ pub mod ciphertext; +pub mod list_compression; pub mod server_key; - use crate::core_crypto::gpu::slice::{CudaSlice, CudaSliceMut}; use crate::core_crypto::gpu::vec::CudaVec; use crate::core_crypto::gpu::{get_max_shared_memory, CudaStreams}; @@ -12,6 +12,7 @@ use crate::integer::{ClientKey, RadixClientKey}; use crate::shortint::{CarryModulus, MessageModulus}; pub use server_key::CudaServerKey; use std::cmp::min; + use tfhe_cuda_backend::cuda_bind::*; #[repr(u32)] @@ -32,6 +33,13 @@ pub enum PBSType { Classical = 1, } +#[allow(dead_code)] +#[repr(u32)] +pub enum CompressionType { + Compress = 0, + Decompress = 1, +} + #[repr(u32)] pub enum ShiftRotateType { LeftShift = 0, @@ -273,6 +281,157 @@ pub unsafe fn unchecked_scalar_mul_integer_radix_kb_async( + streams: &CudaStreams, + glwe_array_out: &mut CudaVec, + lwe_array_in: &CudaVec, + fp_keyswitch_key: &CudaVec, + message_modulus: MessageModulus, + carry_modulus: CarryModulus, + glwe_dimension: GlweDimension, + polynomial_size: PolynomialSize, + lwe_dimension: LweDimension, + ks_base_log: DecompositionBaseLog, + ks_level: DecompositionLevelCount, + lwe_per_glwe: u32, + storage_log_modulus: u32, + num_blocks: u32, +) { + assert_eq!( + streams.gpu_indexes[0], + lwe_array_in.gpu_index(0), + 
"GPU error: all data should reside on the same GPU." + ); + assert_eq!( + streams.gpu_indexes[0], + fp_keyswitch_key.gpu_index(0), + "GPU error: all data should reside on the same GPU." + ); + let mut mem_ptr: *mut i8 = std::ptr::null_mut(); + scratch_cuda_compression_integer_radix_ciphertext_64( + streams.ptr.as_ptr(), + streams.gpu_indexes.as_ptr(), + streams.len() as u32, + std::ptr::addr_of_mut!(mem_ptr), + glwe_dimension.0 as u32, + polynomial_size.0 as u32, + lwe_dimension.0 as u32, + ks_level.0 as u32, + ks_base_log.0 as u32, + 0 as u32, + 0 as u32, + 0 as u32, + num_blocks, + message_modulus.0 as u32, + carry_modulus.0 as u32, + PBSType::Classical as u32, + lwe_per_glwe, + storage_log_modulus, + CompressionType::Compress as u32, + true, + ); + + cuda_compression_compress_integer_radix_ciphertext_64( + streams.ptr.as_ptr(), + streams.gpu_indexes.as_ptr(), + streams.len() as u32, + glwe_array_out.as_mut_c_ptr(0), + lwe_array_in.as_c_ptr(0), + fp_keyswitch_key.ptr.as_ptr(), + num_blocks, + mem_ptr, + ); + + cleanup_cuda_compression_integer_radix_ciphertext_64( + streams.ptr.as_ptr(), + streams.gpu_indexes.as_ptr(), + streams.len() as u32, + std::ptr::addr_of_mut!(mem_ptr), + ); +} + +#[allow(clippy::too_many_arguments)] +/// # Safety +/// +/// - [CudaStreams::synchronize] __must__ be called after this function as soon as synchronization +/// is required +pub unsafe fn unchecked_compression_decompress_integer_radix_async< + T: UnsignedInteger, + B: Numeric, +>( + streams: &CudaStreams, + lwe_out: &mut CudaVec, + glwe_array_in: &CudaVec, + bootstrapping_key: &CudaVec, + message_modulus: MessageModulus, + carry_modulus: CarryModulus, + glwe_dimension: GlweDimension, + polynomial_size: PolynomialSize, + lwe_dimension: LweDimension, + pbs_base_log: DecompositionBaseLog, + pbs_level: DecompositionLevelCount, + lwe_per_glwe: u32, + storage_log_modulus: u32, + num_blocks: u32, + grouping_factor: LweBskGroupingFactor, +) { + assert_eq!( + streams.gpu_indexes[0], + glwe_array_in.gpu_index(0), + "GPU error: all data should reside on the same GPU." + ); + assert_eq!( + streams.gpu_indexes[0], + bootstrapping_key.gpu_index(0), + "GPU error: all data should reside on the same GPU." + ); + let mut mem_ptr: *mut i8 = std::ptr::null_mut(); + scratch_cuda_compression_integer_radix_ciphertext_64( + streams.ptr.as_ptr(), + streams.gpu_indexes.as_ptr(), + streams.len() as u32, + std::ptr::addr_of_mut!(mem_ptr), + glwe_dimension.0 as u32, + polynomial_size.0 as u32, + lwe_dimension.0 as u32, + 0 as u32, + 0 as u32, + pbs_level.0 as u32, + pbs_base_log.0 as u32, + grouping_factor.0 as u32, + num_blocks, + message_modulus.0 as u32, + carry_modulus.0 as u32, + PBSType::Classical as u32, + lwe_per_glwe, + storage_log_modulus, + CompressionType::Decompress as u32, + true, + ); + + cuda_compression_decompress_integer_radix_ciphertext_64( + streams.ptr.as_ptr(), + streams.gpu_indexes.as_ptr(), + streams.len() as u32, + lwe_out.as_mut_c_ptr(0), + glwe_array_in.as_c_ptr(0), + bootstrapping_key.ptr.as_ptr(), + mem_ptr, + ); + + cleanup_cuda_compression_integer_radix_ciphertext_64( + streams.ptr.as_ptr(), + streams.gpu_indexes.as_ptr(), + streams.len() as u32, + std::ptr::addr_of_mut!(mem_ptr), + ); +} + #[allow(clippy::too_many_arguments)] /// # Safety ///