Skip to content

Commit

Permalink
.
Browse files Browse the repository at this point in the history
  • Loading branch information
pdroalves committed Jul 25, 2024
1 parent ef33376 commit a4b5851
Show file tree
Hide file tree
Showing 33 changed files with 1,209 additions and 354 deletions.
16 changes: 16 additions & 0 deletions backends/tfhe-cuda-backend/cuda/include/functions.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
#ifndef CUDA_FUNCTIONS_H_
#define CUDA_FUNCTIONS_H_

#include <cstdint>
#include "polynomial/parameters.cuh"
#include "polynomial/functions.cuh"

extern "C" {
void cuda_glwe_sample_extract_64(
void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, void *lwe_array_out,
void *glwe_array_in, uint32_t *nth_array, uint32_t num_samples, uint32_t glwe_dimension,
uint32_t
polynomial_size);
}

#endif
59 changes: 59 additions & 0 deletions backends/tfhe-cuda-backend/cuda/include/integer.h
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,11 @@ enum COMPARISON_TYPE {
MIN = 7,
};

enum COMPRESSION_MODE {
COMPRESS = 0,
DECOMPRESS = 1,
};

enum CMP_ORDERING { IS_INFERIOR = 0, IS_EQUAL = 1, IS_SUPERIOR = 2 };

enum SIGNED_OPERATION { ADDITION = 1, SUBTRACTION = -1 };
Expand Down Expand Up @@ -203,6 +208,29 @@ void cuda_scalar_comparison_integer_radix_ciphertext_kb_64(
void cleanup_cuda_integer_comparison(void **streams, uint32_t *gpu_indexes,
uint32_t gpu_count, int8_t **mem_ptr_void);

void scratch_cuda_compression_integer_radix_ciphertext_64(
void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, int8_t **mem_ptr,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t lwe_dimension,
uint32_t ks_level, uint32_t ks_base_log, uint32_t pbs_level,
uint32_t pbs_base_log, uint32_t grouping_factor, uint32_t num_lwes,
uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
uint32_t lwe_per_glwe, uint32_t storage_log_modulus, COMPRESSION_MODE mode,
bool allocate_gpu_memory);

void cuda_compression_compress_integer_radix_ciphertext_64(
void **streams, uint32_t *gpu_indexes, uint32_t gpu_count,
void *glwe_array_out, void *lwe_array_in, void **fp_ksk, uint32_t num_lwes,
int8_t *mem_ptr);

void cuda_compression_decompress_integer_radix_ciphertext_64(
void **streams, uint32_t *gpu_indexes, uint32_t gpu_count,
void *glwe_array_out, void *lwe_array_in, void **fp_ksk, uint32_t num_lwes,
int8_t *mem_ptr);

void cleanup_cuda_compression_integer_radix_ciphertext_64(
void **streams, uint32_t *gpu_indexes, uint32_t gpu_count,
int8_t **mem_ptr_void);

void scratch_cuda_integer_radix_bitop_kb_64(
void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, int8_t **mem_ptr,
uint32_t glwe_dimension, uint32_t polynomial_size,
Expand Down Expand Up @@ -792,6 +820,37 @@ template <typename Torus> struct int_radix_lut {
}
};

template <typename Torus> struct int_compression {
COMPRESSION_MODE mode;
int_radix_params params;
uint32_t storage_log_modulus;
uint32_t lwe_per_glwe;

Torus *tmp_lwe_shifted;

int_compression(cudaStream_t *streams, uint32_t *gpu_indexes,
uint32_t gpu_count, int_radix_params params,
uint32_t num_radix_blocks, uint32_t lwe_per_glwe,
uint32_t storage_log_modulus, COMPRESSION_MODE mode,
bool allocate_gpu_memory) {
this->mode = mode;
this->params = params;
this->lwe_per_glwe = lwe_per_glwe;
this->storage_log_modulus = storage_log_modulus;

if (allocate_gpu_memory) {
int glwe_accumulator_size =
(params.glwe_dimension + 1) * params.polynomial_size;
tmp_lwe_shifted = (Torus *)cuda_malloc_async(
num_radix_blocks * (params.big_lwe_dimension + 1), streams[0],
gpu_indexes[0]);
}
}
void release(cudaStream_t *streams, uint32_t *gpu_indexes,
uint32_t gpu_count) {
cuda_drop_async(tmp_lwe_shifted, streams[0], gpu_indexes[0]);
}
};
template <typename Torus> struct int_bit_extract_luts_buffer {
int_radix_params params;
int_radix_lut<Torus> *lut;
Expand Down
1 change: 0 additions & 1 deletion backends/tfhe-cuda-backend/cuda/include/keyswitch.h
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,6 @@ void cuda_fp_keyswitch_lwe_list_to_glwe_64(
void *fp_ksk_array, uint32_t input_lwe_dimension,
uint32_t output_glwe_dimension, uint32_t output_polynomial_size,
uint32_t base_log, uint32_t level_count, uint32_t num_lwes);
uint32_t base_log, uint32_t level_count, uint32_t num_samples);
}

#endif // CNCRT_KS_H_
14 changes: 0 additions & 14 deletions backends/tfhe-cuda-backend/cuda/src/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -1,17 +1,3 @@
set(SOURCES
${CMAKE_SOURCE_DIR}/${INCLUDE_DIR}/bit_extraction.h
${CMAKE_SOURCE_DIR}/${INCLUDE_DIR}/bitwise_ops.h
${CMAKE_SOURCE_DIR}/${INCLUDE_DIR}/bootstrap.h
${CMAKE_SOURCE_DIR}/${INCLUDE_DIR}/bootstrap_multibit.h
${CMAKE_SOURCE_DIR}/${INCLUDE_DIR}/ciphertext.h
${CMAKE_SOURCE_DIR}/${INCLUDE_DIR}/circuit_bootstrap.h
${CMAKE_SOURCE_DIR}/${INCLUDE_DIR}/device.h
${CMAKE_SOURCE_DIR}/${INCLUDE_DIR}/integer.h
${CMAKE_SOURCE_DIR}/${INCLUDE_DIR}/keyswitch.h
${CMAKE_SOURCE_DIR}/${INCLUDE_DIR}/linear_algebra.h
${CMAKE_SOURCE_DIR}/${INCLUDE_DIR}/shifts.h
${CMAKE_SOURCE_DIR}/${INCLUDE_DIR}/vertical_packing.h
${CMAKE_SOURCE_DIR}/${INCLUDE_DIR}/helper_multi_gpu.h)
file(GLOB_RECURSE SOURCES "*.cu")
add_library(tfhe_cuda_backend STATIC ${SOURCES})
set_target_properties(tfhe_cuda_backend PROPERTIES CUDA_SEPARABLE_COMPILATION ON CUDA_RESOLVE_DEVICE_SYMBOLS ON)
Expand Down
4 changes: 2 additions & 2 deletions backends/tfhe-cuda-backend/cuda/src/crypto/keyswitch.cu
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ void cuda_keyswitch_lwe_ciphertext_vector_32(
void *lwe_output_indexes, void *lwe_array_in, void *lwe_input_indexes,
void *ksk, uint32_t lwe_dimension_in, uint32_t lwe_dimension_out,
uint32_t base_log, uint32_t level_count, uint32_t num_samples) {
cuda_keyswitch_lwe_ciphertext_vector(
host_keyswitch_lwe_ciphertext_vector(
static_cast<cudaStream_t>(stream), gpu_index,
static_cast<uint32_t *>(lwe_array_out),
static_cast<uint32_t *>(lwe_output_indexes),
Expand Down Expand Up @@ -40,7 +40,7 @@ void cuda_keyswitch_lwe_ciphertext_vector_64(
void *lwe_output_indexes, void *lwe_array_in, void *lwe_input_indexes,
void *ksk, uint32_t lwe_dimension_in, uint32_t lwe_dimension_out,
uint32_t base_log, uint32_t level_count, uint32_t num_samples) {
cuda_keyswitch_lwe_ciphertext_vector(
host_keyswitch_lwe_ciphertext_vector(
static_cast<cudaStream_t>(stream), gpu_index,
static_cast<uint64_t *>(lwe_array_out),
static_cast<uint64_t *>(lwe_output_indexes),
Expand Down
2 changes: 1 addition & 1 deletion backends/tfhe-cuda-backend/cuda/src/crypto/keyswitch.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -147,7 +147,7 @@ void execute_keyswitch_async(cudaStream_t *streams, uint32_t *gpu_indexes,
GET_VARIANT_ELEMENT(lwe_input_indexes, i);

// Compute Keyswitch
cuda_keyswitch_lwe_ciphertext_vector<Torus>(
host_keyswitch_lwe_ciphertext_vector<Torus>(
streams[i], gpu_indexes[i], current_lwe_array_out,
current_lwe_output_indexes, current_lwe_array_in,
current_lwe_input_indexes, ksks[i], lwe_dimension_in, lwe_dimension_out,
Expand Down
32 changes: 28 additions & 4 deletions backends/tfhe-cuda-backend/cuda/src/crypto/torus.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,9 @@
#define CNCRT_TORUS_CUH

#include "types/int128.cuh"
#include "utils/kernel_dimensions.cuh"
#include <limits>
#include "device.h"

template <typename T>
__device__ inline void typecast_double_to_torus(double x, T &r) {
Expand Down Expand Up @@ -39,19 +41,41 @@ __device__ inline T round_to_closest_multiple(T x, uint32_t base_log,
}

template <typename T>
__device__ __forceinline__ void modulus_switch(T input, T &output,
uint32_t log_modulus) {
__device__ __forceinline__ void apply_modulus_switch(T input, T &output,
uint32_t log_modulus) {
constexpr uint32_t BITS = sizeof(T) * 8;

output = input + (((T)1) << (BITS - log_modulus - 1));
output >>= (BITS - log_modulus);
}

template <typename T>
__device__ __forceinline__ T modulus_switch(T input, uint32_t log_modulus) {
__device__ __forceinline__ T apply_modulus_switch(T input,
uint32_t log_modulus) {
T output;
modulus_switch(input, output, log_modulus);
apply_modulus_switch(input, output, log_modulus);
return output;
}

template <typename Torus>
__global__ void apply_modulus_switch_inplace(Torus *array, int size,
uint32_t log_modulus) {
const int tid = threadIdx.x + blockIdx.x * blockDim.x;
if (tid < size) {
array[tid] = apply_modulus_switch(array[tid], log_modulus);
}
}

template <typename Torus>
__host__ void modulus_switch(cudaStream_t stream, uint32_t gpu_index,
Torus *array, int size, uint32_t log_modulus) {

int num_threads = 0, num_blocks = 0;
getNumBlocksAndThreads(size, 1024, num_blocks, num_threads);

apply_modulus_switch_inplace<<<num_blocks, num_threads, 0, stream>>>(
array, size, log_modulus);
check_cuda_error(cudaGetLastError());
}

#endif // CNCRT_TORUS_H
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
#include "compression.cuh"

void scratch_cuda_compression_integer_radix_ciphertext_64(
void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, int8_t **mem_ptr,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t lwe_dimension,
uint32_t ks_level, uint32_t ks_base_log, uint32_t pbs_level,
uint32_t pbs_base_log, uint32_t grouping_factor, uint32_t num_lwes,
uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
uint32_t lwe_per_glwe, uint32_t storage_log_modulus, COMPRESSION_MODE mode,
bool allocate_gpu_memory) {

int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
glwe_dimension * polynomial_size, lwe_dimension,
ks_level, ks_base_log, pbs_level, pbs_base_log,
grouping_factor, message_modulus, carry_modulus);

scratch_cuda_compression_integer_radix_ciphertext_64(
(cudaStream_t *)(streams), gpu_indexes, gpu_count,
(int_compression<uint64_t> **)mem_ptr, num_lwes, params, lwe_per_glwe,
storage_log_modulus, mode, allocate_gpu_memory);
}
void cuda_compression_compress_integer_radix_ciphertext_64(
void **streams, uint32_t *gpu_indexes, uint32_t gpu_count,
void *glwe_array_out, void *lwe_array_in, void **fp_ksk, uint32_t num_lwes,
int8_t *mem_ptr) {

host_integer_compression_compress<uint64_t>(
(cudaStream_t *)(streams), gpu_indexes, gpu_count,
static_cast<uint64_t *>(glwe_array_out),
static_cast<uint64_t *>(lwe_array_in), (uint64_t **)(fp_ksk), num_lwes,
(int_compression<uint64_t> *)mem_ptr);
}
void cuda_compression_decompress_integer_radix_ciphertext_64(
void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, void *lwe_out,
void *glwe_array_in, void **bsks, int8_t *mem_ptr) {

host_integer_compression_decompress<uint64_t>(
(cudaStream_t *)(streams), gpu_indexes, gpu_count,
static_cast<uint64_t *>(lwe_out),
static_cast<uint64_t *>(glwe_array_in),bsks,
(int_compression<uint64_t> *)mem_ptr);
}

void cleanup_cuda_compression_integer_radix_ciphertext_64(
void **streams, uint32_t *gpu_indexes, uint32_t gpu_count,
int8_t **mem_ptr_void) {

int_compression<uint64_t> *mem_ptr =
(int_compression<uint64_t> *)(*mem_ptr_void);
mem_ptr->release((cudaStream_t *)(streams), gpu_indexes, gpu_count);
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,68 @@
#ifndef CUDA_INTEGER_COMPRESSION_CUH
#define CUDA_INTEGER_COMPRESSION_CUH

#include "crypto/keyswitch.cuh"
#include "device.h"
#include "integer.h"
#include "integer/integer.cuh"
#include "linearalgebra/multiplication.cuh"
#include "polynomial/functions.cuh"
#include "utils/kernel_dimensions.cuh"

template <typename Torus>
__host__ void host_integer_compression_compress(
cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
Torus *glwe_array_out, Torus *lwe_array_in, Torus **fp_ksk,
uint32_t num_lwes, int_compression<Torus> *mem_ptr) {
auto params = mem_ptr->params;

// Shift
auto lwe_shifted = mem_ptr->tmp_lwe_shifted;
host_cleartext_multiplication(streams[0], gpu_indexes[0], lwe_shifted,
lwe_array_in, (uint64_t)params.message_modulus,
params.big_lwe_dimension, num_lwes);

uint32_t lwe_in_size = params.big_lwe_dimension + 1;
uint32_t glwe_out_size = (params.glwe_dimension + 1) * params.polynomial_size;
uint32_t num_glwes = num_lwes / mem_ptr->lwe_per_glwe;

// Keyswitch LWEs to GLWE
for (int i = 0; i < num_glwes; i++) {
auto lwe_subset = lwe_shifted + i * lwe_in_size;
auto glwe_out = glwe_array_out + i * glwe_out_size;

host_fp_keyswitch_lwe_list_to_glwe(
streams[0], gpu_indexes[0], glwe_out, lwe_subset, fp_ksk[0],
params.big_lwe_dimension, params.glwe_dimension, params.polynomial_size,
params.ks_base_log, params.ks_level, mem_ptr->lwe_per_glwe);
}

// Modulus switch
int num_blocks = 0, num_threads = 0;
getNumBlocksAndThreads(glwe_out_size, 512, num_blocks, num_threads);
apply_modulus_switch_inplace<<<num_blocks, num_threads, 0, streams[0]>>>(
glwe_array_out, num_glwes * glwe_out_size, mem_ptr->storage_log_modulus);
}


template <typename Torus>
__host__ void host_integer_compression_decompress(
cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
Torus *lwe_out, Torus *glwe_array_in, void **bsks,
int_compression<Torus> *mem_ptr) {

}

template <typename Torus>
__host__ void scratch_cuda_compression_integer_radix_ciphertext_64(
cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
int_compression<Torus> **mem_ptr, uint32_t num_lwes,
int_radix_params params, uint32_t lwe_per_glwe,
uint32_t storage_log_modulus, COMPRESSION_MODE mode,
bool allocate_gpu_memory) {

*mem_ptr = new int_compression<Torus>(
streams, gpu_indexes, gpu_count, params, num_lwes, lwe_per_glwe,
storage_log_modulus, mode, allocate_gpu_memory);
}
#endif
24 changes: 12 additions & 12 deletions backends/tfhe-cuda-backend/cuda/src/linearalgebra/multiplication.cu
Original file line number Diff line number Diff line change
Expand Up @@ -9,12 +9,12 @@ void cuda_mult_lwe_ciphertext_vector_cleartext_vector_32(
void *cleartext_array_in, uint32_t input_lwe_dimension,
uint32_t input_lwe_ciphertext_count) {

host_cleartext_multiplication(static_cast<cudaStream_t>(stream), gpu_index,
static_cast<uint32_t *>(lwe_array_out),
static_cast<uint32_t *>(lwe_array_in),
static_cast<uint32_t *>(cleartext_array_in),
input_lwe_dimension,
input_lwe_ciphertext_count);
host_cleartext_vec_multiplication(
static_cast<cudaStream_t>(stream), gpu_index,
static_cast<uint32_t *>(lwe_array_out),
static_cast<uint32_t *>(lwe_array_in),
static_cast<uint32_t *>(cleartext_array_in), input_lwe_dimension,
input_lwe_ciphertext_count);
}
/*
* Perform the multiplication of a u64 input LWE ciphertext vector with a u64
Expand Down Expand Up @@ -49,10 +49,10 @@ void cuda_mult_lwe_ciphertext_vector_cleartext_vector_64(
void *cleartext_array_in, uint32_t input_lwe_dimension,
uint32_t input_lwe_ciphertext_count) {

host_cleartext_multiplication(static_cast<cudaStream_t>(stream), gpu_index,
static_cast<uint64_t *>(lwe_array_out),
static_cast<uint64_t *>(lwe_array_in),
static_cast<uint64_t *>(cleartext_array_in),
input_lwe_dimension,
input_lwe_ciphertext_count);
host_cleartext_vec_multiplication(
static_cast<cudaStream_t>(stream), gpu_index,
static_cast<uint64_t *>(lwe_array_out),
static_cast<uint64_t *>(lwe_array_in),
static_cast<uint64_t *>(cleartext_array_in), input_lwe_dimension,
input_lwe_ciphertext_count);
}
Loading

0 comments on commit a4b5851

Please sign in to comment.