Skip to content

Commit

Permalink
chore(gpu): start using a struct to pass data across rust/c++
Browse files Browse the repository at this point in the history
  • Loading branch information
agnesLeroy committed Jan 20, 2025
1 parent cc85c44 commit 01a1ff1
Show file tree
Hide file tree
Showing 48 changed files with 1,152 additions and 925 deletions.
3 changes: 3 additions & 0 deletions backends/tfhe-cuda-backend/cuda/include/device.h
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,9 @@ void cuda_destroy_stream(cudaStream_t stream, uint32_t gpu_index);

void cuda_synchronize_stream(cudaStream_t stream, uint32_t gpu_index);

void synchronize_streams(cudaStream_t const *streams,
uint32_t const *gpu_indexes, uint32_t gpu_count);

uint32_t cuda_is_available();

void *cuda_malloc(uint64_t size, uint32_t gpu_index);
Expand Down
21 changes: 16 additions & 5 deletions backends/tfhe-cuda-backend/cuda/include/integer/integer.h
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,15 @@ enum SIGNED_OPERATION { ADDITION = 1, SUBTRACTION = -1 };
enum outputFlag { FLAG_NONE = 0, FLAG_OVERFLOW = 1, FLAG_CARRY = 2 };

extern "C" {

typedef struct {
void *ptr;
uint64_t *degrees;
uint64_t *noise_levels;
uint32_t num_radix_blocks;
uint32_t lwe_dimension;
} CudaRadixCiphertextFFI;

void scratch_cuda_apply_univariate_lut_kb_64(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
int8_t **mem_ptr, void const *input_lut, uint32_t lwe_dimension,
Expand Down Expand Up @@ -258,9 +267,11 @@ void scratch_cuda_integer_radix_cmux_kb_64(

void cuda_cmux_integer_radix_ciphertext_kb_64(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
void *lwe_array_out, void const *lwe_condition, void const *lwe_array_true,
void const *lwe_array_false, int8_t *mem_ptr, void *const *bsks,
void *const *ksks, uint32_t lwe_ciphertext_count);
CudaRadixCiphertextFFI *lwe_array_out,
CudaRadixCiphertextFFI const *lwe_condition,
CudaRadixCiphertextFFI const *lwe_array_true,
CudaRadixCiphertextFFI const *lwe_array_false, int8_t *mem_ptr,
void *const *bsks, void *const *ksks);

void cleanup_cuda_integer_radix_cmux(void *const *streams,
uint32_t const *gpu_indexes,
Expand Down Expand Up @@ -439,8 +450,8 @@ void scratch_cuda_integer_abs_inplace_radix_ciphertext_kb_64(

void cuda_integer_abs_inplace_radix_ciphertext_kb_64(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
void *ct, int8_t *mem_ptr, bool is_signed, void *const *bsks,
void *const *ksks, uint32_t num_blocks);
CudaRadixCiphertextFFI *ct, int8_t *mem_ptr, bool is_signed,
void *const *bsks, void *const *ksks);

void cleanup_cuda_integer_abs_inplace(void *const *streams,
uint32_t const *gpu_indexes,
Expand Down
47 changes: 26 additions & 21 deletions backends/tfhe-cuda-backend/cuda/include/integer/integer_utilities.h
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
#include "integer.h"
#include "keyswitch.h"
#include "pbs/programmable_bootstrap.cuh"
#include "radix_ciphertext.h"
#include <cassert>
#include <cmath>
#include <functional>
Expand Down Expand Up @@ -2963,9 +2964,9 @@ template <typename Torus> struct int_cmux_buffer {
int_radix_lut<Torus> *predicate_lut;
int_radix_lut<Torus> *message_extract_lut;

Torus *buffer_in;
Torus *buffer_out;
Torus *condition_array;
CudaRadixCiphertextFFI *buffer_in = new CudaRadixCiphertextFFI;
CudaRadixCiphertextFFI *buffer_out = new CudaRadixCiphertextFFI;
CudaRadixCiphertextFFI *condition_array = new CudaRadixCiphertextFFI;

int_radix_params params;

Expand All @@ -2978,15 +2979,18 @@ template <typename Torus> struct int_cmux_buffer {
this->params = params;

if (allocate_gpu_memory) {
Torus big_size =
(params.big_lwe_dimension + 1) * num_radix_blocks * sizeof(Torus);

buffer_in =
(Torus *)cuda_malloc_async(2 * big_size, streams[0], gpu_indexes[0]);
buffer_out =
(Torus *)cuda_malloc_async(2 * big_size, streams[0], gpu_indexes[0]);
condition_array =
(Torus *)cuda_malloc_async(2 * big_size, streams[0], gpu_indexes[0]);
printf("Here\n");
create_trivial_radix_ciphertext_async<Torus>(
streams[0], gpu_indexes[0], buffer_in, 2 * num_radix_blocks,
params.big_lwe_dimension);
printf("Here\n");
create_trivial_radix_ciphertext_async<Torus>(
streams[0], gpu_indexes[0], buffer_out, 2 * num_radix_blocks,
params.big_lwe_dimension);
printf("Here\n");
create_trivial_radix_ciphertext_async<Torus>(
streams[0], gpu_indexes[0], condition_array, 2 * num_radix_blocks,
params.big_lwe_dimension);

auto lut_f = [predicate_lut_f](Torus block, Torus condition) -> Torus {
return predicate_lut_f(condition) ? 0 : block;
Expand Down Expand Up @@ -3047,9 +3051,12 @@ template <typename Torus> struct int_cmux_buffer {
message_extract_lut->release(streams, gpu_indexes, gpu_count);
delete message_extract_lut;

cuda_drop_async(buffer_in, streams[0], gpu_indexes[0]);
cuda_drop_async(buffer_out, streams[0], gpu_indexes[0]);
cuda_drop_async(condition_array, streams[0], gpu_indexes[0]);
release_radix_ciphertext_data(streams[0], gpu_indexes[0], buffer_in);
delete buffer_in;
release_radix_ciphertext_data(streams[0], gpu_indexes[0], buffer_out);
delete buffer_out;
release_radix_ciphertext_data(streams[0], gpu_indexes[0], condition_array);
delete condition_array;
}
};

Expand Down Expand Up @@ -4351,7 +4358,7 @@ template <typename Torus> struct int_abs_buffer {
int_sc_prop_memory<Torus> *scp_mem;
int_bitop_buffer<Torus> *bitxor_mem;

Torus *mask;
CudaRadixCiphertextFFI *mask;
int_abs_buffer(cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count, int_radix_params params,
uint32_t num_radix_blocks, bool allocate_gpu_memory) {
Expand All @@ -4372,11 +4379,9 @@ template <typename Torus> struct int_abs_buffer {
streams, gpu_indexes, gpu_count, BITOP_TYPE::BITXOR, params,
num_radix_blocks, allocate_gpu_memory);

uint32_t lwe_size = params.big_lwe_dimension + 1;
uint32_t lwe_size_bytes = lwe_size * sizeof(Torus);

mask = (Torus *)cuda_malloc_async(num_radix_blocks * lwe_size_bytes,
streams[0], gpu_indexes[0]);
create_trivial_radix_ciphertext_async<Torus>(streams[0], gpu_indexes[0],
mask, num_radix_blocks,
params.big_lwe_dimension);
}
}

Expand Down
31 changes: 31 additions & 0 deletions backends/tfhe-cuda-backend/cuda/include/integer/radix_ciphertext.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
#ifndef CUDA_RADIX_CIPHERTEXT_H
#define CUDA_RADIX_CIPHERTEXT_H

#include "device.h"
#include "integer.h"

template <typename Torus>
void create_trivial_radix_ciphertext_async(cudaStream_t const stream,
uint32_t const gpu_index,
CudaRadixCiphertextFFI *output_radix,
uint32_t num_radix_blocks,
uint32_t lwe_dimension);

// end_lwe_index is inclusive
template <typename Torus>
void as_radix_ciphertext_slice(CudaRadixCiphertextFFI *output_radix,
const CudaRadixCiphertextFFI *input_radix,
uint32_t start_lwe_index,
uint32_t end_lwe_index);

template <typename Torus>
void copy_radix_ciphertext_to_larger_output_slice_async(
cudaStream_t const stream, uint32_t const gpu_index,
CudaRadixCiphertextFFI *output_radix,
const CudaRadixCiphertextFFI *input_radix, uint32_t output_start_lwe_index);

void release_radix_ciphertext_data(cudaStream_t const stream,
uint32_t const gpu_index,
CudaRadixCiphertextFFI *data);

#endif // CUDA_RADIX_CIPHERTEXT_H
16 changes: 7 additions & 9 deletions backends/tfhe-cuda-backend/cuda/include/linear_algebra.h
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
#ifndef CUDA_LINALG_H_
#define CUDA_LINALG_H_

#include "integer/integer.h"
#include <stdint.h>

extern "C" {
Expand All @@ -14,16 +15,13 @@ void cuda_negate_lwe_ciphertext_vector_64(
void const *lwe_array_in, const uint32_t input_lwe_dimension,
const uint32_t input_lwe_ciphertext_count);
void cuda_add_lwe_ciphertext_vector_32(
void *stream, uint32_t gpu_index, void *lwe_array_out,
void const *lwe_array_in_1, void const *lwe_array_in_2,
const uint32_t input_lwe_dimension,
const uint32_t input_lwe_ciphertext_count);
void *stream, uint32_t gpu_index, CudaRadixCiphertextFFI *lwe_array_out,
CudaRadixCiphertextFFI const *lwe_array_in_1,
CudaRadixCiphertextFFI const *lwe_array_in_2);
void cuda_add_lwe_ciphertext_vector_64(
void *stream, uint32_t gpu_index, void *lwe_array_out,
void const *lwe_array_in_1, void const *lwe_array_in_2,
const uint32_t input_lwe_dimension,
const uint32_t input_lwe_ciphertext_count);

void *stream, uint32_t gpu_index, CudaRadixCiphertextFFI *lwe_array_out,
CudaRadixCiphertextFFI const *lwe_array_in_1,
CudaRadixCiphertextFFI const *lwe_array_in_2);
void cuda_add_lwe_ciphertext_vector_plaintext_vector_32(
void *stream, uint32_t gpu_index, void *lwe_array_out,
void const *lwe_array_in, void const *plaintext_array_in,
Expand Down
7 changes: 7 additions & 0 deletions backends/tfhe-cuda-backend/cuda/src/device.cu
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,13 @@ void cuda_synchronize_stream(cudaStream_t stream, uint32_t gpu_index) {
check_cuda_error(cudaStreamSynchronize(stream));
}

void synchronize_streams(cudaStream_t const *streams,
uint32_t const *gpu_indexes, uint32_t gpu_count) {
for (uint i = 0; i < gpu_count; i++) {
cuda_synchronize_stream(streams[i], gpu_indexes[i]);
}
}

// Determine if a CUDA device is available at runtime
uint32_t cuda_is_available() { return cudaSetDevice(0) == cudaSuccess; }

Expand Down
9 changes: 4 additions & 5 deletions backends/tfhe-cuda-backend/cuda/src/integer/abs.cu
Original file line number Diff line number Diff line change
Expand Up @@ -22,15 +22,14 @@ void scratch_cuda_integer_abs_inplace_radix_ciphertext_kb_64(

void cuda_integer_abs_inplace_radix_ciphertext_kb_64(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
void *ct, int8_t *mem_ptr, bool is_signed, void *const *bsks,
void *const *ksks, uint32_t num_blocks) {
CudaRadixCiphertextFFI *ct, int8_t *mem_ptr, bool is_signed,
void *const *bsks, void *const *ksks) {

auto mem = (int_abs_buffer<uint64_t> *)mem_ptr;

host_integer_abs_kb<uint64_t>((cudaStream_t *)(streams), gpu_indexes,
gpu_count, static_cast<uint64_t *>(ct), bsks,
(uint64_t **)(ksks), mem, is_signed,
num_blocks);
gpu_count, ct, bsks, (uint64_t **)(ksks), mem,
is_signed);
}

void cleanup_cuda_integer_abs_inplace(void *const *streams,
Expand Down
60 changes: 49 additions & 11 deletions backends/tfhe-cuda-backend/cuda/src/integer/abs.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
#include "integer/integer.cuh"
#include "integer/integer_utilities.h"
#include "integer/negation.cuh"
#include "integer/radix_ciphertext.h"
#include "integer/scalar_shifts.cuh"
#include "linear_algebra.h"
#include "pbs/programmable_bootstrap.h"
Expand All @@ -32,16 +33,15 @@ __host__ void scratch_cuda_integer_abs_kb(
}

template <typename Torus>
__host__ void
host_integer_abs_kb(cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count, Torus *ct, void *const *bsks,
uint64_t *const *ksks, int_abs_buffer<uint64_t> *mem_ptr,
bool is_signed, uint32_t num_blocks) {
__host__ void legacy_host_integer_abs_kb_async(
cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count, Torus *ct, void *const *bsks, uint64_t *const *ksks,
int_abs_buffer<uint64_t> *mem_ptr, bool is_signed, uint32_t num_blocks) {
if (!is_signed)
return;

auto radix_params = mem_ptr->params;
auto mask = mem_ptr->mask;
auto mask = (Torus *)(mem_ptr->mask->ptr);

auto big_lwe_dimension = radix_params.big_lwe_dimension;
auto big_lwe_size = big_lwe_dimension + 1;
Expand All @@ -52,20 +52,58 @@ host_integer_abs_kb(cudaStream_t const *streams, uint32_t const *gpu_indexes,
cuda_memcpy_async_gpu_to_gpu(mask, ct, num_blocks * big_lwe_size_bytes,
streams[0], gpu_indexes[0]);

host_integer_radix_arithmetic_scalar_shift_kb_inplace(
host_integer_radix_arithmetic_scalar_shift_kb_inplace<Torus>(
streams, gpu_indexes, gpu_count, mask, num_bits_in_ciphertext - 1,
mem_ptr->arithmetic_scalar_shift_mem, bsks, ksks, num_blocks);
host_addition<Torus>(streams[0], gpu_indexes[0], ct, mask, ct,
radix_params.big_lwe_dimension, num_blocks);
legacy_host_addition<Torus>(streams[0], gpu_indexes[0], ct, mask, ct,
radix_params.big_lwe_dimension, num_blocks);

uint32_t requested_flag = outputFlag::FLAG_NONE;
uint32_t uses_carry = 0;
host_propagate_single_carry<Torus>(
streams, gpu_indexes, gpu_count, ct, nullptr, nullptr, mem_ptr->scp_mem,
bsks, ksks, num_blocks, requested_flag, uses_carry);

host_integer_radix_bitop_kb(streams, gpu_indexes, gpu_count, ct, mask, ct,
mem_ptr->bitxor_mem, bsks, ksks, num_blocks);
host_integer_radix_bitop_kb<Torus>(streams, gpu_indexes, gpu_count, ct, mask,
ct, mem_ptr->bitxor_mem, bsks, ksks,
num_blocks);
}

template <typename Torus>
__host__ void
host_integer_abs_kb(cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count, CudaRadixCiphertextFFI *ct,
void *const *bsks, uint64_t *const *ksks,
int_abs_buffer<uint64_t> *mem_ptr, bool is_signed) {
if (!is_signed)
return;

auto mask = mem_ptr->mask;

uint32_t num_bits_in_ciphertext =
(31 - __builtin_clz(mem_ptr->params.message_modulus)) *
ct->num_radix_blocks;

copy_radix_ciphertext_to_larger_output_slice_async<Torus>(
streams[0], gpu_indexes[0], mask, ct, 0);

host_integer_radix_arithmetic_scalar_shift_kb_inplace<Torus>(
streams, gpu_indexes, gpu_count, (Torus *)(mask->ptr),
num_bits_in_ciphertext - 1, mem_ptr->arithmetic_scalar_shift_mem, bsks,
ksks, ct->num_radix_blocks);
host_addition<Torus>(streams[0], gpu_indexes[0], ct, mask, ct);

uint32_t requested_flag = outputFlag::FLAG_NONE;
uint32_t uses_carry = 0;
host_propagate_single_carry<Torus>(
streams, gpu_indexes, gpu_count, (Torus *)(ct->ptr), nullptr, nullptr,
mem_ptr->scp_mem, bsks, ksks, ct->num_radix_blocks, requested_flag,
uses_carry);

host_integer_radix_bitop_kb<Torus>(streams, gpu_indexes, gpu_count,
(Torus *)(ct->ptr), (Torus *)(mask->ptr),
(Torus *)(ct->ptr), mem_ptr->bitxor_mem,
bsks, ksks, ct->num_radix_blocks);
}

#endif // TFHE_RS_ABS_CUH
19 changes: 8 additions & 11 deletions backends/tfhe-cuda-backend/cuda/src/integer/cmux.cu
Original file line number Diff line number Diff line change
Expand Up @@ -25,19 +25,16 @@ void scratch_cuda_integer_radix_cmux_kb_64(

void cuda_cmux_integer_radix_ciphertext_kb_64(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
void *lwe_array_out, void const *lwe_condition, void const *lwe_array_true,
void const *lwe_array_false, int8_t *mem_ptr, void *const *bsks,
void *const *ksks, uint32_t lwe_ciphertext_count) {
CudaRadixCiphertextFFI *lwe_array_out,
CudaRadixCiphertextFFI const *lwe_condition,
CudaRadixCiphertextFFI const *lwe_array_true,
CudaRadixCiphertextFFI const *lwe_array_false, int8_t *mem_ptr,
void *const *bsks, void *const *ksks) {

host_integer_radix_cmux_kb<uint64_t>(
(cudaStream_t *)(streams), gpu_indexes, gpu_count,
static_cast<uint64_t *>(lwe_array_out),
static_cast<const uint64_t *>(lwe_condition),
static_cast<const uint64_t *>(lwe_array_true),
static_cast<const uint64_t *>(lwe_array_false),
(int_cmux_buffer<uint64_t> *)mem_ptr, bsks, (uint64_t **)(ksks),

lwe_ciphertext_count);
(cudaStream_t *)(streams), gpu_indexes, gpu_count, lwe_array_out,
lwe_condition, lwe_array_true, lwe_array_false,
(int_cmux_buffer<uint64_t> *)mem_ptr, bsks, (uint64_t **)(ksks));
}

void cleanup_cuda_integer_radix_cmux(void *const *streams,
Expand Down
Loading

0 comments on commit 01a1ff1

Please sign in to comment.