Skip to content
This repository has been archived by the owner on Aug 23, 2023. It is now read-only.

Commit

Permalink
[src] Fix leaking CudaFst object's device memory (kaldi-asr#4548)
Browse files Browse the repository at this point in the history
BatchedThreadedNnet3CudaOnlinePipeline never freed a CudaFst object
it initialized. Freeing it caused an error returned from cudaFree
(with our allocator disabled).

The root cause of this problem was copying of the CudaFst in
CudaDecoder's constructor. The code has been changed so that
CudaDecoder only stores a reference to CudaFst, and the object
itself is uniquely owned by the pipeline.

This allowed folding of separate CudaFst::Initialize and
CudaFst::Finalize methods into its respective constructor and
destructor.

There is also a proof-of-concept unique_device_ptr, a specialization
of std::unique_ptr for device-allocated memory. The type is declared
privately in, and only used by CudaFst. The Finalize method had been
removed entirely, and the memory is freed in the object's destructor,
as the members are destroyed.
  • Loading branch information
kkm000 authored Jun 2, 2021
1 parent 2d57eb9 commit 497e284
Show file tree
Hide file tree
Showing 8 changed files with 190 additions and 166 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -107,9 +107,8 @@ void BatchedThreadedNnet3CudaOnlinePipeline::AllocateAndInitializeData(
feature_pipelines_.resize(config_.num_channels);
}

// Decoder
cuda_fst_ = std::make_shared<CudaFst>();
cuda_fst_->Initialize(decode_fst, trans_model_);
// Decoder.
cuda_fst_ = std::make_unique<CudaFst>(decode_fst, trans_model_);
cuda_decoder_.reset(new CudaDecoder(*cuda_fst_, config_.decoder_opts,
max_batch_size_, config_.num_channels));
if (config_.num_decoder_copy_threads > 0) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -418,7 +418,7 @@ class BatchedThreadedNnet3CudaOnlinePipeline {
// HCLG graph : CudaFst object is a host object, but contains
// data stored in
// GPU memory
std::shared_ptr<CudaFst> cuda_fst_;
std::unique_ptr<CudaFst> cuda_fst_;
std::unique_ptr<CudaDecoder> cuda_decoder_;

std::unique_ptr<ThreadPoolLight> thread_pool_;
Expand Down
13 changes: 9 additions & 4 deletions src/cudadecoder/batched-threaded-nnet3-cuda-pipeline.cc
Original file line number Diff line number Diff line change
Expand Up @@ -23,8 +23,13 @@
#define SLEEP_BACKOFF_S ((double)SLEEP_BACKOFF_NS / 1e9)

#include "cudadecoder/batched-threaded-nnet3-cuda-pipeline.h"

#include <memory>

#include <nvToolsExt.h>

#include "base/kaldi-utils.h"
#include "cudadecoder/cuda-fst.h"

// This pipeline is deprecated and will be removed. Please switch to
// batched-threaded-nnet3-cuda-pipeline2
Expand All @@ -45,7 +50,7 @@ void BatchedThreadedNnet3CudaPipeline::Initialize(

am_nnet_ = &am_nnet;
trans_model_ = &trans_model;
cuda_fst_.Initialize(decode_fst, trans_model_);
cuda_fst_ = std::make_unique<CudaFst>(decode_fst, trans_model_);

feature_info_ = new OnlineNnet2FeaturePipelineInfo(config_.feature_opts);
feature_info_->ivector_extractor_info.use_most_recent_ivector = true;
Expand Down Expand Up @@ -93,7 +98,7 @@ void BatchedThreadedNnet3CudaPipeline::Finalize() {
thread_contexts_[i].join();
}

cuda_fst_.Finalize();
cuda_fst_.reset();

delete feature_info_;
delete work_pool_;
Expand Down Expand Up @@ -819,7 +824,7 @@ void BatchedThreadedNnet3CudaPipeline::ExecuteWorker(int threadId) {
<< " num_channels=" << config_.num_channels;
// Data structures that are reusable across decodes but unique to each
// thread
CudaDecoder cuda_decoder(cuda_fst_, config_.decoder_opts,
CudaDecoder cuda_decoder(*cuda_fst_, config_.decoder_opts,
config_.max_batch_size, config_.num_channels);
nnet3::NnetBatchComputer computer(config_.compute_opts, am_nnet_->GetNnet(),
am_nnet_->Priors());
Expand Down Expand Up @@ -916,7 +921,7 @@ void BatchedThreadedNnet3CudaPipeline::ExecuteWorker(int threadId) {
// outs, and cleans up data structures
PostDecodeProcessing(cuda_decoder, channel_state, decodables, tasks);

} catch (CudaDecoderException e) {
} catch (CudaDecoderException &e) {
// Code to catch errors. Most errors are
// unrecoverable but a user can mark them
// recoverable which will cancel the entire
Expand Down
3 changes: 2 additions & 1 deletion src/cudadecoder/batched-threaded-nnet3-cuda-pipeline.h
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
#define KALDI_CUDA_DECODER_BATCHED_THREADED_NNET3_CUDA_PIPELINE_H_

#include <atomic>
#include <memory>
#include <thread>

#include "cudadecoder/cuda-decoder.h"
Expand Down Expand Up @@ -364,7 +365,7 @@ class [[deprecated]] BatchedThreadedNnet3CudaPipeline {

BatchedThreadedNnet3CudaPipelineConfig config_;

CudaFst cuda_fst_;
std::unique_ptr<CudaFst> cuda_fst_;
const TransitionModel *trans_model_;
const nnet3::AmNnetSimple *am_nnet_;
nnet3::DecodableNnetSimpleLoopedInfo *decodable_info_;
Expand Down
35 changes: 16 additions & 19 deletions src/cudadecoder/cuda-decoder.cc
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@ namespace cuda_decoder {

CudaDecoder::CudaDecoder(const CudaFst &fst, const CudaDecoderConfig &config,
int32 nlanes, int32 nchannels)
: word_syms_(NULL),
: word_syms_(nullptr),
generate_partial_hypotheses_(false),
endpointing_(false),
partial_traceback_(false),
Expand All @@ -49,7 +49,7 @@ CudaDecoder::CudaDecoder(const CudaFst &fst, const CudaDecoderConfig &config,
nchannels_(nchannels),
channel_lock_(nchannels + 1),
extra_cost_min_delta_(0.0f),
thread_pool_(NULL),
thread_pool_(nullptr),
n_threads_used_(0),
n_h2h_task_not_done_(0),
n_init_decoding_h2h_task_not_done_(0),
Expand All @@ -58,8 +58,8 @@ CudaDecoder::CudaDecoder(const CudaFst &fst, const CudaDecoderConfig &config,
// Static asserts on constants
CheckStaticAsserts();
// Runtime asserts
KALDI_ASSERT(nlanes > 0);
KALDI_ASSERT(nchannels > 0);
KALDI_ASSERT(nlanes_ > 0);
KALDI_ASSERT(nchannels_ > 0);
KALDI_ASSERT(nlanes_ <= nchannels_);
// All GPU work in decoder will be sent to compute_st_
cudaStreamCreate(&compute_st_);
Expand Down Expand Up @@ -244,8 +244,8 @@ void CudaDecoder::InitDeviceData() {
void CudaDecoder::InitHostData() {}

void CudaDecoder::AllocateDeviceKernelParams() {
h_device_params_ = new DeviceParams();
h_kernel_params_ = new KernelParams();
h_device_params_ = std::make_unique<DeviceParams>();
h_kernel_params_ = std::make_unique<KernelParams>();
}

void CudaDecoder::InitDeviceParams() {
Expand Down Expand Up @@ -281,12 +281,12 @@ void CudaDecoder::InitDeviceParams() {
h_device_params_->d_main_q_arc_offsets = d_main_q_arc_offsets_.GetView();
h_device_params_->d_hashmap_values = d_hashmap_values_.GetView();
h_device_params_->d_histograms = d_histograms_.GetView();
h_device_params_->d_arc_e_offsets = fst_.d_e_offsets_;
h_device_params_->d_arc_ne_offsets = fst_.d_ne_offsets_;
h_device_params_->d_arc_pdf_ilabels = fst_.d_arc_pdf_ilabels_;
h_device_params_->d_arc_weights = fst_.d_arc_weights_;
h_device_params_->d_arc_nextstates = fst_.d_arc_nextstates_;
h_device_params_->d_fst_final_costs = fst_.d_final_;
h_device_params_->d_arc_e_offsets = fst_.d_e_offsets_.get();
h_device_params_->d_arc_ne_offsets = fst_.d_ne_offsets_.get();
h_device_params_->d_arc_pdf_ilabels = fst_.d_arc_pdf_ilabels_.get();
h_device_params_->d_arc_weights = fst_.d_arc_weights_.get();
h_device_params_->d_arc_nextstates = fst_.d_arc_nextstates_.get();
h_device_params_->d_fst_final_costs = fst_.d_final_.get();
h_device_params_->default_beam = default_beam_;
h_device_params_->lattice_beam = lattice_beam_;
h_device_params_->main_q_capacity = main_q_capacity_;
Expand All @@ -296,7 +296,7 @@ void CudaDecoder::InitDeviceParams() {
h_device_params_->nstates = fst_.num_states_;
h_device_params_->init_state = fst_.Start();
KALDI_ASSERT(h_device_params_->init_state != fst::kNoStateId);
h_device_params_->init_cost = StdWeight::One().Value();
h_device_params_->init_cost = CudaFst::Weight::One().Value();
h_device_params_->hashmap_capacity = hashmap_capacity_;
h_device_params_->max_active = max_active_;
// For the first static_beam_q_length elements of the queue, we will
Expand All @@ -318,7 +318,7 @@ void CudaDecoder::InitDeviceParams() {
// Those cannot be used at the same time
h_device_params_->h_list_final_tokens_in_main_q =
h_list_final_tokens_in_main_q_.GetView();
h_device_params_->fst_zero = StdWeight::Zero().Value();
h_device_params_->fst_zero = CudaFst::Weight::Zero().Value();
}

CudaDecoder::~CudaDecoder() noexcept(false) {
Expand Down Expand Up @@ -350,9 +350,6 @@ CudaDecoder::~CudaDecoder() noexcept(false) {
KALDI_DECODER_CUDA_API_CHECK_ERROR(
cudaEventDestroy(concatenated_data_ready_evt_));
KALDI_DECODER_CUDA_API_CHECK_ERROR(cudaEventDestroy(lane_offsets_ready_evt_));

delete h_kernel_params_;
delete h_device_params_;
}

void CudaDecoder::ComputeInitialChannel() {
Expand Down Expand Up @@ -1072,7 +1069,7 @@ void CudaDecoder::GetBestPredecessor(int32 ichannel, int32 curr_token_idx,
int32 offset, size;
std::tie(offset, size) = token.GetSameFSTStateTokensList();
bool found_best = false;
for (auto i = 0; i < size; ++i) {
for (int32 i = 0; i < size; ++i) {
KALDI_ASSERT(
(offset + i) <
h_all_tokens_extra_prev_tokens_extra_and_acoustic_cost_[ichannel]
Expand Down Expand Up @@ -1243,7 +1240,7 @@ void CudaDecoder::AddFinalTokensToLattice(
const CostType best_cost = h_all_argmin_cost_[ichannel].second;
// Iterating through tokens associated with a final state in the last
// frame
for (auto &p : h_all_final_tokens_list_[ichannel]) {
for (const auto &p : h_all_final_tokens_list_[ichannel]) {
// This final token has a final cost of final_token_cost
CostType final_token_cost = p.second;
// This token has possibly an extra cost compared to the best
Expand Down
20 changes: 13 additions & 7 deletions src/cudadecoder/cuda-decoder.h
Original file line number Diff line number Diff line change
Expand Up @@ -197,12 +197,15 @@ class CudaDecoder {
// Using nlanes=2500 in that configuration would first not be possible
// (out of memory), but also not necessary. Increasing the number of
// lanes is only useful if it increases performance. If the GPU is
// saturated at nlanes=200, you should not increase that number
// saturated at nlanes=200, you should not increase that number.
//
///\param[in] fst A CudaFst instance. Not owned, must survive this object.
///\param[in] config
///\param[in] nlanes
///\param[in] nchannels
CudaDecoder(const CudaFst &fst, const CudaDecoderConfig &config, int32 nlanes,
int32 nchannels);

// Reads the config from config
void ReadConfig(const CudaDecoderConfig &config);
// Special constructor for nlanes = nchannels. Here for the non-advanced
// user Here we can consider nchannels = batch size. If we want to
// decode 10 utterances at a time, we can use nchannels = 10
Expand All @@ -211,6 +214,10 @@ class CudaDecoder {
: CudaDecoder(fst, config, nchannels, nchannels) {}
virtual ~CudaDecoder() noexcept(false);

KALDI_DISALLOW_COPY_AND_ASSIGN(CudaDecoder);

// Reads the config from config
void ReadConfig(const CudaDecoderConfig &config);
// InitDecoding initializes the decoding, and should only be used if you
// intend to call AdvanceDecoding() on the channels listed in channels
void InitDecoding(const std::vector<ChannelId> &channels);
Expand Down Expand Up @@ -511,7 +518,7 @@ class CudaDecoder {

// The CudaFst data structure contains the FST graph
// in the CSR format, on both the GPU and CPU memory
const CudaFst fst_;
const CudaFst& fst_;

// Counters used by a decoder lane
// Contains all the single values generated during computation,
Expand Down Expand Up @@ -698,8 +705,8 @@ class CudaDecoder {
// i.e. memory address of the main_q for instance
// KernelParams contains information that can change.
// For instance which channel is executing on which lane
DeviceParams *h_device_params_;
KernelParams *h_kernel_params_;
std::unique_ptr<DeviceParams> h_device_params_;
std::unique_ptr<KernelParams> h_kernel_params_;
std::vector<ChannelId> channel_to_compute_;
int32 nlanes_used_; // number of lanes used in h_kernel_params_
// Initial lane
Expand Down Expand Up @@ -930,7 +937,6 @@ class CudaDecoder {
std::unordered_map<LatticeStateInternalId, RawLatticeState>
*prev_f_raw_lattice_state,
std::unordered_set<int32> *f_arc_idx_added);
KALDI_DISALLOW_COPY_AND_ASSIGN(CudaDecoder);
};

} // end namespace cuda_decoder
Expand Down
Loading

0 comments on commit 497e284

Please sign in to comment.