[src] Fix leaking CudaFst object's device memory (kaldi-asr#4548)

BatchedThreadedNnet3CudaOnlinePipeline never freed a CudaFst object it initialized. Freeing it caused an error returned from cudaFree (with our allocator disabled). The root cause of this problem was copying of the CudaFst in CudaDecoder's constructor. The code has been changed so that CudaDecoder only stores a reference to CudaFst, and the object itself is uniquely owned by the pipeline. This allowed folding of separate CudaFst::Initialize and CudaFst::Finalize methods into its respective constructor and destructor. There is also a proof-of-concept unique_device_ptr, a specialization of std::unique_ptr for device-allocated memory. The type is declared privately in, and only used by CudaFst. The Finalize method had been removed entirely, and the memory is freed in the object's destructor, as the members are destroyed.
goodatlas · Jun 2, 2021 · 497e284 · 497e284
1 parent 2d57eb9
commit 497e284
Show file tree

Hide file tree

Showing 8 changed files with 190 additions and 166 deletions.
diff --git a/src/cudadecoder/batched-threaded-nnet3-cuda-online-pipeline.cc b/src/cudadecoder/batched-threaded-nnet3-cuda-online-pipeline.cc
@@ -107,9 +107,8 @@ void BatchedThreadedNnet3CudaOnlinePipeline::AllocateAndInitializeData(
     feature_pipelines_.resize(config_.num_channels);
   }
 
-  // Decoder
-  cuda_fst_ = std::make_shared<CudaFst>();
-  cuda_fst_->Initialize(decode_fst, trans_model_);
+  // Decoder.
+  cuda_fst_ = std::make_unique<CudaFst>(decode_fst, trans_model_);
   cuda_decoder_.reset(new CudaDecoder(*cuda_fst_, config_.decoder_opts,
                                       max_batch_size_, config_.num_channels));
   if (config_.num_decoder_copy_threads > 0) {

diff --git a/src/cudadecoder/batched-threaded-nnet3-cuda-online-pipeline.h b/src/cudadecoder/batched-threaded-nnet3-cuda-online-pipeline.h
@@ -418,7 +418,7 @@ class BatchedThreadedNnet3CudaOnlinePipeline {
   // HCLG graph : CudaFst object is a host object, but contains
   // data stored in
   // GPU memory
-  std::shared_ptr<CudaFst> cuda_fst_;
+  std::unique_ptr<CudaFst> cuda_fst_;
   std::unique_ptr<CudaDecoder> cuda_decoder_;
 
   std::unique_ptr<ThreadPoolLight> thread_pool_;

diff --git a/src/cudadecoder/batched-threaded-nnet3-cuda-pipeline.cc b/src/cudadecoder/batched-threaded-nnet3-cuda-pipeline.cc
@@ -23,8 +23,13 @@
 #define SLEEP_BACKOFF_S ((double)SLEEP_BACKOFF_NS / 1e9)
 
 #include "cudadecoder/batched-threaded-nnet3-cuda-pipeline.h"
+
+#include <memory>
+
 #include <nvToolsExt.h>
+
 #include "base/kaldi-utils.h"
+#include "cudadecoder/cuda-fst.h"
 
 // This pipeline is deprecated and will be removed. Please switch to
 // batched-threaded-nnet3-cuda-pipeline2
@@ -45,7 +50,7 @@ void BatchedThreadedNnet3CudaPipeline::Initialize(
 
   am_nnet_ = &am_nnet;
   trans_model_ = &trans_model;
-  cuda_fst_.Initialize(decode_fst, trans_model_);
+  cuda_fst_ = std::make_unique<CudaFst>(decode_fst, trans_model_);
 
   feature_info_ = new OnlineNnet2FeaturePipelineInfo(config_.feature_opts);
   feature_info_->ivector_extractor_info.use_most_recent_ivector = true;
@@ -93,7 +98,7 @@ void BatchedThreadedNnet3CudaPipeline::Finalize() {
     thread_contexts_[i].join();
   }
 
-  cuda_fst_.Finalize();
+  cuda_fst_.reset();
 
   delete feature_info_;
   delete work_pool_;
@@ -819,7 +824,7 @@ void BatchedThreadedNnet3CudaPipeline::ExecuteWorker(int threadId) {
             << " num_channels=" << config_.num_channels;
   // Data structures that are reusable across decodes but unique to each
   // thread
-  CudaDecoder cuda_decoder(cuda_fst_, config_.decoder_opts,
+  CudaDecoder cuda_decoder(*cuda_fst_, config_.decoder_opts,
                            config_.max_batch_size, config_.num_channels);
   nnet3::NnetBatchComputer computer(config_.compute_opts, am_nnet_->GetNnet(),
                                     am_nnet_->Priors());
@@ -916,7 +921,7 @@ void BatchedThreadedNnet3CudaPipeline::ExecuteWorker(int threadId) {
         // outs, and cleans up data structures
         PostDecodeProcessing(cuda_decoder, channel_state, decodables, tasks);
 
-      } catch (CudaDecoderException e) {
+      } catch (CudaDecoderException &e) {
         // Code to catch errors.  Most errors are
         // unrecoverable but a user can mark them
         // recoverable which will cancel the entire

diff --git a/src/cudadecoder/batched-threaded-nnet3-cuda-pipeline.h b/src/cudadecoder/batched-threaded-nnet3-cuda-pipeline.h
@@ -19,6 +19,7 @@
 #define KALDI_CUDA_DECODER_BATCHED_THREADED_NNET3_CUDA_PIPELINE_H_
 
 #include <atomic>
+#include <memory>
 #include <thread>
 
 #include "cudadecoder/cuda-decoder.h"
@@ -364,7 +365,7 @@ class [[deprecated]] BatchedThreadedNnet3CudaPipeline {
 
   BatchedThreadedNnet3CudaPipelineConfig config_;
 
-  CudaFst cuda_fst_;
+  std::unique_ptr<CudaFst> cuda_fst_;
   const TransitionModel *trans_model_;
   const nnet3::AmNnetSimple *am_nnet_;
   nnet3::DecodableNnetSimpleLoopedInfo *decodable_info_;

diff --git a/src/cudadecoder/cuda-decoder.cc b/src/cudadecoder/cuda-decoder.cc
@@ -39,7 +39,7 @@ namespace cuda_decoder {
 
 CudaDecoder::CudaDecoder(const CudaFst &fst, const CudaDecoderConfig &config,
                          int32 nlanes, int32 nchannels)
-    : word_syms_(NULL),
+    : word_syms_(nullptr),
       generate_partial_hypotheses_(false),
       endpointing_(false),
       partial_traceback_(false),
@@ -49,7 +49,7 @@ CudaDecoder::CudaDecoder(const CudaFst &fst, const CudaDecoderConfig &config,
       nchannels_(nchannels),
       channel_lock_(nchannels + 1),
       extra_cost_min_delta_(0.0f),
-      thread_pool_(NULL),
+      thread_pool_(nullptr),
       n_threads_used_(0),
       n_h2h_task_not_done_(0),
       n_init_decoding_h2h_task_not_done_(0),
@@ -58,8 +58,8 @@ CudaDecoder::CudaDecoder(const CudaFst &fst, const CudaDecoderConfig &config,
   // Static asserts on constants
   CheckStaticAsserts();
   // Runtime asserts
-  KALDI_ASSERT(nlanes > 0);
-  KALDI_ASSERT(nchannels > 0);
+  KALDI_ASSERT(nlanes_ > 0);
+  KALDI_ASSERT(nchannels_ > 0);
   KALDI_ASSERT(nlanes_ <= nchannels_);
   // All GPU work in decoder will be sent to compute_st_
   cudaStreamCreate(&compute_st_);
@@ -244,8 +244,8 @@ void CudaDecoder::InitDeviceData() {
 void CudaDecoder::InitHostData() {}
 
 void CudaDecoder::AllocateDeviceKernelParams() {
-  h_device_params_ = new DeviceParams();
-  h_kernel_params_ = new KernelParams();
+  h_device_params_ = std::make_unique<DeviceParams>();
+  h_kernel_params_ = std::make_unique<KernelParams>();
 }
 
 void CudaDecoder::InitDeviceParams() {
@@ -281,12 +281,12 @@ void CudaDecoder::InitDeviceParams() {
   h_device_params_->d_main_q_arc_offsets = d_main_q_arc_offsets_.GetView();
   h_device_params_->d_hashmap_values = d_hashmap_values_.GetView();
   h_device_params_->d_histograms = d_histograms_.GetView();
-  h_device_params_->d_arc_e_offsets = fst_.d_e_offsets_;
-  h_device_params_->d_arc_ne_offsets = fst_.d_ne_offsets_;
-  h_device_params_->d_arc_pdf_ilabels = fst_.d_arc_pdf_ilabels_;
-  h_device_params_->d_arc_weights = fst_.d_arc_weights_;
-  h_device_params_->d_arc_nextstates = fst_.d_arc_nextstates_;
-  h_device_params_->d_fst_final_costs = fst_.d_final_;
+  h_device_params_->d_arc_e_offsets = fst_.d_e_offsets_.get();
+  h_device_params_->d_arc_ne_offsets = fst_.d_ne_offsets_.get();
+  h_device_params_->d_arc_pdf_ilabels = fst_.d_arc_pdf_ilabels_.get();
+  h_device_params_->d_arc_weights = fst_.d_arc_weights_.get();
+  h_device_params_->d_arc_nextstates = fst_.d_arc_nextstates_.get();
+  h_device_params_->d_fst_final_costs = fst_.d_final_.get();
   h_device_params_->default_beam = default_beam_;
   h_device_params_->lattice_beam = lattice_beam_;
   h_device_params_->main_q_capacity = main_q_capacity_;
@@ -296,7 +296,7 @@ void CudaDecoder::InitDeviceParams() {
   h_device_params_->nstates = fst_.num_states_;
   h_device_params_->init_state = fst_.Start();
   KALDI_ASSERT(h_device_params_->init_state != fst::kNoStateId);
-  h_device_params_->init_cost = StdWeight::One().Value();
+  h_device_params_->init_cost = CudaFst::Weight::One().Value();
   h_device_params_->hashmap_capacity = hashmap_capacity_;
   h_device_params_->max_active = max_active_;
   // For the first static_beam_q_length elements of the queue, we will
@@ -318,7 +318,7 @@ void CudaDecoder::InitDeviceParams() {
   // Those cannot be used at the same time
   h_device_params_->h_list_final_tokens_in_main_q =
       h_list_final_tokens_in_main_q_.GetView();
-  h_device_params_->fst_zero = StdWeight::Zero().Value();
+  h_device_params_->fst_zero = CudaFst::Weight::Zero().Value();
 }
 
 CudaDecoder::~CudaDecoder() noexcept(false) {
@@ -350,9 +350,6 @@ CudaDecoder::~CudaDecoder() noexcept(false) {
   KALDI_DECODER_CUDA_API_CHECK_ERROR(
       cudaEventDestroy(concatenated_data_ready_evt_));
   KALDI_DECODER_CUDA_API_CHECK_ERROR(cudaEventDestroy(lane_offsets_ready_evt_));
-
-  delete h_kernel_params_;
-  delete h_device_params_;
 }
 
 void CudaDecoder::ComputeInitialChannel() {
@@ -1072,7 +1069,7 @@ void CudaDecoder::GetBestPredecessor(int32 ichannel, int32 curr_token_idx,
     int32 offset, size;
     std::tie(offset, size) = token.GetSameFSTStateTokensList();
     bool found_best = false;
-    for (auto i = 0; i < size; ++i) {
+    for (int32 i = 0; i < size; ++i) {
       KALDI_ASSERT(
           (offset + i) <
           h_all_tokens_extra_prev_tokens_extra_and_acoustic_cost_[ichannel]
@@ -1243,7 +1240,7 @@ void CudaDecoder::AddFinalTokensToLattice(
   const CostType best_cost = h_all_argmin_cost_[ichannel].second;
   // Iterating through tokens associated with a final state in the last
   // frame
-  for (auto &p : h_all_final_tokens_list_[ichannel]) {
+  for (const auto &p : h_all_final_tokens_list_[ichannel]) {
     // This final token has a final cost of final_token_cost
     CostType final_token_cost = p.second;
     // This token has possibly an extra cost compared to the best

diff --git a/src/cudadecoder/cuda-decoder.h b/src/cudadecoder/cuda-decoder.h
@@ -197,12 +197,15 @@ class CudaDecoder {
   // Using nlanes=2500 in that configuration would first not be possible
   // (out of memory), but also not necessary. Increasing the number of
   // lanes is only useful if it increases performance. If the GPU is
-  // saturated at nlanes=200, you should not increase that number
+  // saturated at nlanes=200, you should not increase that number.
+  //
+  ///\param[in] fst A CudaFst instance. Not owned, must survive this object.
+  ///\param[in] config
+  ///\param[in] nlanes
+  ///\param[in] nchannels
   CudaDecoder(const CudaFst &fst, const CudaDecoderConfig &config, int32 nlanes,
               int32 nchannels);
 
-  // Reads the config from config
-  void ReadConfig(const CudaDecoderConfig &config);
   // Special constructor for nlanes = nchannels. Here for the non-advanced
   // user Here we can consider nchannels = batch size. If we want to
   // decode 10 utterances at a time, we can use nchannels = 10
@@ -211,6 +214,10 @@ class CudaDecoder {
       : CudaDecoder(fst, config, nchannels, nchannels) {}
   virtual ~CudaDecoder() noexcept(false);
 
+  KALDI_DISALLOW_COPY_AND_ASSIGN(CudaDecoder);
+
+  // Reads the config from config
+  void ReadConfig(const CudaDecoderConfig &config);
   // InitDecoding initializes the decoding, and should only be used if you
   // intend to call AdvanceDecoding() on the channels listed in channels
   void InitDecoding(const std::vector<ChannelId> &channels);
@@ -511,7 +518,7 @@ class CudaDecoder {
 
   // The CudaFst data structure contains the FST graph
   // in the CSR format, on both the GPU and CPU memory
-  const CudaFst fst_;
+  const CudaFst& fst_;
 
   // Counters used by a decoder lane
   // Contains all the single values generated during computation,
@@ -698,8 +705,8 @@ class CudaDecoder {
   // i.e. memory address of the main_q for instance
   // KernelParams contains information that can change.
   // For instance which channel is executing on which lane
-  DeviceParams *h_device_params_;
-  KernelParams *h_kernel_params_;
+  std::unique_ptr<DeviceParams> h_device_params_;
+  std::unique_ptr<KernelParams> h_kernel_params_;
   std::vector<ChannelId> channel_to_compute_;
   int32 nlanes_used_;  // number of lanes used in h_kernel_params_
   // Initial lane
@@ -930,7 +937,6 @@ class CudaDecoder {
       std::unordered_map<LatticeStateInternalId, RawLatticeState>
           *prev_f_raw_lattice_state,
       std::unordered_set<int32> *f_arc_idx_added);
-  KALDI_DISALLOW_COPY_AND_ASSIGN(CudaDecoder);
 };
 
 }  // end namespace cuda_decoder