small updates for libritts

yerfor · May 16, 2022 · 7bbc7c0 · 7bbc7c0
1 parent d4f300e
commit 7bbc7c0
Show file tree

Hide file tree

Showing 5 changed files with 16 additions and 5 deletions.
diff --git a/README.md b/README.md
@@ -79,7 +79,7 @@ CUDA_VISIBLE_DEVICES=0 python tasks/run.py --config egs/tts/biaobei/synta.yaml -
 
 Audio samples in the paper can be found in our [demo page](https://syntaspeech.github.io/).
 
-We also provide [HuggingFace Demo Page](https://huggingface.co/spaces/NATSpeech/PortaSpeech) for LJSpeech. Try your interesting sentences there!
+We also provide a [HuggingFace Demo Page](https://huggingface.co/spaces/yerfor/SyntaSpeech) for LJSpeech. Try your own sentences there!
 
 ## Citation
 

diff --git a/egs/datasets/audio/libritts/base_text2mel.yaml b/egs/datasets/audio/libritts/base_text2mel.yaml
@@ -8,7 +8,7 @@ binarization_args:
   train_range: [ 871, -1 ]
   test_range: [ 0, 523 ]
   valid_range: [ 523, 871 ]
-  shuffle: false
+  shuffle: true
   with_spk_id: true
   with_spk_embed: false
 test_ids: [ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,

diff --git a/egs/datasets/audio/lj/base_text2mel.yaml b/egs/datasets/audio/lj/base_text2mel.yaml
@@ -14,4 +14,5 @@ test_ids: [ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,
             316, 324, 402, 422, 485, 500, 505, 508, 509, 519 ]
 f0_min: 80
 f0_max: 600
-vocoder_ckpt: checkpoints/hifi_lj
+vocoder_ckpt: checkpoints/hifi_lj
+num_valid_plots: 30
diff --git a/modules/tts/syntaspeech/syntaspeech.py b/modules/tts/syntaspeech/syntaspeech.py
@@ -120,6 +120,7 @@ def forward(self, txt_tokens, word_tokens, ph2word, word_len, mel2word=None, mel
         style_embed = self.forward_style_embed(spk_embed, spk_id) # speaker embedding, [B, 1, C]
         x, tgt_nonpadding = self.run_text_encoder(
             txt_tokens, word_tokens, ph2word, word_len, mel2word, mel2ph, style_embed, ret, graph_lst=graph_lst, etypes_lst=etypes_lst)
+        x = x + style_embed # this may be necessary to achieve multi-speaker synthesis
         x = x * tgt_nonpadding
         ret['nonpadding'] = tgt_nonpadding
         if self.hparams['use_pitch_embed']:

diff --git a/tasks/tts/dataset_utils.py b/tasks/tts/dataset_utils.py
@@ -30,8 +30,17 @@ def __init__(self, prefix, shuffle=False, items=None, data_dir=None):
                 self.avail_idxs = list(range(len(self.sizes)))
             if prefix == 'train' and hparams['min_frames'] > 0:
                 self.avail_idxs = [x for x in self.avail_idxs if self.sizes[x] >= hparams['min_frames']]
-            self.sizes = [self.sizes[i] for i in self.avail_idxs]
-
+            try:
+                self.sizes = [self.sizes[i] for i in self.avail_idxs]
+            except:
+                tmp_sizes = []
+                for i in self.avail_idxs:
+                    try:
+                        tmp_sizes.append(self.sizes[i])
+                    except:
+                        continue
+                self.sizes = tmp_sizes
+
     def _get_item(self, index):
         if hasattr(self, 'avail_idxs') and self.avail_idxs is not None:
             index = self.avail_idxs[index]