# config.yaml


# Model ids for the vocoder (voc) and the TTS model (tts).
voc_model_id: 'ljspeech_voc'
tts_model_id: 'ljspeech_tts'

# Output data path.
data_path: '/content/content/dataset_folder'

# Selects the TTS model. Choices: [forward_tacotron, fast_pitch]
tts_model: 'forward_tacotron'


# Signal-processing settings.
dsp:
  sample_rate: 22050
  n_fft: 1024
  num_mels: 80
  hop_length: 256
  win_length: 1024
  fmin: 0
  fmax: 8000

  # Normalise to the peak of each wav file.
  peak_norm: false
  # Whether to trim leading and trailing silence.
  trim_start_end_silence: true
  # Threshold in decibels below reference to consider as silence when
  # trimming start and end silences with librosa (no trimming if really high).
  trim_silence_top_db: 60
  # Maximum value for pitch frequency, used to remove outliers (common pitch
  # range is about 60-300).
  pitch_max_freq: 600
  # Whether to reduce long silences using the WebRTC Voice Activity Detector.
  trim_long_silences: false
  vad_window_length: 30  # in milliseconds
  vad_moving_average_width: 8
  vad_max_silence_length: 12
  vad_sample_rate: 16000

  # vocoder
  # Choices: [RAW, MOL] - MOL needs longer training and yields better quality.
  voc_mode: 'RAW'
  bits: 9  # bit depth of signal
  # Recommended to suppress noise if using raw bits in voc_mode above.
  mu_law: true


# Dataset preprocessing settings.
preprocessing:
  seed: 42
  n_val: 200
  language: 'ca'
  # Choices: ['english_cleaners', 'no_cleaners']. 'english_cleaners' expands
  # numbers and abbreviations, an English-specific step that does not match
  # language 'ca', so the other choice is selected here.
  # NOTE(review): presumably 'no_cleaners' leaves the text untouched before
  # phonemization - confirm the corpus needs no further normalisation.
  cleaner_name: 'no_cleaners'
  # Whether to phonemize the text; if set to false, you have to provide the
  # phonemized text yourself.
  use_phonemes: true
  min_text_len: 2
  # Slower but much more robust than simply counting attention peaks.
  extract_durations_with_dijkstra: true


# Settings for the tacotron model and its training.
tacotron:
  model:
    embed_dims: 256
    encoder_dims: 128
    decoder_dims: 256
    postnet_dims: 128
    encoder_k: 16
    lstm_dims: 512
    postnet_k: 8
    num_highways: 4
    dropout: 0.5
    # Value below which audio generation ends.
    stop_threshold: -11

  training:
    # Progressive training schedule. Each entry is a single comma-separated
    # string of (r, lr, step, batch_size).
    schedule:
      - '5,  1e-3,  10_000,  32'
      - '3,   1e-4,  20_000,  16'
      - '2,   1e-4,  30_000,  8'
      - '1,   1e-4,  40_000,  8'

    # If you have a couple of extremely long spectrograms you might want to
    # use this.
    max_mel_len: 1250
    # Clips the gradient norm to prevent explosion. The upstream hint is
    # "set to None if not needed".
    # NOTE(review): a bare None in YAML loads as the string 'None'; the YAML
    # null value is `null` - confirm which one the trainer expects.
    clip_grad_norm: 1.0
    checkpoint_every: 4000  # checkpoints the model every x steps
    plot_every: 1000  # generates samples and plots every x steps


# Settings for the forward_tacotron model and its training.
forward_tacotron:
  model:
    embed_dims: 256  # embedding dimension for main model
    series_embed_dims: 64  # embedding dimension for series predictor

    durpred_conv_dims: 256
    durpred_rnn_dims: 64
    durpred_dropout: 0.5

    pitch_conv_dims: 256
    pitch_rnn_dims: 128
    pitch_dropout: 0.5
    pitch_strength: 1.0  # set to 0 if you want no pitch conditioning

    energy_conv_dims: 256
    energy_rnn_dims: 64
    energy_dropout: 0.5
    energy_strength: 1.0  # set to 0 if you want no energy conditioning

    prenet_dims: 256
    prenet_k: 16
    prenet_dropout: 0.5
    prenet_num_highways: 4

    rnn_dims: 512

    postnet_dims: 256
    postnet_k: 8
    postnet_num_highways: 4
    postnet_dropout: 0.0

  training:
    # Progressive training schedule. Each entry is a single comma-separated
    # string of (lr, step, batch_size).
    schedule:
      - '5e-5,  150_000,  32'
      - '1e-5,  300_000,  32'
    dur_loss_factor: 0.1
    pitch_loss_factor: 0.1
    energy_loss_factor: 0.1
    pitch_zoneout: 0.0  # zoneout may regularize conditioning on pitch
    energy_zoneout: 0.0  # zoneout may regularize conditioning on energy

    max_mel_len: 1250
    # Clips the gradient norm to prevent explosion. The upstream hint is
    # "set to None if not needed".
    # NOTE(review): a bare None in YAML loads as the string 'None'; the YAML
    # null value is `null` - confirm which one the trainer expects.
    clip_grad_norm: 1.0
    # Checkpoints the model every x steps. Written without a `_` digit
    # separator so that YAML 1.2 loaders also resolve it to an integer.
    checkpoint_every: 5000
    plot_every: 1000  # generates samples and plots every x steps

    # Whether to filter data with bad attention scores.
    filter_attention: true
    # Filter data with bad attention sharpness score, if 0 then no filter.
    min_attention_sharpness: 0.5
    # Filter data with bad attention alignment score, if 0 then no filter.
    min_attention_alignment: 0.75


# Settings for the fast_pitch model and its training.
fast_pitch:
  model:
    durpred_d_model: 128
    durpred_n_heads: 2
    durpred_layers: 4
    durpred_d_fft: 128
    durpred_dropout: 0.5

    pitch_d_model: 128
    pitch_n_heads: 2
    pitch_layers: 4
    pitch_d_fft: 128
    pitch_dropout: 0.5
    pitch_strength: 1.0

    energy_d_model: 128
    energy_n_heads: 2
    energy_layers: 4
    energy_d_fft: 128
    energy_dropout: 0.5
    energy_strength: 1.0

    d_model: 256
    conv1_kernel: 9
    conv2_kernel: 1

    prenet_layers: 4
    prenet_heads: 2
    prenet_fft: 1024
    prenet_dropout: 0.1

    postnet_layers: 4
    postnet_heads: 2
    postnet_fft: 1024
    postnet_dropout: 0.1

  training:
    # Progressive training schedule. Each entry is a single comma-separated
    # string of (lr, step, batch_size).
    schedule:
      - '1e-5,  5_000,  32'
      - '5e-5,  100_000,  32'
      - '2e-5,  300_000,  32'
    dur_loss_factor: 0.1
    pitch_loss_factor: 0.1
    energy_loss_factor: 0.1
    pitch_zoneout: 0.0  # zoneout may regularize conditioning on pitch
    energy_zoneout: 0.0  # zoneout may regularize conditioning on energy

    max_mel_len: 1250
    # Clips the gradient norm to prevent explosion. The upstream hint is
    # "set to None if not needed".
    # NOTE(review): a bare None in YAML loads as the string 'None'; the YAML
    # null value is `null` - confirm which one the trainer expects.
    clip_grad_norm: 1.0
    # Checkpoints the model every x steps. Written without a `_` digit
    # separator so that YAML 1.2 loaders also resolve it to an integer.
    checkpoint_every: 5000
    plot_every: 1000

    # Whether to filter data with bad attention scores.
    filter_attention: true
    # Filter data with bad attention sharpness score, if 0 then no filter.
    min_attention_sharpness: 0.5
    # Filter data with bad attention alignment score, if 0 then no filter.
    min_attention_alignment: 0.95


# Settings for the vocoder model, its training and generation.
vocoder:
  model:
    # Choices: ['RAW', 'MOL'].
    # 'RAW' = softmax on raw bits, 'MOL' = sample from mixture of logistics.
    # NOTE(review): dsp.voc_mode holds the same choice; presumably the two
    # must agree - confirm.
    mode: 'RAW'
    # NB - this needs to correctly factorise hop_length (4 * 8 * 8 = 256).
    upsample_factors: [4, 8, 8]
    rnn_dims: 512
    fc_dims: 512
    compute_dims: 128
    res_out_dims: 128
    res_blocks: 10
    # This will pad the input so that the resnet can 'see' wider than the
    # input length.
    pad: 2

  training:
    # Progressive training schedule. Each entry is a single comma-separated
    # string of (lr, step, batch_size).
    schedule:
      - '1e-4,  300_000,  32'
      - '1e-5,  600_000,  32'

    # Written without a `_` digit separator so that YAML 1.2 loaders also
    # resolve it to an integer.
    checkpoint_every: 25000
    # How often to generate samples for cherry-picking models.
    gen_samples_every: 5000
    # Number of samples to generate for cherry-picking models.
    num_gen_samples: 3
    keep_top_k: 3  # how many top performing models to keep
    seq_len: 1280  # must be a multiple of hop_length
    # The upstream hint is "set to None if no gradient clipping needed".
    # NOTE(review): a bare None in YAML loads as the string 'None'; the YAML
    # null value is `null` - confirm which one the trainer expects.
    clip_grad_norm: 4
    max_mel_len: 20000

    # Generating / Synthesizing
    # Very fast (realtime+) single utterance batched generation.
    gen_batched: true
    # Target number of samples to be generated in each batch entry (no `_`
    # digit separator, so YAML 1.2 loaders also resolve it to an integer).
    target: 11000
    # Number of samples for crossfading between batches.
    overlap: 550