allow maximal flexibility in normalizing, either when loading data, or doing spectrogram transform lucidrains#82
cmu-mlsp · Feb 8, 2023 · 1831dae · 1831dae
1 parent 0863e89
commit 1831dae
Show file tree

Hide file tree

Showing 4 changed files with 27 additions and 8 deletions.
diff --git a/audiolm_pytorch/data.py b/audiolm_pytorch/data.py
@@ -37,6 +37,7 @@ def __init__(
         self,
         folder,
         exts = ['flac', 'wav'],
+        normalize = False,
         max_length: OptionalIntOrTupleInt = None,
         target_sample_hz: OptionalIntOrTupleInt = None,
         seq_len_multiple_of: OptionalIntOrTupleInt = None
@@ -49,6 +50,7 @@ def __init__(
         assert len(files) > 0, 'no sound files found'
 
         self.files = files
+        self.normalize = normalize
 
         self.target_sample_hz = cast_tuple(target_sample_hz)
         num_outputs = len(self.target_sample_hz)
@@ -64,7 +66,7 @@ def __len__(self):
     def __getitem__(self, idx):
         file = self.files[idx]
 
-        data, sample_hz = torchaudio.load(file)
+        data, sample_hz = torchaudio.load(file, normalize = self.normalize)
 
         assert data.numel() > 0, f'one of your audio file ({file}) is empty. please remove it from your folder'
 

diff --git a/audiolm_pytorch/soundstream.py b/audiolm_pytorch/soundstream.py
@@ -167,7 +167,8 @@ def __init__(
         input_channels = 1,
         n_fft = 1024,
         hop_length = 256,
-        win_length = 1024
+        win_length = 1024,
+        normalized = False
     ):
         super().__init__()
         self.init_conv = ComplexConv2d(input_channels, channels, 7, padding = 3)
@@ -187,6 +188,8 @@ def __init__(
 
         # stft settings
 
+        self.normalized = normalized
+
         self.n_fft = n_fft
         self.hop_length = hop_length
         self.win_length = win_length
@@ -207,6 +210,7 @@ def forward(self, x, return_intermediates = False):
             self.n_fft,
             hop_length = self.hop_length,
             win_length = self.win_length,
+            normalized = self.normalized,
             return_complex = True
         )
 
@@ -348,11 +352,13 @@ def __init__(
         rq_ema_decay = 0.95,
         input_channels = 1,
         discr_multi_scales = (1, 0.5, 0.25),
+        stft_normalized = False,
         enc_cycle_dilations = (1, 3, 9),
         dec_cycle_dilations = (1, 3, 9),
         multi_spectral_window_powers_of_two = tuple(range(6, 12)),
         multi_spectral_n_ffts = 512,
         multi_spectral_n_mels = 64,
+        multi_spectral_normalized = False,
         recon_loss_weight = 1.,
         multi_spectral_recon_loss_weight = 1.,
         adversarial_loss_weight = 1.,
@@ -440,7 +446,9 @@ def __init__(
         self.discr_multi_scales = discr_multi_scales
         self.discriminators = nn.ModuleList([MultiScaleDiscriminator() for _ in range(len(discr_multi_scales))])
 
-        self.stft_discriminator = ComplexSTFTDiscriminator()
+        self.stft_discriminator = ComplexSTFTDiscriminator(
+            normalized = stft_normalized
+        )
 
         # multi spectral reconstruction
 
@@ -465,6 +473,7 @@ def __init__(
                 win_length = win_length,
                 hop_length = win_length // 4,
                 n_mels = n_mels,
+                normalized = multi_spectral_normalized
             )
 
             self.mel_spec_transforms.append(melspec_transform)

diff --git a/audiolm_pytorch/trainer.py b/audiolm_pytorch/trainer.py
@@ -117,6 +117,7 @@ def __init__(
         batch_size,
         data_max_length = None,
         folder,
+        dataset_normalize = False,
         lr = 2e-4,
         grad_accum_every = 4,
         wd = 0.,
@@ -167,7 +168,8 @@ def __init__(
             folder,
             max_length = data_max_length,
             target_sample_hz = soundstream.target_sample_hz,
-            seq_len_multiple_of = soundstream.seq_len_multiple_of
+            seq_len_multiple_of = soundstream.seq_len_multiple_of,
+            normalize = dataset_normalize
         )
 
         # split for validation
@@ -435,6 +437,7 @@ def __init__(
         audio_conditioner: Optional[AudioConditionerBase] = None,
         dataset: Optional[Dataset] = None,
         data_max_length = None,
+        dataset_normalize = False,
         folder = None,
         lr = 3e-4,
         grad_accum_every = 1,
@@ -484,7 +487,8 @@ def __init__(
                 folder,
                 max_length = data_max_length,
                 target_sample_hz = wav2vec.target_sample_hz,
-                seq_len_multiple_of = wav2vec.seq_len_multiple_of
+                seq_len_multiple_of = wav2vec.seq_len_multiple_of,
+                normalize = dataset_normalize
             )
 
         self.ds_fields = None
@@ -664,6 +668,7 @@ def __init__(
         dataset: Optional[Dataset] = None,
         ds_fields: Tuple[str, ...] = ('raw_wave', 'raw_wave_for_soundstream', 'text'),
         data_max_length = None,
+        dataset_normalize = False,
         folder = None,
         lr = 3e-4,
         grad_accum_every = 1,
@@ -719,7 +724,8 @@ def __init__(
                     wav2vec.target_sample_hz,
                     soundstream.target_sample_hz
                 ), # need 2 waves resampled differently here
-                seq_len_multiple_of = soundstream.seq_len_multiple_of
+                seq_len_multiple_of = soundstream.seq_len_multiple_of,
+                normalize = dataset_normalize
             )
 
         self.ds_fields = ds_fields
@@ -900,6 +906,7 @@ def __init__(
         audio_conditioner: Optional[AudioConditionerBase] = None,
         dataset: Optional[Dataset] = None,
         data_max_length = None,
+        dataset_normalize = False,
         folder = None,
         lr = 3e-4,
         grad_accum_every = 1,
@@ -950,7 +957,8 @@ def __init__(
                 folder,
                 max_length = data_max_length,
                 target_sample_hz = soundstream.target_sample_hz,
-                seq_len_multiple_of = soundstream.seq_len_multiple_of
+                seq_len_multiple_of = soundstream.seq_len_multiple_of,
+                normalize = dataset_normalize
             )
 
         self.ds_fields = None

diff --git a/setup.py b/setup.py
@@ -3,7 +3,7 @@
 setup(
   name = 'audiolm-pytorch',
   packages = find_packages(exclude=[]),
-  version = '0.11.12',
+  version = '0.11.14',
   license='MIT',
   description = 'AudioLM - Language Modeling Approach to Audio Generation from Google Research - Pytorch',
   author = 'Phil Wang',