"""
File for loading data from the correct folders into the model
"""
import numpy as np
import glob
from torch.utils.data import Dataset
import config
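
# The imported config module is assumed (inferred from the usages below) to
# provide at least: noise_flag, noiseT, validation, training_data_path,
# training_label_path, testing_data_path, testing_label_path,
# SNR_level_AURORA and noise_type_AURORA.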


def get_element_train(idx, data, train=0):
    """
    Used in __getitem__ of the dataloader classes. Takes the data and label paths
    and returns the data stored inside them.

    Parameters
    ----------
    idx : int
        Index of the data to be unpackaged. Randomly generated by the dataloader class.
    data : list
        List containing [data_path, label_path] pairs.
    train : int, optional
        Flag to check if the test or train set is desired. Currently unused.
        The default is 0.

    Returns
    -------
    x : numpy array
        The raw speech data in 16 bit format, shaped (1, 1, n_samples).
    y : numpy array
        The corresponding labels matching the speech data.
    noise_type : string
        The noise type parsed from the data path.
    SNR : string
        The folder name (noise type and SNR level) parsed from the data path.
    """
    data_tensor = []
    label_tensor = []
    if config.noise_flag:
        data_path, label_path = data[idx]
        data_tensor.append(np.fromfile(data_path, dtype='>i2'))
        label_tensor.append(np.fromfile(label_path, sep="\n"))
        # Parse the Windows-style path: noise_type is the first two characters of
        # the folder name between the last two backslashes, SNR the whole folder name
        indices_backslash = [i for i, ltr in enumerate(data_path) if ltr == '\\']
        noise_type = data_path[indices_backslash[-2]+1:indices_backslash[-2]+3]
        SNR = data_path[indices_backslash[-2]+1:indices_backslash[-1]]
        # Trim the waveform so it matches the labels (80 samples per label frame)
        data_tensor[-1] = data_tensor[-1][0:len(label_tensor[-1])*80]
        x = np.zeros((1, 1, sum(len(item) for item in data_tensor)))
        x[0, 0, :] = np.hstack(data_tensor)
        y = np.hstack(label_tensor)
        return x, y, noise_type, SNR
    else:
        while True:
            data_path, label_path = data[idx]
            indices_backslash = [i for i, ltr in enumerate(data_path) if ltr == '\\']
            SNR = data_path[indices_backslash[-2]+1:indices_backslash[-1]]
            noise_type = data_path[indices_backslash[-2]+1:indices_backslash[-2]+3]
            # Accept the file only if its folder matches the configured noise
            # condition (the last character is allowed to differ)
            if SNR == config.noiseT[0] or SNR[0:-1] == config.noiseT[0][0:-1]:
                data_tensor.append(np.fromfile(data_path, dtype='>i2'))
                label_tensor.append(np.fromfile(label_path, sep="\n"))
                data_tensor[-1] = data_tensor[-1][0:len(label_tensor[-1])*80]
                x = np.zeros((1, 1, sum(len(item) for item in data_tensor)))
                x[0, 0, :] = np.hstack(data_tensor)
                y = np.hstack(label_tensor)
                return x, y, noise_type, SNR
            else:
                # Wrong noise condition: draw a new random index and try again
                # (the bound 8439 is assumed to match the training-set size)
                idx = np.random.randint(0, 8439)
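
# A worked example of the path parsing above, using a hypothetical Windows path:
#   data_path  = r"...\train\N1_SNR10\utt001.08"   (folder/file names are made up)
#   noise_type = "N1"        (two characters after the second-to-last backslash)
#   SNR        = "N1_SNR10"  (the full folder name: noise type plus SNR level)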


def get_paths_train(SNR, train=0, noise_type="string"):
    """
    Used in the __init__ function of the dataloaders. Finds the paths to the
    desired train/test set and returns them as a list; each label path is
    derived from the file name of its data path.

    Parameters
    ----------
    SNR : string
        The SNR of the desired test set. Currently unused.
    train : int, optional
        Flag to check if the test or train set is desired. The default is 0,
        which corresponds to the test set. Currently unused.
    noise_type : string, optional
        The desired noise type. Currently unused.

    Returns
    -------
    data : list
        List containing all the [data_path, label_path] pairs.
    """
    data_folder_path = config.training_data_path
    label_folder_path = config.training_label_path
    folders_data = glob.glob(data_folder_path + "/*/")
    # Collect every speech file (*.08) from every noise/SNR folder
    data_file_list = []
    for folder_data in folders_data:
        data_file_list.extend(glob.glob(folder_data + "/*.08"))
    # Derive each label path from the file name of its data path
    # (the file name without its .08 extension)
    data = []
    for file_path in data_file_list:
        indices_backslash = [i for i, ltr in enumerate(file_path) if ltr == '\\']
        label_path = f"{label_folder_path}\\{file_path[indices_backslash[-1]+1:-3]}"
        data.append([file_path, label_path])
    return data
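
# A sketch of the directory layout the globbing above assumes (folder and file
# names are hypothetical; the real roots come from config):
#
#   training_data_path/
#       <noise folder>/     one folder per noise-type/SNR combination
#           utt001.08       raw 16-bit big-endian speech samples
#           ...
#   training_label_path/
#       utt001              newline-separated labels, one per 80-sample frame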
"""
Each dataloader inherits the Dataset class. An __init__, __len__ and __getitem__ function are
necessary for the dataloader to work. One dataloader is created for the test and validation set
and one is created for the training set
"""


class AURORA2_train(Dataset):
    def __init__(self):
        self.data = get_paths_train("0", train=1)

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        return get_element_train(idx, self.data, train=1)


class AURORA2_test(Dataset):
    def __init__(self):
        data_folder_path = config.testing_data_path
        label_folder_path = config.testing_label_path
        # Find the folder corresponding to the desired noise type and SNR level
        if config.SNR_level_AURORA == "CLEA":
            folders = f"CLEA{config.noise_type_AURORA}"
        else:
            folders = f"{config.noise_type_AURORA}_SNR{config.SNR_level_AURORA}"
        # Find the paths for all the desired speech files
        folders_data = glob.glob(data_folder_path + f"\\{folders}*")
        # Derive the label path from the file name of each speech path and zip
        # the speech and label paths into one list
        self.data = []
        for folder_data in folders_data:
            for file_path in glob.glob(folder_data + "/*.08"):
                indices_backslash = [i for i, ltr in enumerate(file_path) if ltr == '\\']
                label_path = f"{label_folder_path}\\{file_path[indices_backslash[-1]+1:-3]}"
                self.data.append([file_path, label_path])

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        """
        Unpacks the speech and label files and saves them into numpy arrays.
        """
        data_tensor = []
        label_tensor = []
        # For the validation split, only indices 0-333 are valid
        if config.validation:
            if idx <= 333:
                data_path, label_path = self.data[idx]
                data_tensor.append(np.fromfile(data_path, dtype='>i2'))
                label_tensor.append(np.fromfile(label_path, sep="\n"))
                # Trim the waveform so it matches the labels (80 samples per label frame)
                data_tensor[-1] = data_tensor[-1][0:len(label_tensor[-1])*80]
                x = np.zeros((1, 1, sum(len(item) for item in data_tensor)))
                x[0, 0, :] = np.hstack(data_tensor)
                y = np.hstack(label_tensor)
                return x, y
            # If the randomly picked file is not in the right split, return empty arrays
            return np.zeros((1, 1, 0)), np.zeros((1))
        # For the testing split, only indices above 333 are valid
        else:
            if idx > 333:
                data_path, label_path = self.data[idx]
                data_tensor.append(np.fromfile(data_path, dtype='>i2'))
                label_tensor.append(np.fromfile(label_path, sep="\n"))
                data_tensor[-1] = data_tensor[-1][0:len(label_tensor[-1])*80]
                x = np.zeros((1, 1, sum(len(item) for item in data_tensor)))
                x[0, 0, :] = np.hstack(data_tensor)
                y = np.hstack(label_tensor)
                return x, y
            # If the randomly picked file is not in the right split, return empty arrays
            return np.zeros((1, 1, 0)), np.zeros((1))
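

# A minimal usage sketch (not part of the original pipeline), assuming torch is
# installed and config points at the data. batch_size=1 is used because
# __getitem__ returns variable-length sequences, which the default collate_fn
# cannot batch together.
if __name__ == "__main__":
    from torch.utils.data import DataLoader

    train_loader = DataLoader(AURORA2_train(), batch_size=1, shuffle=True)
    test_loader = DataLoader(AURORA2_test(), batch_size=1, shuffle=False)

    # The train loader yields (x, y, noise_type, SNR); the test loader yields (x, y)
    x, y, noise_type, SNR = next(iter(train_loader))
    print(x.shape, y.shape, noise_type, SNR)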