infer.py

from sklearn.utils.validation import _num_samples
from torchaudio import transforms as T
from torchaudio import load
from pyAudioAnalysis.audioSegmentation import speaker_diarization
import numpy as np 


from transformers import pipeline

from dataclasses import dataclass

from collections import defaultdict

import torch
from transformers import WhisperProcessor

from nltk import sent_tokenize

# DEVICE = torch.device('cuda') if torch.cuda.is_available() else torch.device("mps") if torch.backends.mps.is_available() else torch.device('cpu')
DEVICE = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
# PYTORCH_ENABLE_MPS_FALLBACK=1
# pretrained model path
# PRETRAINED = "openai/whisper-small"
PRETRAINED = "./models/laced-violet-28"
# PRETRAINED = "openai/whisper-large-v2"
# FILE = "./data/test.wav"
FILE = "../talkbank-alignment/testing_playground_2/input/test.wav"
# FILE = "../talkbank-alignment/broken2/input/53.wav"

@dataclass
class ASRAudioFile:
    file : str
    tensor : torch.Tensor
    rate : int

    def chunk(self,begin_ms, end_ms):
        """Get a chunk of the audio.

        Parameters
        ----------
        begin_ms : int
            Milliseconds of the start of the slice.
        end_ms : int
            Milliseconds of the end of the slice.

        Returns
        -------
        torch.Tensor
            The returned chunk to supply to the ASR engine.
        """

        data = self.tensor[int(round((begin_ms/1000)*self.rate)):
                           int(round((end_ms/1000)*self.rate))]

        return data

    def all(self):
        """Get the audio in its entirety

        Notes
        -----
        like `chunk()` but all of the audio
        """

        return self.tensor

# inference engine
class ASREngine(object):
    """An ASR Engine

    Parameters
    ----------
    model : str
        The model path to load from.
    target_sample_rate : optional, int
        The sample rate to cast to. Defaults 16000 by Whisper.

    Example
    -------
    >>> engine = ASREngine("./model/my_model")
    >>> file = engine.load("./data/myfile.wav")
    >>> engine(file.chunk(7000, 13000)) # transcribes 7000th ms to 13000th ms
    """

    def __init__(self, model, language="english", target_sample_rate=16000):
        self.pipe = pipeline(
            "automatic-speech-recognition",
            model=model,
            chunk_length_s=30,
            stride_length_s=3,
            device=DEVICE,
            return_timestamps="word",
        )
        processor = WhisperProcessor.from_pretrained(model)

        # force decoder IDs to create language
        self.__decoder_ids = processor.get_decoder_prompt_ids(language=language,
                                                              task="transcribe")
        self.__prompt_ids = processor.get_prompt_ids("I am going to pause a lot, stutter, and say a lot of ums and uhs, and I will will will uh write them down. Um here we go.")

        # save the target sample rate
        self.sample_rate = target_sample_rate

    def load(self, f, num_speakers):
        """Load an audio file for procesing.

        Parameters
        ----------
        f : str
            The audio .wav file name to process.
        num_speakers : int
            The number of speakers

        Returns
        -------
        Tuple[ASRAudioFile, List[dict]]
            Return processed audio file and speaker segments.
        """

        # function: load and resample audio
        audio_arr, rate = load(f)

        # resample if needed
        if rate != self.sample_rate:
            audio_arr = T.Resample(rate, self.sample_rate)(audio_arr)

        # transpose and mean
        resampled = torch.mean(audio_arr.transpose(0,1), dim=1)

        # perform diarization
        if num_speakers == 1:
            dia_cls = None
        else:
            print("Diarizing...")
            dia_cls = speaker_diarization(f, num_speakers, mid_step=0.1, lda_dim=5)[0]
            print("Done Diarizing")

        # and return the audio file
        return ASRAudioFile(f, resampled, self.sample_rate), dia_cls

    def __call__(self, data, segments):
        # we now perform the sweep line algorithm to align the
        # segment timestamps against the words
        groups = []

        if segments is not None:
            secs = np.array(range(len(segments))) * 0.5 + 0.1 / 2.0
            cur_start = 0
            cur_spk = segments[0]

            for indx, i in zip(secs, segments):
                if i != cur_spk:
                    # results is by 0.1 second steps
                    groups.append({
                        "type": "segment",
                        "start": cur_start/10,
                        "end": indx/10,
                        "payload": int(cur_spk)
                    })
                    cur_start = indx
                    cur_spk = i
        else:
            groups.append({
                "type": "segment",
                "start": 0,
                "end": len(data)/self.sample_rate,
                "payload": 0
            })

        words = self.pipe(data.cpu().numpy(),
                          batch_size=8, 
                          generate_kwargs = {
                              "forced_decoder_ids": self.__decoder_ids,
                              "prompt_ids": self.__prompt_ids,
                              "repetition_penalty": 1.2
                          })
                                             # "do_sample": True,
                                             # "temperature": 0.1
                                             # })
        # breakpoint()
                                             # "temperature": 0,
  #"temperature": 0.75,
                                             # })
        words = words["chunks"]

        for word in words:
            groups.append({
                "type": "text",
                "start": word["timestamp"][0],
                "end": word["timestamp"][1],
                "payload": word["text"]
            })

        # sorting the output to perform sweep
        groups = list(sorted(groups, key=lambda x:x["start"]))

        # tally turns together
        turns = []
        current_speaker = 0
        current_turn = []

        current_segment = groups.pop(0)
        while len(groups) > 0:
            element = groups.pop(0)

            if element["type"] == "text":
                current_turn.append({
                    "type": "text",
                    "ts": element["start"],
                    "end_ts": element["end"] if element["end"] else element["start"]+1,
                    "value": element["payload"].strip(),
                })
            elif element["type"] == "segment" and current_speaker != element["payload"]:
                turns.append({
                    "elements": current_turn,
                    "speaker": current_speaker[0] if type(current_speaker) == tuple else current_speaker
                })
                current_speaker = element["payload"],
                current_turn = []

        turns.append({
            "elements": current_turn,
            "speaker": current_speaker[0] if type(current_speaker) == tuple else current_speaker
        })

        return {
            "monologues": turns
        }


# mid_step = 0.1
# dia_cls = raw_dia[0]

# # create the segments
# groups = [] 
# cur_start = 0
# cur_spk = dia_cls[0]

# for indx, i in zip(secs, dia_cls):
#     if i != cur_spk:
#         # results is by 0.1 second steps
#         groups.append({
#             "type": "segment",
#             "start": cur_start/10,
#             "end": indx/10,
#             "payload": cur_spk
#         })
#         cur_start = indx
#         cur_spk = i

# groups

# raw_dia[:1000]

e = ASREngine(PRETRAINED, "english")
audio, segments = e.load(FILE, 1)
result = e(audio.all(), segments)
result
# result
# # words = raw["chunks"]