From 5a794e80a523323487d47de57d551434b42145e0 Mon Sep 17 00:00:00 2001 From: JarbasAI <33701864+JarbasAl@users.noreply.github.com> Date: Thu, 20 Jun 2024 22:06:32 +0100 Subject: [PATCH] feat/alternative_transcripts (#124) * feat/alternative_transcripts closes https://github.com/OpenVoiceOS/ovos-plugin-manager/issues/46 needs https://github.com/OpenVoiceOS/ovos-plugin-manager/pull/236 * . * Update requirements.txt * Update voice_loop.py * configurable min_conf * from mycroft.conf * max_transcripts --- ovos_dinkum_listener/plugins.py | 24 ++++++++--- ovos_dinkum_listener/service.py | 40 ++++++++++--------- ovos_dinkum_listener/voice_loop/voice_loop.py | 37 ++++++++++------- requirements/requirements.txt | 2 +- 4 files changed, 64 insertions(+), 39 deletions(-) diff --git a/ovos_dinkum_listener/plugins.py b/ovos_dinkum_listener/plugins.py index 9fa7cb8..3ef83bd 100644 --- a/ovos_dinkum_listener/plugins.py +++ b/ovos_dinkum_listener/plugins.py @@ -1,8 +1,9 @@ -from typing import Any, Dict, Optional +from typing import Any, Dict, Optional, List, Tuple + +from ovos_config.config import Configuration from ovos_plugin_manager.stt import OVOSSTTFactory from ovos_plugin_manager.templates.stt import StreamingSTT, StreamThread from ovos_plugin_manager.utils import ReadWriteStream -from ovos_config.config import Configuration from ovos_utils.log import LOG from speech_recognition import AudioData @@ -18,11 +19,11 @@ def __init__(self, queue, language, engine, sample_rate, sample_width): def finalize(self): """ return final transcription """ - - if not self.buffer: + + if not self.buffer: return "" - try: + try: # plugins expect AudioData objects audio = AudioData(self.buffer.read(), sample_rate=self.sample_rate, @@ -55,6 +56,19 @@ def create_streaming_thread(self): return FakeStreamThread(self.queue, self.lang, self.engine, sample_rate, sample_width) + def transcribe(self, audio: Optional = None, + lang: Optional[str] = None) -> List[Tuple[str, float]]: + """transcribe audio data to a list of + possible transcriptions and respective confidences""" + # plugins expect AudioData objects + audiod = AudioData(audio or self.stream.buffer.read(), + sample_rate=self.stream.sample_rate, + sample_width=self.stream.sample_width) + transcripts = self.engine.transcribe(audiod, lang) + if audio is None: + self.stream.buffer.clear() + return transcripts + def load_stt_module(config: Dict[str, Any] = None) -> StreamingSTT: """ diff --git a/ovos_dinkum_listener/service.py b/ovos_dinkum_listener/service.py index 6c94e97..a382b18 100644 --- a/ovos_dinkum_listener/service.py +++ b/ovos_dinkum_listener/service.py @@ -20,6 +20,7 @@ from pathlib import Path from tempfile import NamedTemporaryFile from threading import Thread, RLock, Event +from typing import List, Tuple import speech_recognition as sr from distutils.spawn import find_executable @@ -261,19 +262,15 @@ def _init_voice_loop(self, listener_config: dict): fallback_stt=self.fallback_stt, vad=self.vad, transformers=self.transformers, - instant_listen=listener_config.get("instant_listen"), + instant_listen=listener_config.get("instant_listen", True), speech_seconds=listener_config.get("speech_begin", 0.3), silence_seconds=listener_config.get("silence_end", 0.7), timeout_seconds=listener_config.get("recording_timeout", 10), timeout_seconds_with_silence=listener_config.get("recording_timeout_with_silence", 5), recording_mode_max_silence_seconds=listener_config.get("recording_mode_max_silence_seconds", 30), - num_stt_rewind_chunks=listener_config.get( - "utterance_chunks_to_rewind", 2), - num_hotword_keep_chunks=listener_config.get( - "wakeword_chunks_to_save", 15), - remove_silence=listener_config.get( - "remove_silence", False), - # + num_stt_rewind_chunks=listener_config.get("utterance_chunks_to_rewind", 2), + num_hotword_keep_chunks=listener_config.get("wakeword_chunks_to_save", 15), + remove_silence=listener_config.get("remove_silence", False), wake_callback=self._record_begin, text_callback=self._stt_text, listenword_audio_callback=self._hotword_audio, @@ -283,7 +280,9 @@ def _init_voice_loop(self, listener_config: dict): stt_audio_callback=self._stt_audio, recording_audio_callback=self._recording_audio, wakeup_callback=self._wakeup, - record_end_callback=self._record_end_signal + record_end_callback=self._record_end_signal, + min_stt_confidence=listener_config.get("min_stt_confidence", 0.6), + max_transcripts=listener_config.get("max_transcripts", 1) ) return loop @@ -659,15 +658,14 @@ def _record_end_signal(self): ) self.bus.emit(Message("recognizer_loop:record_end")) - def _stt_text(self, text: str, stt_context: dict): - if isinstance(text, list): - text = text[0] - + def _stt_text(self, transcripts: List[Tuple[str, float]], + stt_context: dict): # Report utterance to intent service - if text: + if transcripts: + utts = [u[0] for u in transcripts] # filter confidence lang = stt_context.get("lang") or Configuration().get("lang", "en-us") - LOG.debug(f"STT: {text}") - payload = {"utterances": [text], + LOG.debug(f"STT: {utts}") + payload = {"utterances": utts, "lang": lang} self.bus.emit(Message("recognizer_loop:utterance", payload, stt_context)) elif self.voice_loop.listen_mode == ListeningMode.CONTINUOUS: @@ -893,12 +891,16 @@ def _handle_b64_audio(self, message: Message): audio = bytes2audiodata(wav_data) - utterance = self.voice_loop.stt.engine.execute(audio, lang) + utterances = self.voice_loop.stt.transcribe(audio, lang) + filtered = [u for u in utterances if u[1] >= self.voice_loop.min_stt_confidence] + if filtered != utterances: + LOG.info(f"Ignoring low confidence STT transcriptions: {[u for u in utterances if u not in filtered]}") - if utterance: + if filtered: self.bus.emit(message.forward( "recognizer_loop:utterance", - {"utterances": [utterance], "lang": lang})) + {"utterances": [u[0] for u in filtered], + "lang": lang})) else: self.bus.emit(message.forward( "recognizer_loop:speech.recognition.unknown")) diff --git a/ovos_dinkum_listener/voice_loop/voice_loop.py b/ovos_dinkum_listener/voice_loop/voice_loop.py index 176dec6..47694aa 100644 --- a/ovos_dinkum_listener/voice_loop/voice_loop.py +++ b/ovos_dinkum_listener/voice_loop/voice_loop.py @@ -117,6 +117,8 @@ class DinkumVoiceLoop(VoiceLoop): hotword_chunks: Deque = field(default_factory=deque) stt_chunks: Deque = field(default_factory=deque) stt_audio_bytes: bytes = bytes() + min_stt_confidence: float = 0.6 + max_transcripts: int = 1 last_ww: float = -1.0 speech_seconds_left: float = 0.0 silence_seconds_left: float = 0.0 @@ -707,21 +709,28 @@ def _get_tx(self, stt_context: dict) -> (str, dict): # get text and trigger callback try: - text = self.stt.stream_stop() or "" + utts = self.stt.transcribe() or [] except: LOG.exception("STT failed") - text = "" + utts = [] - if not text and self.fallback_stt is not None: + if not utts and self.fallback_stt is not None: LOG.info("Attempting fallback STT plugin") - text = self.fallback_stt.stream_stop() or "" + try: + utts = self.fallback_stt.transcribe() or [] + except: + LOG.exception("Fallback STT failed") + + filtered = [u for u in utts if u[1] >= self.min_stt_confidence] + if filtered != utts: + LOG.info(f"Ignoring low confidence STT transcriptions: {[u for u in utts if u not in filtered]}") + + if len(filtered) > self.max_transcripts: + LOG.debug(f"selecting top {self.max_transcripts} transcriptions") + filtered = filtered[:self.max_transcripts] - # TODO - some plugins return list of transcripts some just text - # standardize support for this - if isinstance(text, list): - text = text[0] - stt_context["transcription"] = text - return text, stt_context + stt_context["transcriptions"] = filtered + return filtered, stt_context def _vad_remove_silence(self): """removes silence from the STT buffer using the VAD plugin @@ -762,11 +771,11 @@ def _after_cmd(self, chunk: bytes): if isinstance(self.stt, FakeStreamingSTT) and self.remove_silence: self._vad_remove_silence() - text, stt_context = self._get_tx(stt_context) + utts, stt_context = self._get_tx(stt_context) - if text: + if utts: LOG.debug(f"transformers metadata: {stt_context}") - LOG.info(f"transcribed: {text}") + LOG.info(f"transcribed: {utts}") else: LOG.info("nothing transcribed") # Voice command has finished recording @@ -781,7 +790,7 @@ def _after_cmd(self, chunk: bytes): # Callback to handle STT text if self.text_callback is not None: - self.text_callback(text, stt_context) + self.text_callback(utts, stt_context) # Back to detecting wake word if self.listen_mode == ListeningMode.CONTINUOUS or \ diff --git a/requirements/requirements.txt b/requirements/requirements.txt index a703e00..360657e 100644 --- a/requirements/requirements.txt +++ b/requirements/requirements.txt @@ -1,4 +1,4 @@ -ovos-plugin-manager<0.1.0, >=0.0.26a27 +ovos-plugin-manager<0.1.0, >=0.0.26a28 ovos-utils>=0.0.38 ovos-config<0.1.0, >=0.0.12 ovos-bus-client<0.1.0, >=0.0.7