Skip to content

Commit

Permalink
feat/alternative_transcripts (#124)
Browse files Browse the repository at this point in the history
* feat/alternative_transcripts

closes OpenVoiceOS/ovos-plugin-manager#46

needs OpenVoiceOS/ovos-plugin-manager#236

* .

* Update requirements.txt

* Update voice_loop.py

* configurable min_conf

* from mycroft.conf

* max_transcripts
  • Loading branch information
JarbasAl authored Jun 20, 2024
1 parent 191bbd7 commit 5a794e8
Show file tree
Hide file tree
Showing 4 changed files with 64 additions and 39 deletions.
24 changes: 19 additions & 5 deletions ovos_dinkum_listener/plugins.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,9 @@
from typing import Any, Dict, Optional
from typing import Any, Dict, Optional, List, Tuple

from ovos_config.config import Configuration
from ovos_plugin_manager.stt import OVOSSTTFactory
from ovos_plugin_manager.templates.stt import StreamingSTT, StreamThread
from ovos_plugin_manager.utils import ReadWriteStream
from ovos_config.config import Configuration
from ovos_utils.log import LOG
from speech_recognition import AudioData

Expand All @@ -18,11 +19,11 @@ def __init__(self, queue, language, engine, sample_rate, sample_width):

def finalize(self):
""" return final transcription """
if not self.buffer:

if not self.buffer:
return ""

try:
try:
# plugins expect AudioData objects
audio = AudioData(self.buffer.read(),
sample_rate=self.sample_rate,
Expand Down Expand Up @@ -55,6 +56,19 @@ def create_streaming_thread(self):
return FakeStreamThread(self.queue, self.lang, self.engine, sample_rate,
sample_width)

def transcribe(self, audio: Optional = None,
               lang: Optional[str] = None) -> List[Tuple[str, float]]:
    """Transcribe audio into possible transcriptions and their confidences.

    Args:
        audio: audio to transcribe. An ``AudioData`` instance is handed to
            the engine as-is; any other truthy value (e.g. raw audio bytes)
            is wrapped in an ``AudioData`` using the stream's sample rate
            and sample width. When ``None`` (or otherwise falsy) the audio
            buffered in ``self.stream`` is read instead.
        lang: language forwarded unchanged to ``self.engine.transcribe``.

    Returns:
        The value returned by ``self.engine.transcribe``, annotated as a
        list of ``(transcript, confidence)`` tuples.

    Note:
        The stream buffer is cleared only when ``audio`` is ``None``, i.e.
        when the caller explicitly asked for the buffered audio.
    """
    if isinstance(audio, AudioData):
        # Already the type plugins expect. Wrapping it again would put an
        # AudioData object where the raw frame bytes belong.
        audiod = audio
    else:
        # plugins expect AudioData objects
        audiod = AudioData(audio or self.stream.buffer.read(),
                           sample_rate=self.stream.sample_rate,
                           sample_width=self.stream.sample_width)
    transcripts = self.engine.transcribe(audiod, lang)
    if audio is None:
        # the buffered audio has been consumed; drop it so it is not
        # transcribed again on the next call
        self.stream.buffer.clear()
    return transcripts


def load_stt_module(config: Dict[str, Any] = None) -> StreamingSTT:
"""
Expand Down
40 changes: 21 additions & 19 deletions ovos_dinkum_listener/service.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
from pathlib import Path
from tempfile import NamedTemporaryFile
from threading import Thread, RLock, Event
from typing import List, Tuple

import speech_recognition as sr
from distutils.spawn import find_executable
Expand Down Expand Up @@ -261,19 +262,15 @@ def _init_voice_loop(self, listener_config: dict):
fallback_stt=self.fallback_stt,
vad=self.vad,
transformers=self.transformers,
instant_listen=listener_config.get("instant_listen"),
instant_listen=listener_config.get("instant_listen", True),
speech_seconds=listener_config.get("speech_begin", 0.3),
silence_seconds=listener_config.get("silence_end", 0.7),
timeout_seconds=listener_config.get("recording_timeout", 10),
timeout_seconds_with_silence=listener_config.get("recording_timeout_with_silence", 5),
recording_mode_max_silence_seconds=listener_config.get("recording_mode_max_silence_seconds", 30),
num_stt_rewind_chunks=listener_config.get(
"utterance_chunks_to_rewind", 2),
num_hotword_keep_chunks=listener_config.get(
"wakeword_chunks_to_save", 15),
remove_silence=listener_config.get(
"remove_silence", False),
#
num_stt_rewind_chunks=listener_config.get("utterance_chunks_to_rewind", 2),
num_hotword_keep_chunks=listener_config.get("wakeword_chunks_to_save", 15),
remove_silence=listener_config.get("remove_silence", False),
wake_callback=self._record_begin,
text_callback=self._stt_text,
listenword_audio_callback=self._hotword_audio,
Expand All @@ -283,7 +280,9 @@ def _init_voice_loop(self, listener_config: dict):
stt_audio_callback=self._stt_audio,
recording_audio_callback=self._recording_audio,
wakeup_callback=self._wakeup,
record_end_callback=self._record_end_signal
record_end_callback=self._record_end_signal,
min_stt_confidence=listener_config.get("min_stt_confidence", 0.6),
max_transcripts=listener_config.get("max_transcripts", 1)
)
return loop

Expand Down Expand Up @@ -659,15 +658,14 @@ def _record_end_signal(self):
)
self.bus.emit(Message("recognizer_loop:record_end"))

def _stt_text(self, text: str, stt_context: dict):
if isinstance(text, list):
text = text[0]

def _stt_text(self, transcripts: List[Tuple[str, float]],
stt_context: dict):
# Report utterance to intent service
if text:
if transcripts:
utts = [u[0] for u in transcripts] # filter confidence
lang = stt_context.get("lang") or Configuration().get("lang", "en-us")
LOG.debug(f"STT: {text}")
payload = {"utterances": [text],
LOG.debug(f"STT: {utts}")
payload = {"utterances": utts,
"lang": lang}
self.bus.emit(Message("recognizer_loop:utterance", payload, stt_context))
elif self.voice_loop.listen_mode == ListeningMode.CONTINUOUS:
Expand Down Expand Up @@ -893,12 +891,16 @@ def _handle_b64_audio(self, message: Message):

audio = bytes2audiodata(wav_data)

utterance = self.voice_loop.stt.engine.execute(audio, lang)
utterances = self.voice_loop.stt.transcribe(audio, lang)
filtered = [u for u in utterances if u[1] >= self.voice_loop.min_stt_confidence]
if filtered != utterances:
LOG.info(f"Ignoring low confidence STT transcriptions: {[u for u in utterances if u not in filtered]}")

if utterance:
if filtered:
self.bus.emit(message.forward(
"recognizer_loop:utterance",
{"utterances": [utterance], "lang": lang}))
{"utterances": [u[0] for u in filtered],
"lang": lang}))
else:
self.bus.emit(message.forward(
"recognizer_loop:speech.recognition.unknown"))
Expand Down
37 changes: 23 additions & 14 deletions ovos_dinkum_listener/voice_loop/voice_loop.py
Original file line number Diff line number Diff line change
Expand Up @@ -117,6 +117,8 @@ class DinkumVoiceLoop(VoiceLoop):
hotword_chunks: Deque = field(default_factory=deque)
stt_chunks: Deque = field(default_factory=deque)
stt_audio_bytes: bytes = bytes()
min_stt_confidence: float = 0.6
max_transcripts: int = 1
last_ww: float = -1.0
speech_seconds_left: float = 0.0
silence_seconds_left: float = 0.0
Expand Down Expand Up @@ -707,21 +709,28 @@ def _get_tx(self, stt_context: dict) -> (str, dict):

# get text and trigger callback
try:
text = self.stt.stream_stop() or ""
utts = self.stt.transcribe() or []
except:
LOG.exception("STT failed")
text = ""
utts = []

if not text and self.fallback_stt is not None:
if not utts and self.fallback_stt is not None:
LOG.info("Attempting fallback STT plugin")
text = self.fallback_stt.stream_stop() or ""
try:
utts = self.fallback_stt.transcribe() or []
except:
LOG.exception("Fallback STT failed")

filtered = [u for u in utts if u[1] >= self.min_stt_confidence]
if filtered != utts:
LOG.info(f"Ignoring low confidence STT transcriptions: {[u for u in utts if u not in filtered]}")

if len(filtered) > self.max_transcripts:
LOG.debug(f"selecting top {self.max_transcripts} transcriptions")
filtered = filtered[:self.max_transcripts]

# TODO - some plugins return list of transcripts some just text
# standardize support for this
if isinstance(text, list):
text = text[0]
stt_context["transcription"] = text
return text, stt_context
stt_context["transcriptions"] = filtered
return filtered, stt_context

def _vad_remove_silence(self):
"""removes silence from the STT buffer using the VAD plugin
Expand Down Expand Up @@ -762,11 +771,11 @@ def _after_cmd(self, chunk: bytes):
if isinstance(self.stt, FakeStreamingSTT) and self.remove_silence:
self._vad_remove_silence()

text, stt_context = self._get_tx(stt_context)
utts, stt_context = self._get_tx(stt_context)

if text:
if utts:
LOG.debug(f"transformers metadata: {stt_context}")
LOG.info(f"transcribed: {text}")
LOG.info(f"transcribed: {utts}")
else:
LOG.info("nothing transcribed")
# Voice command has finished recording
Expand All @@ -781,7 +790,7 @@ def _after_cmd(self, chunk: bytes):

# Callback to handle STT text
if self.text_callback is not None:
self.text_callback(text, stt_context)
self.text_callback(utts, stt_context)

# Back to detecting wake word
if self.listen_mode == ListeningMode.CONTINUOUS or \
Expand Down
2 changes: 1 addition & 1 deletion requirements/requirements.txt
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
ovos-plugin-manager<0.1.0, >=0.0.26a27
ovos-plugin-manager<0.1.0, >=0.0.26a28
ovos-utils>=0.0.38
ovos-config<0.1.0, >=0.0.12
ovos-bus-client<0.1.0, >=0.0.7
Expand Down

0 comments on commit 5a794e8

Please sign in to comment.