Skip to content

Commit

Permalink
feat/alt_transcripts
Browse files Browse the repository at this point in the history
  • Loading branch information
JarbasAl committed Jun 18, 2024
1 parent ba015b3 commit f971a32
Show file tree
Hide file tree
Showing 2 changed files with 125 additions and 68 deletions.
191 changes: 124 additions & 67 deletions ovos_stt_plugin_chromium/__init__.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,126 @@
import json
import logging
from typing import List, Tuple, Optional

import requests
from ovos_plugin_manager.templates.stt import STT
from ovos_utils.log import LOG


class ChromiumSTT(STT):
    """STT plugin that uses the public Chromium speech API endpoint.

    Config options (read from ``self.config``):
        lang (str): BCP-47 language code sent to the service
        key (str): API key; defaults to a long-public Chromium demo key
        pfilter (bool): enable the service-side profanity filter
        debug (bool): log transcription candidates and confidences
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.pfilter = self.config.get("pfilter", False)
        self.lang = self.config.get("lang") or self.lang

        # no keys issued since at least march 9 2016
        # http://web.archive.org/web/20160309230031/http://www.chromium.org/developers/how-tos/api-keys
        # key scraped from the commit linked below, dated Jun 8, 2014
        # https://github.com/Uberi/speech_recognition/commit/633c2cf54466a748d1db6ad0715c8cbdb27dbb09
        # let's hope it just keeps on working!
        default_key = "AIzaSyBOti4mM-6x9WDnZIjIeyEU21OpBXqWBgw"

        self.key = self.config.get("key") or default_key
        self.debug = self.config.get("debug", False)
        if not self.debug:
            # silence per-request connection noise unless explicitly debugging
            log = logging.getLogger("urllib3.connectionpool")
            log.setLevel("INFO")

    def _request(self, audio, lang: Optional[str] = None) -> List[dict]:
        """POST the audio to the speech API and return the raw alternatives.

        Args:
            audio: speech_recognition-style AudioData (provides
                ``get_flac_data`` and ``sample_rate``)
            lang: optional language override; falls back to ``self.lang``

        Returns:
            The service's ``alternative`` list of dicts (each with a
            ``transcript`` and optionally a ``confidence``); empty list
            when the service produced no transcription.
        """
        flac_data = audio.get_flac_data(
            convert_rate=None if audio.sample_rate >= 8000 else 8000,
            # audio samples must be at least 8 kHz
            convert_width=2  # audio samples must be 16-bit
        )
        params = {
            "client": "chromium",
            "lang": lang or self.lang,
            "key": self.key,
            "pFilter": int(self.pfilter)
        }
        headers = {"Content-Type": "audio/x-flac; rate=" + str(audio.sample_rate)}
        url = "http://www.google.com/speech-api/v2/recognize"
        r = requests.post(url, headers=headers, data=flac_data, params=params)

        # weirdly the service returns one JSON document per line, e.g.
        # {"result":[]}
        # {"result":[{"alternative":[{"transcript":"Hello world","confidence":0.83848035},...],"final":true}],"result_index":0}
        lines = r.text.split("\n")
        # guard: a truncated/empty response may have no second line at all,
        # which previously raised IndexError
        result = lines[1] if len(lines) > 1 else ""
        if not result:
            return []
        data = json.loads(result)["result"]
        if not data:
            return []
        alternatives = data[0].get("alternative") or []
        if self.debug:
            LOG.debug("transcriptions:" + str(alternatives))
        return alternatives

    def transcribe(self, audio, lang: Optional[str] = None) -> List[Tuple[str, float]]:
        """transcribe audio data to a list of
        possible transcriptions and respective confidences

        Entries the service returned without a confidence get 0.0 so they
        sort last. Always returns a list (previously returned "" when the
        service result was empty, violating the declared return type).
        """
        candidates = [(u["transcript"], u.get("confidence", 0.0))
                      for u in self._request(audio, lang)]
        # highest confidence first; sort is stable, so the service's own
        # ordering is kept among equal confidences
        return sorted(candidates, key=lambda alt: alt[1], reverse=True)

    def execute(self, audio, language: Optional[str] = None) -> str:
        """Return the single most likely transcription.

        Delegates to transcribe() (which already orders candidates by
        confidence, falling back to the service's own ordering) and picks
        the top hit; returns "" when nothing was recognized.
        """
        transcripts = self.transcribe(audio, language)
        if self.debug:
            LOG.debug("candidates: " + str(transcripts))
        return transcripts[0][0] if transcripts else ""


# taken from https://stackoverflow.com/questions/14257598/what-are-language-codes-in-chromes-implementation-of-the-html5-speech-recogniti/14302134#14302134
_lang = {
"Afrikaans": [
Expand Down Expand Up @@ -200,72 +316,13 @@
}
]

if __name__ == "__main__":
b = ChromiumSTT()
from speech_recognition import Recognizer, AudioFile

class ChromiumSTT(STT):
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
self.pfilter = self.config.get("pfilter", False)
self.lang = self.config.get("lang") or self.lang

# no keys issued since at least march 9 2016
# http://web.archive.org/web/20160309230031/http://www.chromium.org/developers/how-tos/api-keys
# key scrapped from commit linked bellow, dated Jun 8, 2014
# https://github.com/Uberi/speech_recognition/commit/633c2cf54466a748d1db6ad0715c8cbdb27dbb09
# let's hope it just keeps on working!
default_key = "AIzaSyBOti4mM-6x9WDnZIjIeyEU21OpBXqWBgw"

self.key = self.config.get("key") or default_key
self.debug = self.config.get("debug", False)
if not self.debug:
log = logging.getLogger("urllib3.connectionpool")
log.setLevel("INFO")

def execute(self, audio, language=None):
flac_data = audio.get_flac_data(
convert_rate=None if audio.sample_rate >= 8000 else 8000,
# audio samples must be at least 8 kHz
convert_width=2 # audio samples must be 16-bit
)

params = {
"client": "chromium",
"lang": language or self.lang,
"key": self.key,
"pFilter": int(self.pfilter)
}
sample_rate = str(audio.sample_rate)
headers = {"Content-Type": "audio/x-flac; rate=" + sample_rate}
url = "http://www.google.com/speech-api/v2/recognize"
r = requests.post(url, headers=headers, data=flac_data, params=params)

# weirdly this returns something like
"""
{"result":[]}
{"result":[{"alternative":[{"transcript":"Hello world","confidence":0.83848035},{"transcript":"hello hello"},{"transcript":"Hello"},{"transcript":"Hello old"},{"transcript":"Hello howdy"}],"final":true}],"result_index":0}
"""

result = r.text.split("\n")[1]
data = json.loads(result)["result"]
if len(data) == 0:
return ""
data = data[0]["alternative"]
if self.debug:
LOG.debug("transcriptions:" + str(data))
if len(data) == 0:
return ""

# we arbitrarily choose the first hypothesis by default.
# results seem to be ordered by confidence
best_hypothesis = data[0]["transcript"]

# if confidence is provided return highest conf
candidates = [alt for alt in data if alt.get("confidence")]
if self.debug:
LOG.debug("confidences: " + str(candidates))
jfk = f"test.flac"
with AudioFile(jfk) as source:
audio = Recognizer().record(source)

if len(candidates):
best = max(candidates, key=lambda alt: alt["confidence"])
best_hypothesis = best["transcript"]
if self.debug:
LOG.debug("best confidence: " + best_hypothesis)
return best_hypothesis
a = b.transcribe(audio, "en-us")
print(a)
2 changes: 1 addition & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
requests
ovos_utils>=0.0.8a3
ovos-plugin-manager>=0.0.1a7
ovos-plugin-manager<0.1.0, >=0.0.26a28

0 comments on commit f971a32

Please sign in to comment.