feat/alt_transcripts

companion to OpenVoiceOS/ovos-plugin-manager#236 and OpenVoiceOS/ovos-dinkum-listener#124
OpenVoiceOS · Jun 18, 2024 · e10a045 · e10a045
1 parent f971a32
commit e10a045
Showing 1 changed file with 67 additions and 116 deletions.
diff --git a/ovos_stt_plugin_chromium/__init__.py b/ovos_stt_plugin_chromium/__init__.py
@@ -6,121 +6,6 @@
 from ovos_plugin_manager.templates.stt import STT
 from ovos_utils.log import LOG
 
-
-class ChromiumSTT(STT):
-    def __init__(self, *args, **kwargs):
-        super().__init__(*args, **kwargs)
-        self.pfilter = self.config.get("pfilter", False)
-        self.lang = self.config.get("lang") or self.lang
-
-        # no keys issued since at least march 9 2016
-        # http://web.archive.org/web/20160309230031/http://www.chromium.org/developers/how-tos/api-keys
-        # key scrapped from commit linked bellow, dated Jun 8, 2014
-        # https://github.com/Uberi/speech_recognition/commit/633c2cf54466a748d1db6ad0715c8cbdb27dbb09
-        # let's hope it just keeps on working!
-        default_key = "AIzaSyBOti4mM-6x9WDnZIjIeyEU21OpBXqWBgw"
-
-        self.key = self.config.get("key") or default_key
-        self.debug = self.config.get("debug", False)
-        if not self.debug:
-            log = logging.getLogger("urllib3.connectionpool")
-            log.setLevel("INFO")
-
-    def transcribe(self, audio, lang: Optional[str] = None) -> List[Tuple[str, float]]:
-        """transcribe audio data to a list of
-        possible transcriptions and respective confidences"""
-        flac_data = audio.get_flac_data(
-            convert_rate=None if audio.sample_rate >= 8000 else 8000,
-            # audio samples must be at least 8 kHz
-            convert_width=2  # audio samples must be 16-bit
-        )
-
-        params = {
-            "client": "chromium",
-            "lang": lang or self.lang,
-            "key": self.key,
-            "pFilter": int(self.pfilter)
-        }
-        sample_rate = str(audio.sample_rate)
-        headers = {"Content-Type": "audio/x-flac; rate=" + sample_rate}
-        url = "http://www.google.com/speech-api/v2/recognize"
-        r = requests.post(url, headers=headers, data=flac_data, params=params)
-
-        # weirdly this returns something like
-        """
-        {"result":[]}
-        {"result":[{"alternative":[{"transcript":"Hello world","confidence":0.83848035},{"transcript":"hello hello"},{"transcript":"Hello"},{"transcript":"Hello old"},{"transcript":"Hello howdy"}],"final":true}],"result_index":0}
-        """
-
-        result = r.text.split("\n")[1]
-        if not result:
-            return []
-        data = json.loads(result)["result"]
-        if len(data) == 0:
-            return ""
-        data = data[0]["alternative"]
-        if self.debug:
-            LOG.debug("transcriptions:" + str(data))
-        if len(data) == 0:
-            return []
-
-        candidates = [(u["transcript"], u.get("confidence", 0.0))
-                      for u in data]
-        return sorted(candidates, key=lambda alt: alt[1], reverse=True)
-
-    def execute(self, audio, language=None) -> str:
-        flac_data = audio.get_flac_data(
-            convert_rate=None if audio.sample_rate >= 8000 else 8000,
-            # audio samples must be at least 8 kHz
-            convert_width=2  # audio samples must be 16-bit
-        )
-
-        params = {
-            "client": "chromium",
-            "lang": language or self.lang,
-            "key": self.key,
-            "pFilter": int(self.pfilter)
-        }
-        sample_rate = str(audio.sample_rate)
-        headers = {"Content-Type": "audio/x-flac; rate=" + sample_rate}
-        url = "http://www.google.com/speech-api/v2/recognize"
-        r = requests.post(url, headers=headers, data=flac_data, params=params)
-
-        # weirdly this returns something like
-        """
-        {"result":[]}
-        {"result":[{"alternative":[{"transcript":"Hello world","confidence":0.83848035},{"transcript":"hello hello"},{"transcript":"Hello"},{"transcript":"Hello old"},{"transcript":"Hello howdy"}],"final":true}],"result_index":0}
-        """
-
-        result = r.text.split("\n")[1]
-        if not result:
-            return ""
-        data = json.loads(result)["result"]
-        if len(data) == 0:
-            return ""
-        data = data[0]["alternative"]
-        if self.debug:
-            LOG.debug("transcriptions:" + str(data))
-        if len(data) == 0:
-            return ""
-
-        # we arbitrarily choose the first hypothesis by default.
-        # results seem to be ordered by confidence
-        best_hypothesis = data[0]["transcript"]
-
-        # if confidence is provided return highest conf
-        candidates = [alt for alt in data if alt.get("confidence")]
-        if self.debug:
-            LOG.debug("confidences: " + str(candidates))
-
-        if len(candidates):
-            best = max(candidates, key=lambda alt: alt["confidence"])
-            best_hypothesis = best["transcript"]
-            if self.debug:
-                LOG.debug("best confidence: " + best_hypothesis)
-        return best_hypothesis
-
-
 # taken from https://stackoverflow.com/questions/14257598/what-are-language-codes-in-chromes-implementation-of-the-html5-speech-recogniti/14302134#14302134
 _lang = {
     "Afrikaans": [
@@ -316,6 +201,72 @@ def execute(self, audio, language=None) -> str:
              }
         ]
 
+
+class ChromiumSTT(STT):
+    """STT plugin that posts FLAC audio to the Google speech-api v2
+    ``recognize`` endpoint, identifying itself as the ``chromium`` client.
+
+    Config keys read in ``__init__``:
+        pfilter: forwarded (as an int) in the ``pFilter`` request parameter,
+            defaults to False (presumably the profanity filter - confirm)
+        lang: language code, falls back to the value set by the base class
+        key: API key, falls back to the bundled default key
+        debug: when truthy, log the raw alternatives returned by the service
+    """
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        self.pfilter = self.config.get("pfilter", False)
+        self.lang = self.config.get("lang") or self.lang
+
+        # no keys issued since at least march 9 2016
+        # http://web.archive.org/web/20160309230031/http://www.chromium.org/developers/how-tos/api-keys
+        # key scraped from the commit linked below, dated Jun 8, 2014
+        # https://github.com/Uberi/speech_recognition/commit/633c2cf54466a748d1db6ad0715c8cbdb27dbb09
+        # let's hope it just keeps on working!
+        default_key = "AIzaSyBOti4mM-6x9WDnZIjIeyEU21OpBXqWBgw"
+
+        self.key = self.config.get("key") or default_key
+        self.debug = self.config.get("debug", False)
+        if not self.debug:
+            # raise the urllib3 connection pool logger to INFO so its
+            # DEBUG records are hidden unless debugging was requested
+            log = logging.getLogger("urllib3.connectionpool")
+            log.setLevel("INFO")
+
+    def transcribe(self, audio, lang: Optional[str] = None) -> List[Tuple[str, float]]:
+        """Transcribe audio data to a list of possible transcriptions
+        and respective confidences.
+
+        Args:
+            audio: object exposing ``get_flac_data()`` and ``sample_rate``
+            lang: language code for this request, defaults to ``self.lang``
+
+        Returns:
+            ``(transcript, confidence)`` tuples sorted by descending
+            confidence; alternatives reported without a confidence get 0.0.
+            An empty list is returned when nothing was recognized.
+        """
+        flac_data = audio.get_flac_data(
+            convert_rate=None if audio.sample_rate >= 8000 else 8000,
+            # audio samples must be at least 8 kHz
+            convert_width=2  # audio samples must be 16-bit
+        )
+
+        params = {
+            "client": "chromium",
+            "lang": lang or self.lang,
+            "key": self.key,
+            "pFilter": int(self.pfilter)
+        }
+        sample_rate = str(audio.sample_rate)
+        headers = {"Content-Type": "audio/x-flac; rate=" + sample_rate}
+        # NOTE(review): plain http, the key and audio are sent unencrypted
+        url = "http://www.google.com/speech-api/v2/recognize"
+        r = requests.post(url, headers=headers, data=flac_data, params=params)
+
+        # weirdly this returns one JSON document per line, something like
+        #   {"result":[]}
+        #   {"result":[{"alternative":[{"transcript":"Hello world","confidence":0.83848035},{"transcript":"hello hello"}],"final":true}],"result_index":0}
+        # the hypotheses are read from the second line; a body with fewer
+        # than two lines is treated as "nothing recognized"
+        lines = r.text.split("\n")
+        result = lines[1] if len(lines) > 1 else ""
+        if not result:
+            return []
+        data = json.loads(result)["result"]
+        if not data:
+            # always hand back a list, as the annotation promises
+            return []
+        data = data[0]["alternative"]
+        if self.debug:
+            LOG.debug("transcriptions:" + str(data))
+        if not data:
+            return []
+
+        candidates = [(u["transcript"], u.get("confidence", 0.0))
+                      for u in data]
+        # sorted() is stable, so alternatives with equal confidence keep
+        # the order in which the service listed them
+        return sorted(candidates, key=lambda alt: alt[1], reverse=True)
+
+    def execute(self, audio, language=None) -> str:
+        """Return the most confident transcription of ``audio``.
+
+        Args:
+            audio: object accepted by ``transcribe``
+            language: language code for this request, defaults to ``self.lang``
+
+        Returns:
+            The best transcript, or an empty string when nothing was
+            recognized.
+        """
+        transcripts = self.transcribe(audio, language)
+        if not transcripts:
+            return ""
+        return transcripts[0][0]
+
+
 if __name__ == "__main__":
     b = ChromiumSTT()
     from speech_recognition import Recognizer, AudioFile
@@ -325,4 +276,4 @@ def execute(self, audio, language=None) -> str:
         audio = Recognizer().record(source)
 
     a = b.transcribe(audio, "en-us")
-    print(a)
+    print(a)