diff --git a/ovos_stt_plugin_chromium/__init__.py b/ovos_stt_plugin_chromium/__init__.py index f9cc427..fb424e0 100644 --- a/ovos_stt_plugin_chromium/__init__.py +++ b/ovos_stt_plugin_chromium/__init__.py @@ -6,121 +6,6 @@ from ovos_plugin_manager.templates.stt import STT from ovos_utils.log import LOG - -class ChromiumSTT(STT): - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - self.pfilter = self.config.get("pfilter", False) - self.lang = self.config.get("lang") or self.lang - - # no keys issued since at least march 9 2016 - # http://web.archive.org/web/20160309230031/http://www.chromium.org/developers/how-tos/api-keys - # key scrapped from commit linked bellow, dated Jun 8, 2014 - # https://github.com/Uberi/speech_recognition/commit/633c2cf54466a748d1db6ad0715c8cbdb27dbb09 - # let's hope it just keeps on working! - default_key = "AIzaSyBOti4mM-6x9WDnZIjIeyEU21OpBXqWBgw" - - self.key = self.config.get("key") or default_key - self.debug = self.config.get("debug", False) - if not self.debug: - log = logging.getLogger("urllib3.connectionpool") - log.setLevel("INFO") - - def transcribe(self, audio, lang: Optional[str] = None) -> List[Tuple[str, float]]: - """transcribe audio data to a list of - possible transcriptions and respective confidences""" - flac_data = audio.get_flac_data( - convert_rate=None if audio.sample_rate >= 8000 else 8000, - # audio samples must be at least 8 kHz - convert_width=2 # audio samples must be 16-bit - ) - - params = { - "client": "chromium", - "lang": lang or self.lang, - "key": self.key, - "pFilter": int(self.pfilter) - } - sample_rate = str(audio.sample_rate) - headers = {"Content-Type": "audio/x-flac; rate=" + sample_rate} - url = "http://www.google.com/speech-api/v2/recognize" - r = requests.post(url, headers=headers, data=flac_data, params=params) - - # weirdly this returns something like - """ - {"result":[]} - {"result":[{"alternative":[{"transcript":"Hello 
world","confidence":0.83848035},{"transcript":"hello hello"},{"transcript":"Hello"},{"transcript":"Hello old"},{"transcript":"Hello howdy"}],"final":true}],"result_index":0} - """ - - result = r.text.split("\n")[1] - if not result: - return [] - data = json.loads(result)["result"] - if len(data) == 0: - return "" - data = data[0]["alternative"] - if self.debug: - LOG.debug("transcriptions:" + str(data)) - if len(data) == 0: - return [] - - candidates = [(u["transcript"], u.get("confidence", 0.0)) - for u in data] - return sorted(candidates, key=lambda alt: alt[1], reverse=True) - - def execute(self, audio, language=None) -> str: - flac_data = audio.get_flac_data( - convert_rate=None if audio.sample_rate >= 8000 else 8000, - # audio samples must be at least 8 kHz - convert_width=2 # audio samples must be 16-bit - ) - - params = { - "client": "chromium", - "lang": language or self.lang, - "key": self.key, - "pFilter": int(self.pfilter) - } - sample_rate = str(audio.sample_rate) - headers = {"Content-Type": "audio/x-flac; rate=" + sample_rate} - url = "http://www.google.com/speech-api/v2/recognize" - r = requests.post(url, headers=headers, data=flac_data, params=params) - - # weirdly this returns something like - """ - {"result":[]} - {"result":[{"alternative":[{"transcript":"Hello world","confidence":0.83848035},{"transcript":"hello hello"},{"transcript":"Hello"},{"transcript":"Hello old"},{"transcript":"Hello howdy"}],"final":true}],"result_index":0} - """ - - result = r.text.split("\n")[1] - if not result: - return "" - data = json.loads(result)["result"] - if len(data) == 0: - return "" - data = data[0]["alternative"] - if self.debug: - LOG.debug("transcriptions:" + str(data)) - if len(data) == 0: - return "" - - # we arbitrarily choose the first hypothesis by default. 
- # results seem to be ordered by confidence - best_hypothesis = data[0]["transcript"] - - # if confidence is provided return highest conf - candidates = [alt for alt in data if alt.get("confidence")] - if self.debug: - LOG.debug("confidences: " + str(candidates)) - - if len(candidates): - best = max(candidates, key=lambda alt: alt["confidence"]) - best_hypothesis = best["transcript"] - if self.debug: - LOG.debug("best confidence: " + best_hypothesis) - return best_hypothesis - - # taken from https://stackoverflow.com/questions/14257598/what-are-language-codes-in-chromes-implementation-of-the-html5-speech-recogniti/14302134#14302134 _lang = { "Afrikaans": [ @@ -316,6 +201,72 @@ def execute(self, audio, language=None) -> str: } ] + +class ChromiumSTT(STT): + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self.pfilter = self.config.get("pfilter", False) + self.lang = self.config.get("lang") or self.lang + + # no keys issued since at least march 9 2016 + # http://web.archive.org/web/20160309230031/http://www.chromium.org/developers/how-tos/api-keys + # key scraped from commit linked below, dated Jun 8, 2014 + # https://github.com/Uberi/speech_recognition/commit/633c2cf54466a748d1db6ad0715c8cbdb27dbb09 + # let's hope it just keeps on working! 
+ default_key = "AIzaSyBOti4mM-6x9WDnZIjIeyEU21OpBXqWBgw" + + self.key = self.config.get("key") or default_key + self.debug = self.config.get("debug", False) + if not self.debug: + log = logging.getLogger("urllib3.connectionpool") + log.setLevel("INFO") + + def transcribe(self, audio, lang: Optional[str] = None) -> List[Tuple[str, float]]: + """transcribe audio data to a list of + possible transcriptions and respective confidences""" + flac_data = audio.get_flac_data( + convert_rate=None if audio.sample_rate >= 8000 else 8000, + # audio samples must be at least 8 kHz + convert_width=2 # audio samples must be 16-bit + ) + + params = { + "client": "chromium", + "lang": lang or self.lang, + "key": self.key, + "pFilter": int(self.pfilter) + } + sample_rate = str(audio.sample_rate) + headers = {"Content-Type": "audio/x-flac; rate=" + sample_rate} + url = "http://www.google.com/speech-api/v2/recognize" + r = requests.post(url, headers=headers, data=flac_data, params=params) + + # weirdly this returns something like + """ + {"result":[]} + {"result":[{"alternative":[{"transcript":"Hello world","confidence":0.83848035},{"transcript":"hello hello"},{"transcript":"Hello"},{"transcript":"Hello old"},{"transcript":"Hello howdy"}],"final":true}],"result_index":0} + """ + + result = r.text.split("\n")[1] + if not result: + return [] + data = json.loads(result)["result"] + if len(data) == 0: + return "" + data = data[0]["alternative"] + if self.debug: + LOG.debug("transcriptions:" + str(data)) + if len(data) == 0: + return [] + + candidates = [(u["transcript"], u.get("confidence", 0.0)) + for u in data] + return sorted(candidates, key=lambda alt: alt[1], reverse=True) + + def execute(self, audio, language=None) -> str: + return self.transcribe(audio, language)[0][0] + + if __name__ == "__main__": b = ChromiumSTT() from speech_recognition import Recognizer, AudioFile @@ -325,4 +276,4 @@ def execute(self, audio, language=None) -> str: audio = Recognizer().record(source) a = 
b.transcribe(audio, "en-us") - print(a) \ No newline at end of file + print(a)