Skip to content

Commit

Permalink
feat/alt_transcripts
Browse files Browse the repository at this point in the history
  • Loading branch information
JarbasAl committed Jun 18, 2024
1 parent ba015b3 commit f971a32
Show file tree
Hide file tree
Showing 2 changed files with 125 additions and 68 deletions.
191 changes: 124 additions & 67 deletions ovos_stt_plugin_chromium/__init__.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,126 @@
import json
import logging
from typing import List, Tuple, Optional

import requests
from ovos_plugin_manager.templates.stt import STT
from ovos_utils.log import LOG


class ChromiumSTT(STT):
    """STT plugin that uses the public Chromium speech API endpoint.

    Config options (read from ``self.config``):
        lang (str): BCP-47 language code sent to the service
        key (str): API key; defaults to a long-public Chromium demo key
        pfilter (bool): enable the service-side profanity filter
        debug (bool): log transcription candidates and confidences
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.pfilter = self.config.get("pfilter", False)
        self.lang = self.config.get("lang") or self.lang

        # no keys issued since at least march 9 2016
        # http://web.archive.org/web/20160309230031/http://www.chromium.org/developers/how-tos/api-keys
        # key scraped from the commit linked below, dated Jun 8, 2014
        # https://github.com/Uberi/speech_recognition/commit/633c2cf54466a748d1db6ad0715c8cbdb27dbb09
        # let's hope it just keeps on working!
        default_key = "AIzaSyBOti4mM-6x9WDnZIjIeyEU21OpBXqWBgw"

        self.key = self.config.get("key") or default_key
        self.debug = self.config.get("debug", False)
        if not self.debug:
            # silence per-request connection noise unless explicitly debugging
            log = logging.getLogger("urllib3.connectionpool")
            log.setLevel("INFO")

    def _request(self, audio, lang: Optional[str] = None) -> List[dict]:
        """POST the audio to the speech API and return the raw alternatives.

        Args:
            audio: speech_recognition-style AudioData (provides
                ``get_flac_data`` and ``sample_rate``)
            lang: optional language override; falls back to ``self.lang``

        Returns:
            The service's ``alternative`` list of dicts (each with a
            ``transcript`` and optionally a ``confidence``); empty list
            when the service produced no transcription.
        """
        flac_data = audio.get_flac_data(
            convert_rate=None if audio.sample_rate >= 8000 else 8000,
            # audio samples must be at least 8 kHz
            convert_width=2  # audio samples must be 16-bit
        )
        params = {
            "client": "chromium",
            "lang": lang or self.lang,
            "key": self.key,
            "pFilter": int(self.pfilter)
        }
        headers = {"Content-Type": "audio/x-flac; rate=" + str(audio.sample_rate)}
        url = "http://www.google.com/speech-api/v2/recognize"
        r = requests.post(url, headers=headers, data=flac_data, params=params)

        # weirdly the service returns one JSON document per line, e.g.
        # {"result":[]}
        # {"result":[{"alternative":[{"transcript":"Hello world","confidence":0.83848035},...],"final":true}],"result_index":0}
        lines = r.text.split("\n")
        # guard: a truncated/empty response may have no second line at all,
        # which previously raised IndexError
        result = lines[1] if len(lines) > 1 else ""
        if not result:
            return []
        data = json.loads(result)["result"]
        if not data:
            return []
        alternatives = data[0].get("alternative") or []
        if self.debug:
            LOG.debug("transcriptions:" + str(alternatives))
        return alternatives

    def transcribe(self, audio, lang: Optional[str] = None) -> List[Tuple[str, float]]:
        """transcribe audio data to a list of
        possible transcriptions and respective confidences

        Entries the service returned without a confidence get 0.0 so they
        sort last. Always returns a list (previously returned "" when the
        service result was empty, violating the declared return type).
        """
        candidates = [(u["transcript"], u.get("confidence", 0.0))
                      for u in self._request(audio, lang)]
        # highest confidence first; sort is stable, so the service's own
        # ordering is kept among equal confidences
        return sorted(candidates, key=lambda alt: alt[1], reverse=True)

    def execute(self, audio, language: Optional[str] = None) -> str:
        """Return the single most likely transcription.

        Delegates to transcribe() (which already orders candidates by
        confidence, falling back to the service's own ordering) and picks
        the top hit; returns "" when nothing was recognized.
        """
        transcripts = self.transcribe(audio, language)
        if self.debug:
            LOG.debug("candidates: " + str(transcripts))
        return transcripts[0][0] if transcripts else ""


# taken from https://stackoverflow.com/questions/14257598/what-are-language-codes-in-chromes-implementation-of-the-html5-speech-recogniti/14302134#14302134
_lang = {
"Afrikaans": [
Expand Down Expand Up @@ -200,72 +316,13 @@
}
]

if __name__ == "__main__":
b = ChromiumSTT()
from speech_recognition import Recognizer, AudioFile

class ChromiumSTT(STT):
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
self.pfilter = self.config.get("pfilter", False)
self.lang = self.config.get("lang") or self.lang

# no keys issued since at least march 9 2016
# http://web.archive.org/web/20160309230031/http://www.chromium.org/developers/how-tos/api-keys
# key scrapped from commit linked bellow, dated Jun 8, 2014
# https://github.com/Uberi/speech_recognition/commit/633c2cf54466a748d1db6ad0715c8cbdb27dbb09
# let's hope it just keeps on working!
default_key = "AIzaSyBOti4mM-6x9WDnZIjIeyEU21OpBXqWBgw"

self.key = self.config.get("key") or default_key
self.debug = self.config.get("debug", False)
if not self.debug:
log = logging.getLogger("urllib3.connectionpool")
log.setLevel("INFO")

def execute(self, audio, language=None):
flac_data = audio.get_flac_data(
convert_rate=None if audio.sample_rate >= 8000 else 8000,
# audio samples must be at least 8 kHz
convert_width=2 # audio samples must be 16-bit
)

params = {
"client": "chromium",
"lang": language or self.lang,
"key": self.key,
"pFilter": int(self.pfilter)
}
sample_rate = str(audio.sample_rate)
headers = {"Content-Type": "audio/x-flac; rate=" + sample_rate}
url = "http://www.google.com/speech-api/v2/recognize"
r = requests.post(url, headers=headers, data=flac_data, params=params)

# weirdly this returns something like
"""
{"result":[]}
{"result":[{"alternative":[{"transcript":"Hello world","confidence":0.83848035},{"transcript":"hello hello"},{"transcript":"Hello"},{"transcript":"Hello old"},{"transcript":"Hello howdy"}],"final":true}],"result_index":0}
"""

result = r.text.split("\n")[1]
data = json.loads(result)["result"]
if len(data) == 0:
return ""
data = data[0]["alternative"]
if self.debug:
LOG.debug("transcriptions:" + str(data))
if len(data) == 0:
return ""

# we arbitrarily choose the first hypothesis by default.
# results seem to be ordered by confidence
best_hypothesis = data[0]["transcript"]

# if confidence is provided return highest conf
candidates = [alt for alt in data if alt.get("confidence")]
if self.debug:
LOG.debug("confidences: " + str(candidates))
jfk = f"test.flac"
with AudioFile(jfk) as source:
audio = Recognizer().record(source)

if len(candidates):
best = max(candidates, key=lambda alt: alt["confidence"])
best_hypothesis = best["transcript"]
if self.debug:
LOG.debug("best confidence: " + best_hypothesis)
return best_hypothesis
a = b.transcribe(audio, "en-us")
print(a)
2 changes: 1 addition & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
requests
ovos_utils>=0.0.8a3
ovos-plugin-manager>=0.0.1a7
ovos-plugin-manager<0.1.0, >=0.0.26a28

0 comments on commit f971a32

Please sign in to comment.