Skip to content

Commit

Permalink
Add large-v3 (#7)
Browse files Browse the repository at this point in the history
* Add large-v3

* Update README

* Set minimum requirement for faster-whisper
  • Loading branch information
goldyfruit authored Dec 5, 2023
1 parent 6218fe8 commit 6d3d5ac
Show file tree
Hide file tree
Showing 3 changed files with 52 additions and 22 deletions.
6 changes: 3 additions & 3 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ High-performance inference of [OpenAI's Whisper](https://github.com/openai/whisp

## Configuration

available models are `"tiny.en", "tiny", "base.en", "base", "small.en", "small", "medium.en", "medium", "large-v2"`
available models are `"tiny.en", "tiny", "base.en", "base", "small.en", "small", "medium.en", "medium", "large-v2", "large-v3"`

e.g., to use the large model with a GPU

Expand All @@ -21,7 +21,7 @@ To use Whisper as STT
"stt": {
"module": "ovos-stt-plugin-fasterwhisper",
"ovos-stt-plugin-fasterwhisper": {
"model": "large-v2",
"model": "large-v3",
"use_cuda": true,
"compute_type": "float16",
"beam_size": 5,
Expand All @@ -30,7 +30,7 @@ To use Whisper as STT
}
```

To use Whisper for lang detection (ovos-dinkum-listener only)
To use Whisper for lang detection (ovos-dinkum-listener only)


```json
Expand Down
66 changes: 48 additions & 18 deletions ovos_stt_plugin_fasterwhisper/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,8 +34,9 @@ def __init__(self, config=None):

@property
def valid_langs(self) -> List[str]:
return list(set([get_default_lang()] +
Configuration().get("secondary_langs", [])))
return list(
set([get_default_lang()] + Configuration().get("secondary_langs", []))
)

@staticmethod
def audiochunk2array(audio_data):
Expand All @@ -44,13 +45,12 @@ def audiochunk2array(audio_data):
audio_as_np_float32 = audio_as_np_int16.astype(np.float32)

# Normalise float32 array so that values are between -1.0 and +1.0
max_int16 = 2 ** 15
max_int16 = 2**15
data = audio_as_np_float32 / max_int16
return data

def detect(self, audio, valid_langs=None):
valid_langs = [l.lower().split("-")[0]
for l in valid_langs or self.valid_langs]
valid_langs = [l.lower().split("-")[0] for l in valid_langs or self.valid_langs]

if not self.engine.model.is_multilingual:
language = "en"
Expand All @@ -68,7 +68,9 @@ def detect(self, audio, valid_langs=None):
results = self.engine.model.detect_language(encoder_output)[0]
results = [(l[2:-2], p) for l, p in results if l[2:-2] in valid_langs]
total = sum(l[1] for l in results) or 1
results = sorted([(l, p / total) for l, p in results], key=lambda k: k[1], reverse=True)
results = sorted(
[(l, p / total) for l, p in results], key=lambda k: k[1], reverse=True
)

language, language_probability = results[0]
return language, language_probability
Expand All @@ -81,7 +83,19 @@ def transform(self, audio_data):


class FasterWhisperSTT(STT):
MODELS = ("tiny.en", "tiny", "base.en", "base", "small.en", "small", "medium.en", "medium", "large", "large-v2")
MODELS = (
"tiny.en",
"tiny",
"base.en",
"base",
"small.en",
"small",
"medium.en",
"medium",
"large",
"large-v2",
"large-v3",
)
LANGUAGES = {
"en": "english",
"zh": "chinese",
Expand Down Expand Up @@ -200,7 +214,12 @@ def __init__(self, *args, **kwargs):
device = "cuda"
else:
device = "cpu"
self.engine = WhisperModel(model, device=device, compute_type=self.compute_type, cpu_threads=self.cpu_threads)
self.engine = WhisperModel(
model,
device=device,
compute_type=self.compute_type,
cpu_threads=self.cpu_threads,
)

@staticmethod
def audiodata2array(audio_data):
Expand All @@ -209,8 +228,12 @@ def audiodata2array(audio_data):

def execute(self, audio, language=None):
lang = language or self.lang
segments, _ = self.engine.transcribe(self.audiodata2array(audio), beam_size=self.beam_size,
condition_on_previous_text=False, language=lang.split("-")[0].lower())
segments, _ = self.engine.transcribe(
self.audiodata2array(audio),
beam_size=self.beam_size,
condition_on_previous_text=False,
language=lang.split("-")[0].lower(),
)
# segments is an iterator, transcription only happens here
transcription = "".join(segment.text for segment in segments).strip()
return transcription
Expand All @@ -221,28 +244,35 @@ def available_languages(self) -> set:


FasterWhisperSTTConfig = {
lang: [{"model": "tiny",
lang: [
{
"model": "tiny",
"lang": lang,
"meta": {
"priority": 50,
"display_name": f"FasterWhisper (Tiny)",
"offline": True}
"offline": True,
},
{"model": "base",
},
{
"model": "base",
"lang": lang,
"meta": {
"priority": 55,
"display_name": f"FasterWhisper (Base)",
"offline": True}
"offline": True,
},
{"model": "small",
},
{
"model": "small",
"lang": lang,
"meta": {
"priority": 60,
"display_name": f"FasterWhisper (Small)",
"offline": True}
}
]
"offline": True,
},
},
]
for lang, lang_name in FasterWhisperSTT.LANGUAGES.items()
}

Expand Down
2 changes: 1 addition & 1 deletion requirements/requirements.txt
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
ovos-plugin-manager>=0.0.23a10
requests
SpeechRecognition>=3.8.1
faster-whisper
faster-whisper>=0.10.0

0 comments on commit 6d3d5ac

Please sign in to comment.