diff --git a/ros_speech_recognition/CMakeLists.txt b/ros_speech_recognition/CMakeLists.txt
index 722faba02..10b7ba223 100644
--- a/ros_speech_recognition/CMakeLists.txt
+++ b/ros_speech_recognition/CMakeLists.txt
@@ -12,17 +12,14 @@ generate_dynamic_reconfigure_options(
cfg/SpeechRecognition.cfg
)
+add_custom_target(${PROJECT_NAME}_install_trained_data ALL COMMAND python$ENV{ROS_PYTHON_VERSION} ${PROJECT_SOURCE_DIR}/scripts/install_trained_data.py)
+
catkin_package()
-if($ENV{ROS_DISTRO} STRGREATER "melodic")
- catkin_generate_virtualenv(
- PYTHON_INTERPRETER python3
- )
-else()
- catkin_generate_virtualenv(
- PYTHON_INTERPRETER python2
+catkin_generate_virtualenv(
+ PYTHON_INTERPRETER python3
+ CHECK_VENV FALSE
)
-endif()
file(GLOB PYTHON_SCRIPT_FILES scripts/*.py test/*.py)
catkin_install_python(
diff --git a/ros_speech_recognition/README.md b/ros_speech_recognition/README.md
index f47685ab5..fdd6418c0 100644
--- a/ros_speech_recognition/README.md
+++ b/ros_speech_recognition/README.md
@@ -220,6 +220,14 @@ roslaunch ros_speech_recognition parrotry.launch language:=ja-JP
Auth key for Bing API.
This is valid only if `~engine` is `bing`.
+
+* `~vosk_model_path` (`String`, default: `None`)
+
+  Path to trained model for Vosk API.
+ This is valid only if `~engine` is `Vosk`.
+
+ If `en-US` or `ja` is selected as `~language`, you do not need to specify the path.
+ To load other models, please download them from [Model list](https://alphacephei.com/vosk/models).
## Author
diff --git a/ros_speech_recognition/cfg/SpeechRecognition.cfg b/ros_speech_recognition/cfg/SpeechRecognition.cfg
index 4894ec416..6157caddc 100755
--- a/ros_speech_recognition/cfg/SpeechRecognition.cfg
+++ b/ros_speech_recognition/cfg/SpeechRecognition.cfg
@@ -10,7 +10,8 @@ engine_enum = gen.enum([gen.const("Google", str_t, "Google", "Google Speech Reco
gen.const("Wit", str_t, "Wit", "Wit.ai API"),
gen.const("Bing", str_t, "Bing", "Microsoft Bing Speech API"),
gen.const("Houndify", str_t, "Houndify", "Houndify API"),
- gen.const("IBM", str_t, "IBM", "IBM Speech to Text API")],
+ gen.const("IBM", str_t, "IBM", "IBM Speech to Text API"),
+ gen.const("Vosk", str_t, "Vosk", "Vosk API")],
"engine")
# name type level description default min max
diff --git a/ros_speech_recognition/package.xml b/ros_speech_recognition/package.xml
index 2a0ccbeb5..2a8168e77 100644
--- a/ros_speech_recognition/package.xml
+++ b/ros_speech_recognition/package.xml
@@ -13,6 +13,7 @@
catkin_virtualenv
dynamic_reconfigure
+ jsk_data
speech_recognition_msgs
g++-static
@@ -20,6 +21,7 @@
audio_common_msgs
dynamic_reconfigure
flac
+ jsk_data
sound_play
speech_recognition_msgs
ubuntu-sounds
diff --git a/ros_speech_recognition/requirements.txt b/ros_speech_recognition/requirements.txt
index 7362afdbf..43041b622 100644
--- a/ros_speech_recognition/requirements.txt
+++ b/ros_speech_recognition/requirements.txt
@@ -1 +1,2 @@
-SpeechRecognition==3.8.1
+SpeechRecognition==3.9.0
+vosk==0.3.45
diff --git a/ros_speech_recognition/scripts/install_trained_data.py b/ros_speech_recognition/scripts/install_trained_data.py
new file mode 100644
index 000000000..ff4236e0a
--- /dev/null
+++ b/ros_speech_recognition/scripts/install_trained_data.py
@@ -0,0 +1,44 @@
+#!/usr/bin/env python
+
+import argparse
+import multiprocessing
+
+import jsk_data
+
+
+def download_data(*args, **kwargs):
+ p = multiprocessing.Process(
+ target=jsk_data.download_data,
+ args=args,
+ kwargs=kwargs)
+ p.start()
+
+
+def main():
+ parser = argparse.ArgumentParser()
+ parser.add_argument('-v', '--verbose', dest='quiet', action='store_false')
+ args = parser.parse_args()
+ quiet = args.quiet
+
+ PKG = 'ros_speech_recognition'
+
+ download_data(
+ pkg_name=PKG,
+ path='trained_data/vosk-model-small-ja-0.22.zip',
+ url='https://alphacephei.com/vosk/models/vosk-model-small-ja-0.22.zip', # NOQA
+ md5='0e3163dd62dfb0d823353718ac3cbf79',
+ extract=True,
+ quiet=quiet,
+ )
+
+ download_data(
+ pkg_name=PKG,
+ path='trained_data/vosk-model-small-en-us-0.15.zip',
+ url='https://alphacephei.com/vosk/models/vosk-model-small-en-us-0.15.zip', # NOQA
+ md5='09ab50ccd62b674cbaa231b825f9c1cb',
+ extract=True,
+ quiet=quiet,
+ )
+
+if __name__ == '__main__':
+ main()
diff --git a/ros_speech_recognition/scripts/speech_recognition_node.py b/ros_speech_recognition/scripts/speech_recognition_node.py
index 70fd98b1b..a0d98fef9 100644
--- a/ros_speech_recognition/scripts/speech_recognition_node.py
+++ b/ros_speech_recognition/scripts/speech_recognition_node.py
@@ -6,6 +6,7 @@
import rospy
import speech_recognition as SR
from ros_speech_recognition.recognize_google_cloud import RecognizerEx
+import ros_speech_recognition.recognize_vosk
import json
import array
import sys
@@ -275,6 +276,10 @@ def recognize(self, audio):
recog_func = self.recognizer.recognize_houndify
elif self.engine == Config.SpeechRecognition_IBM:
recog_func = self.recognizer.recognize_ibm
+ elif self.engine == Config.SpeechRecognition_Vosk:
+ if not self.args:
+ self.args = {'model_path': rospy.get_param('~vosk_model_path', None)}
+ recog_func = self.recognizer.recognize_vosk
return recog_func(audio_data=audio, language=self.language, **self.args)
diff --git a/ros_speech_recognition/src/ros_speech_recognition/recognize_vosk.py b/ros_speech_recognition/src/ros_speech_recognition/recognize_vosk.py
new file mode 100644
index 000000000..73592478f
--- /dev/null
+++ b/ros_speech_recognition/src/ros_speech_recognition/recognize_vosk.py
@@ -0,0 +1,37 @@
+# Monkey-patch RecognizerEx with a recognize_vosk method,
+# so the offline Vosk model can be used like the other recognition engines.
+
+from speech_recognition import AudioData
+from ros_speech_recognition.recognize_google_cloud import RecognizerEx
+from vosk import Model, KaldiRecognizer
+import json
+import os.path as osp
+import rospkg
+import rospy
+
+def recognize_vosk(self, audio_data, model_path=None, language='en-US'):
+
+ assert isinstance(audio_data, AudioData), "Data must be audio data"
+
+ if not hasattr(self, 'vosk_model'):
+ if model_path is None:
+ PKG = 'ros_speech_recognition'
+ rp = rospkg.RosPack()
+ data_path = osp.join(rp.get_path(PKG), 'trained_data')
+ if language == 'en-US':
+ model_path = osp.join(data_path, 'vosk-model-small-en-us-0.15')
+ elif language == 'ja':
+ model_path = osp.join(data_path, 'vosk-model-small-ja-0.22')
+ else:
+ rospy.logerr("Unsupported language: {0}.\n Please download the model from https://alphacephei.com/vosk/models and specify its path as 'vosk_model_path'.".format(language))
+ exit (1)
+ rospy.loginfo("Loading model from {}".format(model_path))
+ self.vosk_model = Model(model_path)
+ rec = KaldiRecognizer(self.vosk_model, 16000);
+
+ rec.AcceptWaveform(audio_data.get_raw_data(convert_rate=16000, convert_width=2));
+ finalRecognition = rec.FinalResult()
+ text = json.loads(finalRecognition)['text']
+ return text
+
+RecognizerEx.recognize_vosk = recognize_vosk
diff --git a/ros_speech_recognition/trained_data/.gitignore b/ros_speech_recognition/trained_data/.gitignore
new file mode 100644
index 000000000..c96a04f00
--- /dev/null
+++ b/ros_speech_recognition/trained_data/.gitignore
@@ -0,0 +1,2 @@
+*
+!.gitignore
\ No newline at end of file