diff --git a/ros_speech_recognition/CMakeLists.txt b/ros_speech_recognition/CMakeLists.txt index 722faba02..10b7ba223 100644 --- a/ros_speech_recognition/CMakeLists.txt +++ b/ros_speech_recognition/CMakeLists.txt @@ -12,17 +12,14 @@ generate_dynamic_reconfigure_options( cfg/SpeechRecognition.cfg ) +add_custom_target(${PROJECT_NAME}_install_trained_data ALL COMMAND python$ENV{ROS_PYTHON_VERSION} ${PROJECT_SOURCE_DIR}/scripts/install_trained_data.py) + catkin_package() -if($ENV{ROS_DISTRO} STRGREATER "melodic") - catkin_generate_virtualenv( - PYTHON_INTERPRETER python3 - ) -else() - catkin_generate_virtualenv( - PYTHON_INTERPRETER python2 +catkin_generate_virtualenv( + PYTHON_INTERPRETER python3 + CHECK_VENV FALSE ) -endif() file(GLOB PYTHON_SCRIPT_FILES scripts/*.py test/*.py) catkin_install_python( diff --git a/ros_speech_recognition/README.md b/ros_speech_recognition/README.md index f47685ab5..fdd6418c0 100644 --- a/ros_speech_recognition/README.md +++ b/ros_speech_recognition/README.md @@ -220,6 +220,14 @@ roslaunch ros_speech_recognition parrotry.launch language:=ja-JP Auth key for Bing API. This is valid only if `~engine` is `bing`. + +* `~vosk_model_path` (`String`, default: `None`) + + Path to trained model for Vosk API. + This is valid only if `~engine` is `Vosk`. + + If `en-US` or `ja` is selected as `~language`, you do not need to specify the path. + To load other models, please download them from [Model list](https://alphacephei.com/vosk/models). 
## Author diff --git a/ros_speech_recognition/cfg/SpeechRecognition.cfg b/ros_speech_recognition/cfg/SpeechRecognition.cfg index 4894ec416..6157caddc 100755 --- a/ros_speech_recognition/cfg/SpeechRecognition.cfg +++ b/ros_speech_recognition/cfg/SpeechRecognition.cfg @@ -10,7 +10,8 @@ engine_enum = gen.enum([gen.const("Google", str_t, "Google", "Google Speech Reco gen.const("Wit", str_t, "Wit", "Wit.ai API"), gen.const("Bing", str_t, "Bing", "Microsoft Bing Speech API"), gen.const("Houndify", str_t, "Houndify", "Houndify API"), - gen.const("IBM", str_t, "IBM", "IBM Speech to Text API")], + gen.const("IBM", str_t, "IBM", "IBM Speech to Text API"), + gen.const("Vosk", str_t, "Vosk", "Vosk API")], "engine") # name type level description default min max diff --git a/ros_speech_recognition/package.xml b/ros_speech_recognition/package.xml index 2a0ccbeb5..2a8168e77 100644 --- a/ros_speech_recognition/package.xml +++ b/ros_speech_recognition/package.xml @@ -13,6 +13,7 @@ catkin_virtualenv dynamic_reconfigure + jsk_data speech_recognition_msgs g++-static @@ -20,6 +21,7 @@ audio_common_msgs dynamic_reconfigure flac + jsk_data sound_play speech_recognition_msgs ubuntu-sounds diff --git a/ros_speech_recognition/requirements.txt b/ros_speech_recognition/requirements.txt index 7362afdbf..43041b622 100644 --- a/ros_speech_recognition/requirements.txt +++ b/ros_speech_recognition/requirements.txt @@ -1 +1,2 @@ -SpeechRecognition==3.8.1 +SpeechRecognition==3.9.0 +vosk==0.3.45 diff --git a/ros_speech_recognition/scripts/install_trained_data.py b/ros_speech_recognition/scripts/install_trained_data.py new file mode 100644 index 000000000..ff4236e0a --- /dev/null +++ b/ros_speech_recognition/scripts/install_trained_data.py @@ -0,0 +1,44 @@ +#!/usr/bin/env python + +import argparse +import multiprocessing + +import jsk_data + + +def download_data(*args, **kwargs): + p = multiprocessing.Process( + target=jsk_data.download_data, + args=args, + kwargs=kwargs) + p.start() + + +def main(): + 
parser = argparse.ArgumentParser() + parser.add_argument('-v', '--verbose', dest='quiet', action='store_false') + args = parser.parse_args() + quiet = args.quiet + + PKG = 'ros_speech_recognition' + + download_data( + pkg_name=PKG, + path='trained_data/vosk-model-small-ja-0.22.zip', + url='https://alphacephei.com/vosk/models/vosk-model-small-ja-0.22.zip', # NOQA + md5='0e3163dd62dfb0d823353718ac3cbf79', + extract=True, + quiet=quiet, + ) + + download_data( + pkg_name=PKG, + path='trained_data/vosk-model-small-en-us-0.15.zip', + url='https://alphacephei.com/vosk/models/vosk-model-small-en-us-0.15.zip', # NOQA + md5='09ab50ccd62b674cbaa231b825f9c1cb', + extract=True, + quiet=quiet, + ) + +if __name__ == '__main__': + main() diff --git a/ros_speech_recognition/scripts/speech_recognition_node.py b/ros_speech_recognition/scripts/speech_recognition_node.py index 70fd98b1b..a0d98fef9 100644 --- a/ros_speech_recognition/scripts/speech_recognition_node.py +++ b/ros_speech_recognition/scripts/speech_recognition_node.py @@ -6,6 +6,7 @@ import rospy import speech_recognition as SR from ros_speech_recognition.recognize_google_cloud import RecognizerEx +import ros_speech_recognition.recognize_vosk import json import array import sys @@ -275,6 +276,10 @@ def recognize(self, audio): recog_func = self.recognizer.recognize_houndify elif self.engine == Config.SpeechRecognition_IBM: recog_func = self.recognizer.recognize_ibm + elif self.engine == Config.SpeechRecognition_Vosk: + if not self.args: + self.args = {'model_path': rospy.get_param('~vosk_model_path', None)} + recog_func = self.recognizer.recognize_vosk return recog_func(audio_data=audio, language=self.language, **self.args) diff --git a/ros_speech_recognition/src/ros_speech_recognition/recognize_vosk.py b/ros_speech_recognition/src/ros_speech_recognition/recognize_vosk.py new file mode 100644 index 000000000..73592478f --- /dev/null +++ b/ros_speech_recognition/src/ros_speech_recognition/recognize_vosk.py @@ -0,0 +1,37 @@ +# 
file to override recognize_vosk +# we need this to use vosk model anywhere + +from speech_recognition import AudioData +from ros_speech_recognition.recognize_google_cloud import RecognizerEx +from vosk import Model, KaldiRecognizer +import json +import os.path as osp +import rospkg +import rospy + +def recognize_vosk(self, audio_data, model_path=None, language='en-US'): + + assert isinstance(audio_data, AudioData), "Data must be audio data" + + if not hasattr(self, 'vosk_model'): + if model_path is None: + PKG = 'ros_speech_recognition' + rp = rospkg.RosPack() + data_path = osp.join(rp.get_path(PKG), 'trained_data') + if language == 'en-US': + model_path = osp.join(data_path, 'vosk-model-small-en-us-0.15') + elif language == 'ja': + model_path = osp.join(data_path, 'vosk-model-small-ja-0.22') + else: + rospy.logerr("Unsupported language: {0}.\n Please download the model from https://alphacephei.com/vosk/models and specify its path as 'vosk_model_path'.".format(language)) + exit (1) + rospy.loginfo("Loading model from {}".format(model_path)) + self.vosk_model = Model(model_path) + rec = KaldiRecognizer(self.vosk_model, 16000); + + rec.AcceptWaveform(audio_data.get_raw_data(convert_rate=16000, convert_width=2)); + finalRecognition = rec.FinalResult() + text = json.loads(finalRecognition)['text'] + return text + +RecognizerEx.recognize_vosk = recognize_vosk diff --git a/ros_speech_recognition/trained_data/.gitignore b/ros_speech_recognition/trained_data/.gitignore new file mode 100644 index 000000000..c96a04f00 --- /dev/null +++ b/ros_speech_recognition/trained_data/.gitignore @@ -0,0 +1,2 @@ +* +!.gitignore \ No newline at end of file