diff --git a/3rdparty/voicevox/.gitignore b/3rdparty/voicevox/.gitignore
new file mode 100644
index 000000000..8cb8e60a3
--- /dev/null
+++ b/3rdparty/voicevox/.gitignore
@@ -0,0 +1,6 @@
+build
+dict
+lib
+node_scripts/voicevox_engine
+requirements.txt
+!.gitignore
diff --git a/3rdparty/voicevox/CMakeLists.txt b/3rdparty/voicevox/CMakeLists.txt
new file mode 100644
index 000000000..1e7297b5d
--- /dev/null
+++ b/3rdparty/voicevox/CMakeLists.txt
@@ -0,0 +1,71 @@
+cmake_minimum_required(VERSION 2.8.3)
+project(voicevox)
+
+find_package(catkin REQUIRED
+ COMPONENTS
+ catkin_virtualenv
+)
+
+set(INSTALL_DIR ${PROJECT_SOURCE_DIR})
+
+catkin_package()
+
+catkin_generate_virtualenv(
+ INPUT_REQUIREMENTS requirements.in
+ PYTHON_INTERPRETER python3
+ USE_SYSTEM_PACKAGES FALSE
+)
+
+add_custom_command(
+ OUTPUT voicevox_model_installed
+ COMMAND make -f ${PROJECT_SOURCE_DIR}/Makefile.model
+ MD5SUM_DIR=${PROJECT_SOURCE_DIR}/md5sum
+ INSTALL_DIR=${INSTALL_DIR}
+)
+
+
+add_custom_command(
+ OUTPUT voicevox_core_installed
+ COMMAND make -f ${PROJECT_SOURCE_DIR}/Makefile.core
+ MD5SUM_DIR=${PROJECT_SOURCE_DIR}/md5sum
+ INSTALL_DIR=${INSTALL_DIR}
+)
+
+add_custom_command(
+ OUTPUT voicevox_engine_installed
+ COMMAND make -f ${PROJECT_SOURCE_DIR}/Makefile.engine
+ MD5SUM_DIR=${PROJECT_SOURCE_DIR}/md5sum
+ INSTALL_DIR=${INSTALL_DIR}
+)
+
+add_custom_command(
+ OUTPUT open_jtalk_dic_installed
+ COMMAND make -f ${PROJECT_SOURCE_DIR}/Makefile.open_jtalk_dic
+ MD5SUM_DIR=${PROJECT_SOURCE_DIR}/md5sum
+ INSTALL_DIR=${INSTALL_DIR}
+)
+
+add_custom_target(all_installed ALL DEPENDS
+ voicevox_model_installed
+ voicevox_core_installed
+ voicevox_engine_installed
+ open_jtalk_dic_installed)
+
+file(GLOB NODE_SCRIPTS_FILES node_scripts/*.py)
+catkin_install_python(
+ PROGRAMS ${NODE_SCRIPTS_FILES}
+ DESTINATION ${CATKIN_PACKAGE_BIN_DESTINATION}/node_scripts/
+)
+install(DIRECTORY node_scripts/voicevox_engine
+ DESTINATION ${CATKIN_PACKAGE_SHARE_DESTINATION}/catkin_virtualenv_scripts/
+ USE_SOURCE_PERMISSIONS)
+install(DIRECTORY launch dict
+ DESTINATION ${CATKIN_PACKAGE_SHARE_DESTINATION}
+ USE_SOURCE_PERMISSIONS)
+install(PROGRAMS bin/text2wave
+ DESTINATION ${CATKIN_PACKAGE_SHARE_DESTINATION}/bin)
+
+install(DIRECTORY
+ ${INSTALL_DIR}/lib
+ DESTINATION ${CATKIN_PACKAGE_SHARE_DESTINATION}
+ USE_SOURCE_PERMISSIONS)
diff --git a/3rdparty/voicevox/Makefile b/3rdparty/voicevox/Makefile
new file mode 100644
index 000000000..a2c90f3bb
--- /dev/null
+++ b/3rdparty/voicevox/Makefile
@@ -0,0 +1,11 @@
+all:
+ make -f Makefile.core
+ make -f Makefile.model
+ make -f Makefile.engine
+ make -f Makefile.open_jtalk_dic
+clean:
+ make -f Makefile.core clean
+ make -f Makefile.model clean
+ make -f Makefile.engine clean
+ make -f Makefile.open_jtalk_dic clean
+ rm -rf build
diff --git a/3rdparty/voicevox/Makefile.core b/3rdparty/voicevox/Makefile.core
new file mode 100644
index 000000000..bac21eb0f
--- /dev/null
+++ b/3rdparty/voicevox/Makefile.core
@@ -0,0 +1,28 @@
+# -*- makefile -*-
+
+all: installed.voicevox_core
+
+VERSION = 0.11.4
+FILENAME = core.zip
+TARBALL = build/$(FILENAME)
+TARBALL_URL = "https://github.com/VOICEVOX/voicevox_core/releases/download/$(VERSION)/core.zip"
+SOURCE_DIR = build/core
+UNPACK_CMD = unzip
+MD5SUM_DIR = $(CURDIR)/md5sum
+MD5SUM_FILE = $(MD5SUM_DIR)/$(FILENAME).md5sum
+include $(shell rospack find mk)/download_unpack_build.mk
+INSTALL_DIR = './'
+
+
+installed.voicevox_core: $(SOURCE_DIR)/unpacked
+ mkdir -p $(INSTALL_DIR)/lib
+ cp build/core/lib*.so $(INSTALL_DIR)/lib/
+ cp build/core/*.bin $(INSTALL_DIR)/lib/
+ cp build/core/metas.json $(INSTALL_DIR)/lib/metas.json
+
+clean:
+ rm -rf $(TARBALL)
+ rm -rf $(SOURCE_DIR)
+ rm -rf $(INSTALL_DIR)/lib
+ rm -rf build
diff --git a/3rdparty/voicevox/Makefile.engine b/3rdparty/voicevox/Makefile.engine
new file mode 100644
index 000000000..b3d6899fa
--- /dev/null
+++ b/3rdparty/voicevox/Makefile.engine
@@ -0,0 +1,24 @@
+# -*- makefile -*-
+
+all: installed.voicevox_engine
+
+VERSION = 0.11.4
+FILENAME = $(VERSION).tar.gz
+TARBALL = build/$(FILENAME)
+TARBALL_URL = "https://github.com/VOICEVOX/voicevox_engine/archive/refs/tags/$(FILENAME)"
+SOURCE_DIR = build/voicevox_engine-$(VERSION)
+UNPACK_CMD = tar xvzf
+MD5SUM_DIR = $(CURDIR)/md5sum
+MD5SUM_FILE = $(MD5SUM_DIR)/voicevox_engine.tar.gz.md5sum
+include $(shell rospack find mk)/download_unpack_build.mk
+INSTALL_DIR = './'
+
+
+installed.voicevox_engine: $(SOURCE_DIR)/unpacked
+ cp -r build/voicevox_engine-$(VERSION) $(INSTALL_DIR)/node_scripts/voicevox_engine
+
+clean:
+ rm -rf $(TARBALL)
+ rm -rf $(SOURCE_DIR)
+ rm -rf $(INSTALL_DIR)/node_scripts/voicevox_engine
+ rm -rf build
diff --git a/3rdparty/voicevox/Makefile.model b/3rdparty/voicevox/Makefile.model
new file mode 100644
index 000000000..004028105
--- /dev/null
+++ b/3rdparty/voicevox/Makefile.model
@@ -0,0 +1,26 @@
+# -*- makefile -*-
+
+all: installed.voicevox_model
+
+VERSION = 1.10.0
+FILENAME = onnxruntime-linux-x64-$(VERSION).tgz
+TARBALL = build/$(FILENAME)
+TARBALL_URL = "https://github.com/microsoft/onnxruntime/releases/download/v$(VERSION)/$(FILENAME)"
+SOURCE_DIR = build/onnxruntime-linux-x64-$(VERSION)
+UNPACK_CMD = tar xvzf
+MD5SUM_DIR = $(CURDIR)/md5sum
+MD5SUM_FILE = $(MD5SUM_DIR)/$(FILENAME).md5sum
+include $(shell rospack find mk)/download_unpack_build.mk
+INSTALL_DIR = './'
+
+
+installed.voicevox_model: $(SOURCE_DIR)/unpacked
+ mkdir -p $(INSTALL_DIR)/lib
+ cp build/onnxruntime-linux-x64-$(VERSION)/lib/* $(INSTALL_DIR)/lib
+
+clean:
+ rm -rf $(TARBALL)
+ rm -rf $(SOURCE_DIR)
+ rm -rf $(INSTALL_DIR)/lib
+ rm -rf build
diff --git a/3rdparty/voicevox/Makefile.open_jtalk_dic b/3rdparty/voicevox/Makefile.open_jtalk_dic
new file mode 100644
index 000000000..646921159
--- /dev/null
+++ b/3rdparty/voicevox/Makefile.open_jtalk_dic
@@ -0,0 +1,25 @@
+# -*- makefile -*-
+
+all: installed.open_jtalk_dic
+
+VERSION = 1.11.1
+FILENAME = open_jtalk_dic_utf_8-1.11.tar.gz
+TARBALL = build/$(FILENAME)
+TARBALL_URL = "https://github.com/r9y9/open_jtalk/releases/download/v$(VERSION)/$(FILENAME)"
+SOURCE_DIR = build/open_jtalk_dic_utf_8-1.11
+UNPACK_CMD = tar xvzf
+MD5SUM_DIR = $(CURDIR)/md5sum
+MD5SUM_FILE = $(MD5SUM_DIR)/open_jtalk_dic.tar.gz.md5sum
+include $(shell rospack find mk)/download_unpack_build.mk
+INSTALL_DIR = './'
+
+
+installed.open_jtalk_dic: $(SOURCE_DIR)/unpacked
+ mkdir -p $(INSTALL_DIR)/dict
+ cp -r build/open_jtalk_dic_utf_8-1.11 $(INSTALL_DIR)/dict
+
+clean:
+ rm -rf $(TARBALL)
+ rm -rf $(SOURCE_DIR)
+ rm -rf $(INSTALL_DIR)/dict/open_jtalk_dic_utf_8-1.11
+ rm -rf build
diff --git a/3rdparty/voicevox/README.md b/3rdparty/voicevox/README.md
new file mode 100644
index 000000000..d5602db71
--- /dev/null
+++ b/3rdparty/voicevox/README.md
@@ -0,0 +1,103 @@
+# voicevox
+
+ROS Interface for [VOICEVOX](https://voicevox.hiroshiba.jp/) (AI speech synthesis)
+
+## Terms of Use
+
+[VOICEVOX](https://voicevox.hiroshiba.jp/) is basically free to use, but please check its terms of use below.
+
+[Terms of Use](https://voicevox.hiroshiba.jp/term)
+
+Each voice synthesis character also has its own terms. Please use this package in accordance with them.
+
+| Character name | Terms of use |
+| ---- | ---- |
+| 四国めたん | https://zunko.jp/con_ongen_kiyaku.html |
+| ずんだもん | https://zunko.jp/con_ongen_kiyaku.html |
+| 春日部つむぎ | https://tsukushinyoki10.wixsite.com/ktsumugiofficial/利用規約 |
+| 波音リツ | http://canon-voice.com/kiyaku.html |
+| 雨晴はう | https://amehau.com/?page_id=225 |
+| 玄野武宏 | https://virvoxproject.wixsite.com/official/voicevoxの利用規約 |
+| 白上虎太郎 | https://virvoxproject.wixsite.com/official/voicevoxの利用規約 |
+| 青山龍星 | https://virvoxproject.wixsite.com/official/voicevoxの利用規約 |
+| 冥鳴ひまり | https://kotoran8zunzun.wixsite.com/my-site/利用規約 |
+| 九州そら | https://zunko.jp/con_ongen_kiyaku.html |
+
+## Installation
+
+Build this package. The first build downloads the VOICEVOX core and engine, onnxruntime, and the Open JTalk dictionary via the bundled Makefiles, so network access is required.
+
+```bash
+cd /path/to/catkin_workspace
+catkin build voicevox
+```
+
+## Usage
+
+### Launch sound_play with VOICEVOX Text-to-Speech
+
+```bash
+roslaunch voicevox voicevox_texttospeech.launch
+```
+
+
+### Say something
+
+#### For python users
+
+```python
+import rospy
+from sound_play.libsoundplay import SoundClient
+
+rospy.init_node('say_node')
+
+client = SoundClient(sound_action='robotsound_jp', sound_topic='robotsound_jp')
+
+client.say('こんにちは', voice='四国めたん-あまあま')
+```
+
+You can change the voice by passing a different voice name.
+You can also specify the speaker id directly, as in the example after the table below.
+Look at the following table for the mapping between speaker ids and voice names.
+
+| speaker_id | voice_name |
+| ---- | ---- |
+| 0 | 四国めたん-あまあま |
+| 1 | ずんだもん-あまあま |
+| 2 | 四国めたん-ノーマル |
+| 3 | ずんだもん-ノーマル |
+| 4 | 四国めたん-セクシー |
+| 5 | ずんだもん-セクシー |
+| 6 | 四国めたん-ツンツン |
+| 7 | ずんだもん-ツンツン |
+| 8 | 春日部つむぎ-ノーマル |
+| 9 | 波音リツ-ノーマル |
+| 10 | 雨晴はう-ノーマル |
+| 11 | 玄野武宏-ノーマル |
+| 12 | 白上虎太郎-ノーマル |
+| 13 | 青山龍星-ノーマル |
+| 14 | 冥鳴ひまり-ノーマル |
+| 15 | 九州そら-あまあま |
+| 16 | 九州そら-ノーマル |
+| 17 | 九州そら-セクシー |
+| 18 | 九州そら-ツンツン |
+| 19 | 九州そら-ささやき |
+
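+For example, a speaker id selects the same voice as its name in the table (a minimal sketch, reusing the `client` from the example above):
+
+```python
+client.say('こんにちは', voice='3')  # same as voice='ずんだもん-ノーマル'
+```
+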
+#### For roseus users
+
+```
+$ roseus
+(load "package://pr2eus/speak.l")
+
+(ros::roseus "say_node")
+
+(speak "JSKへようこそ。" :lang "波音リツ" :wait t :topic-name "robotsound_jp")
+```
+
+### Tips
+
+Normally, the speech synthesis server starts up at `http://localhost:50021`.
+You can change the host and port by setting the environment variables `VOICEVOX_TEXTTOSPEECH_URL` and `VOICEVOX_TEXTTOSPEECH_PORT`.
+
+You can also set the default character by setting `VOICEVOX_DEFAULT_SPEAKER_ID` to a speaker id (or a voice name).
+Please refer to the table in [Say something](#say-something) for the speaker ids.
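+
+For example, to talk to a VOICEVOX server on another machine and make ずんだもん-ノーマル (speaker id 3) the default voice, export the variables before launching (a sketch; the host shown is hypothetical):
+
+```bash
+export VOICEVOX_TEXTTOSPEECH_URL=192.168.0.10
+export VOICEVOX_TEXTTOSPEECH_PORT=50021
+export VOICEVOX_DEFAULT_SPEAKER_ID=3
+roslaunch voicevox voicevox_texttospeech.launch
+```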
diff --git a/3rdparty/voicevox/bin/text2wave b/3rdparty/voicevox/bin/text2wave
new file mode 100755
index 000000000..1e3fe7236
--- /dev/null
+++ b/3rdparty/voicevox/bin/text2wave
@@ -0,0 +1,126 @@
+#!/usr/bin/env python3
+# -*- coding:utf-8 -*-
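+"""festival text2wave compatible interface for VOICEVOX.
+
+sound_play's soundplay_node invokes this script in place of festival's
+text2wave, roughly as ``text2wave -eval '(VOICE_NAME)' input.txt -o out.wav``,
+and plays back the resulting wav file.
+"""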
+
+import argparse
+import os
+
+import requests
+
+
+speaker_id_to_name = {
+ '0': '四国めたん-あまあま',
+ '1': 'ずんだもん-あまあま',
+ '2': '四国めたん-ノーマル',
+ '3': 'ずんだもん-ノーマル',
+ '4': '四国めたん-セクシー',
+ '5': 'ずんだもん-セクシー',
+ '6': '四国めたん-ツンツン',
+ '7': 'ずんだもん-ツンツン',
+ '8': '春日部つむぎ-ノーマル',
+ '9': '波音リツ-ノーマル',
+ '10': '雨晴はう-ノーマル',
+ '11': '玄野武宏-ノーマル',
+ '12': '白上虎太郎-ノーマル',
+ '13': '青山龍星-ノーマル',
+ '14': '冥鳴ひまり-ノーマル',
+ '15': '九州そら-あまあま',
+ '16': '九州そら-ノーマル',
+ '17': '九州そら-セクシー',
+ '18': '九州そら-ツンツン',
+ '19': '九州そら-ささやき',
+}
+
+name_to_speaker_id = {
+ b: a for a, b in speaker_id_to_name.items()
+}
+
+
+DEFAULT_SPEAKER_ID = os.environ.get(
+ 'VOICEVOX_DEFAULT_SPEAKER_ID', '2')
+if not DEFAULT_SPEAKER_ID.isdigit():
+ DEFAULT_SPEAKER_ID = name_to_speaker_id[DEFAULT_SPEAKER_ID]
+VOICEVOX_TEXTTOSPEECH_URL = os.environ.get(
+ 'VOICEVOX_TEXTTOSPEECH_URL', 'localhost')
+VOICEVOX_TEXTTOSPEECH_PORT = os.environ.get(
+ 'VOICEVOX_TEXTTOSPEECH_PORT', 50021)
+
+
+def determine_voice_name(voice_name):
+ if len(voice_name) == 0:
+ speaker_id = DEFAULT_SPEAKER_ID
+ else:
+ if voice_name.isdigit():
+ if voice_name in speaker_id_to_name:
+ speaker_id = voice_name
+ else:
+ print(
+ '[Text2Wave] Invalid speaker_id ({}). Use default voice.'
+ .format(speaker_id_to_name[DEFAULT_SPEAKER_ID]))
+ speaker_id = DEFAULT_SPEAKER_ID
+ else:
+ candidates = list(filter(
+ lambda name: name.startswith(voice_name),
+ name_to_speaker_id))
+ if candidates:
+ speaker_id = name_to_speaker_id[candidates[0]]
+ else:
+ print('[Text2Wave] Invalid voice_name ({}). Use default voice.'
+ .format(speaker_id_to_name[DEFAULT_SPEAKER_ID]))
+ speaker_id = DEFAULT_SPEAKER_ID
+ print('[Text2Wave] Speak using voice_name ({})...'.format(
+ speaker_id_to_name[speaker_id]))
+ return speaker_id
+
+
+def convert_to_str(x):
+ if isinstance(x, str):
+ pass
+ elif isinstance(x, bytes):
+ x = x.decode('utf-8')
+ else:
+ raise ValueError(
+ 'Invalid input x type: {}'
+ .format(type(x)))
+ return x
+
+
+def request_synthesis(
+ sentence, output_path, speaker_id='1'):
+ headers = {'accept': 'application/json'}
+
+ sentence = convert_to_str(sentence)
+ speaker_id = convert_to_str(speaker_id)
+ params = {
+ 'speaker': speaker_id,
+ 'text': sentence,
+ }
+ base_url = 'http://{}:{}'.format(
+ VOICEVOX_TEXTTOSPEECH_URL,
+ VOICEVOX_TEXTTOSPEECH_PORT)
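+ # 1. /audio_query builds a full synthesis query (AudioQuery) from the text.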
+ url = '{}/audio_query'.format(base_url)
+ response = requests.post(url, headers=headers,
+ params=params)
+ data = response.json()
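+ # 2. /synthesis renders the returned query into wav data.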
+ url = '{}/synthesis'.format(base_url)
+ response = requests.post(url, headers=headers,
+ params=params,
+ json=data)
+ with open(output_path, 'wb') as f:
+ f.write(response.content)
+
+
+if __name__ == '__main__':
+ parser = argparse.ArgumentParser(
+ description='festival text2wave compatible interface for VOICEVOX')
+ parser.add_argument('-eval', '--evaluate', default='',
+ help="voice name or speaker id wrapped in parentheses, e.g. '(3)'")
+ parser.add_argument('-o', '--output', help='output wav file path')
+ parser.add_argument('text', help='file containing the text to speak')
+ args = parser.parse_args()
+
+ with open(args.text, 'rb') as f:
+ speech_text = f.readline()
+
+ speaker_id = determine_voice_name(
+ args.evaluate.lstrip('(').rstrip(')'))
+ request_synthesis(speech_text,
+ args.output,
+ speaker_id)
diff --git a/3rdparty/voicevox/launch/voicevox_texttospeech.launch b/3rdparty/voicevox/launch/voicevox_texttospeech.launch
new file mode 100644
index 000000000..15de7c551
--- /dev/null
+++ b/3rdparty/voicevox/launch/voicevox_texttospeech.launch
@@ -0,0 +1,25 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
diff --git a/3rdparty/voicevox/md5sum/core.zip.md5sum b/3rdparty/voicevox/md5sum/core.zip.md5sum
new file mode 100644
index 000000000..f5b5ac439
--- /dev/null
+++ b/3rdparty/voicevox/md5sum/core.zip.md5sum
@@ -0,0 +1 @@
+96149a074d8ee093039321a88e00076d core.zip
diff --git a/3rdparty/voicevox/md5sum/onnxruntime-linux-x64-1.10.0.tgz.md5sum b/3rdparty/voicevox/md5sum/onnxruntime-linux-x64-1.10.0.tgz.md5sum
new file mode 100644
index 000000000..817b68d89
--- /dev/null
+++ b/3rdparty/voicevox/md5sum/onnxruntime-linux-x64-1.10.0.tgz.md5sum
@@ -0,0 +1 @@
+9ca61e2009a16cf8a1e9ab9ad0655009 onnxruntime-linux-x64-1.10.0.tgz
diff --git a/3rdparty/voicevox/md5sum/open_jtalk_dic.tar.gz.md5sum b/3rdparty/voicevox/md5sum/open_jtalk_dic.tar.gz.md5sum
new file mode 100644
index 000000000..8ce4bb07b
--- /dev/null
+++ b/3rdparty/voicevox/md5sum/open_jtalk_dic.tar.gz.md5sum
@@ -0,0 +1 @@
+ba02dac4143492c3790f949be224dfdf open_jtalk_dic_utf_8-1.11.tar.gz
diff --git a/3rdparty/voicevox/md5sum/voicevox_engine.tar.gz.md5sum b/3rdparty/voicevox/md5sum/voicevox_engine.tar.gz.md5sum
new file mode 100644
index 000000000..5947e3633
--- /dev/null
+++ b/3rdparty/voicevox/md5sum/voicevox_engine.tar.gz.md5sum
@@ -0,0 +1 @@
+997bf9e915f7d6288c923ab1ff5f4ff6 0.11.4.tar.gz
diff --git a/3rdparty/voicevox/node_scripts/server.py b/3rdparty/voicevox/node_scripts/server.py
new file mode 100644
index 000000000..add596aff
--- /dev/null
+++ b/3rdparty/voicevox/node_scripts/server.py
@@ -0,0 +1,573 @@
+#!/usr/bin/env python3
+
+# This code was created based on the following link's code.
+# https://github.com/VOICEVOX/voicevox_engine/blob/0.11.4/run.py
+
+import base64
+from distutils.version import LooseVersion
+from functools import lru_cache
+import imp
+import json
+import multiprocessing
+import os
+import os.path as osp
+from pathlib import Path
+import sys
+from tempfile import NamedTemporaryFile
+from tempfile import TemporaryFile
+from typing import Dict
+from typing import List
+from typing import Optional
+import zipfile
+
+from fastapi import FastAPI
+from fastapi import HTTPException
+from fastapi.middleware.cors import CORSMiddleware
+from fastapi.params import Query
+from fastapi import Response
+import rospkg
+import rospy
+import soundfile
+from starlette.responses import FileResponse
+import uvicorn
+
+
+PKG_NAME = 'voicevox'
+abs_path = osp.dirname(osp.abspath(__file__))
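+# Load the vendored voicevox_engine package that Makefile.engine places
+# under node_scripts/.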
+voicevox_engine = imp.load_package(
+ 'voicevox_engine', osp.join(abs_path, 'voicevox_engine/voicevox_engine'))
+rospack = rospkg.RosPack()
+voicevox_dir = rospack.get_path(PKG_NAME)
+voicevox_lib_dir = osp.join(voicevox_dir, 'lib')
+# Point pyopenjtalk at the extracted Open JTalk dictionary.
+os.environ['OPEN_JTALK_DICT_DIR'] = osp.join(
+ voicevox_dir, 'dict', 'open_jtalk_dic_utf_8-1.11')
+
+
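+# NOTE: keep the voicevox_engine imports below this line so that pyopenjtalk
+# picks up OPEN_JTALK_DICT_DIR when it is loaded.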
+from voicevox_engine import __version__
+from voicevox_engine.kana_parser import create_kana
+from voicevox_engine.kana_parser import parse_kana
+from voicevox_engine.model import AccentPhrase
+from voicevox_engine.model import AudioQuery
+from voicevox_engine.model import ParseKanaBadRequest
+from voicevox_engine.model import ParseKanaError
+from voicevox_engine.model import Speaker
+from voicevox_engine.model import SpeakerInfo
+from voicevox_engine.model import SupportedDevicesInfo
+from voicevox_engine.morphing import \
+ synthesis_morphing_parameter as _synthesis_morphing_parameter
+from voicevox_engine.morphing import synthesis_morphing
+from voicevox_engine.preset import Preset
+from voicevox_engine.preset import PresetLoader
+from voicevox_engine.synthesis_engine import make_synthesis_engines
+from voicevox_engine.synthesis_engine import SynthesisEngineBase
+from voicevox_engine.user_dict import user_dict_startup_processing
+from voicevox_engine.utility import connect_base64_waves
+from voicevox_engine.utility import ConnectBase64WavesException
+from voicevox_engine.utility import engine_root
+
+
+def b64encode_str(s):
+ return base64.b64encode(s).decode("utf-8")
+
+
+def generate_app(
+ synthesis_engines: Dict[str, SynthesisEngineBase], latest_core_version: str
+) -> FastAPI:
+ root_dir = engine_root()
+
+ default_sampling_rate = synthesis_engines[latest_core_version].default_sampling_rate
+
+ app = FastAPI(
+ title="VOICEVOX ENGINE",
+ description="VOICEVOXの音声合成エンジンです。",
+ version=__version__,
+ )
+
+ app.add_middleware(
+ CORSMiddleware,
+ allow_origins=["*"],
+ allow_credentials=True,
+ allow_methods=["*"],
+ allow_headers=["*"],
+ )
+
+ preset_loader = PresetLoader(
+ preset_path=root_dir / "presets.yaml",
+ )
+
+ # Enable caching.
+ # Applying lru_cache inside the module would make the cache hard to
+ # control, so it is applied here on the HTTP server side instead.
+ # TODO: provide module-side and HTTP-side APIs for managing the cache.
+ synthesis_morphing_parameter = lru_cache(maxsize=4)(_synthesis_morphing_parameter)
+
+ # @app.on_event("startup")
+ # async def start_catch_disconnection():
+ # if args.enable_cancellable_synthesis:
+ # loop = asyncio.get_event_loop()
+ # _ = loop.create_task(cancellable_engine.catch_disconnection())
+
+ @app.on_event("startup")
+ def apply_user_dict():
+ user_dict_startup_processing()
+
+ def get_engine(core_version: Optional[str]) -> SynthesisEngineBase:
+ if core_version is None:
+ return synthesis_engines[latest_core_version]
+ if core_version in synthesis_engines:
+ return synthesis_engines[core_version]
+ raise HTTPException(status_code=422, detail="不明なバージョンです")
+
+ @app.post(
+ "/audio_query",
+ response_model=AudioQuery,
+ tags=["クエリ作成"],
+ summary="音声合成用のクエリを作成する",
+ )
+ def audio_query(text: str, speaker: int, core_version: Optional[str] = None):
+ """
+ クエリの初期値を得ます。ここで得られたクエリはそのまま音声合成に利用できます。各値の意味は`Schemas`を参照してください。
+ """
+ engine = get_engine(core_version)
+ accent_phrases = engine.create_accent_phrases(text, speaker_id=speaker)
+ return AudioQuery(
+ accent_phrases=accent_phrases,
+ speedScale=1,
+ pitchScale=0,
+ intonationScale=1,
+ volumeScale=1,
+ prePhonemeLength=0.1,
+ postPhonemeLength=0.1,
+ outputSamplingRate=default_sampling_rate,
+ outputStereo=False,
+ kana=create_kana(accent_phrases),
+ )
+
+ @app.post(
+ "/audio_query_from_preset",
+ response_model=AudioQuery,
+ tags=["クエリ作成"],
+ summary="音声合成用のクエリをプリセットを用いて作成する",
+ )
+ def audio_query_from_preset(
+ text: str, preset_id: int, core_version: Optional[str] = None
+ ):
+ """
+ クエリの初期値を得ます。ここで得られたクエリはそのまま音声合成に利用できます。各値の意味は`Schemas`を参照してください。
+ """
+ engine = get_engine(core_version)
+ presets, err_detail = preset_loader.load_presets()
+ if err_detail:
+ raise HTTPException(status_code=422, detail=err_detail)
+ for preset in presets:
+ if preset.id == preset_id:
+ selected_preset = preset
+ break
+ else:
+ raise HTTPException(status_code=422, detail="該当するプリセットIDが見つかりません")
+
+ accent_phrases = engine.create_accent_phrases(
+ text, speaker_id=selected_preset.style_id
+ )
+ return AudioQuery(
+ accent_phrases=accent_phrases,
+ speedScale=selected_preset.speedScale,
+ pitchScale=selected_preset.pitchScale,
+ intonationScale=selected_preset.intonationScale,
+ volumeScale=selected_preset.volumeScale,
+ prePhonemeLength=selected_preset.prePhonemeLength,
+ postPhonemeLength=selected_preset.postPhonemeLength,
+ outputSamplingRate=default_sampling_rate,
+ outputStereo=False,
+ kana=create_kana(accent_phrases),
+ )
+
+ @app.post(
+ "/accent_phrases",
+ response_model=List[AccentPhrase],
+ tags=["クエリ編集"],
+ summary="テキストからアクセント句を得る",
+ responses={
+ 400: {
+ "description": "読み仮名のパースに失敗",
+ "model": ParseKanaBadRequest,
+ }
+ },
+ )
+ def accent_phrases(
+ text: str,
+ speaker: int,
+ is_kana: bool = False,
+ core_version: Optional[str] = None,
+ ):
+ """
+ テキストからアクセント句を得ます。
+ is_kanaが`true`のとき、テキストは次のようなAquesTalkライクな記法に従う読み仮名として処理されます。デフォルトは`false`です。
+ * 全てのカナはカタカナで記述される
+ * アクセント句は`/`または`、`で区切る。`、`で区切った場合に限り無音区間が挿入される。
+ * カナの手前に`_`を入れるとそのカナは無声化される
+ * アクセント位置を`'`で指定する。全てのアクセント句にはアクセント位置を1つ指定する必要がある。
+ * アクセント句末に`?`(全角)を入れることにより疑問文の発音ができる。
+ """
+ engine = get_engine(core_version)
+ if is_kana:
+ try:
+ accent_phrases = parse_kana(text)
+ except ParseKanaError as err:
+ raise HTTPException(
+ status_code=400,
+ detail=ParseKanaBadRequest(err).dict(),
+ )
+ accent_phrases = engine.replace_mora_data(
+ accent_phrases=accent_phrases, speaker_id=speaker
+ )
+
+ return accent_phrases
+ else:
+ return engine.create_accent_phrases(text, speaker_id=speaker)
+
+ @app.post(
+ "/mora_data",
+ response_model=List[AccentPhrase],
+ tags=["クエリ編集"],
+ summary="アクセント句から音高・音素長を得る",
+ )
+ def mora_data(
+ accent_phrases: List[AccentPhrase],
+ speaker: int,
+ core_version: Optional[str] = None,
+ ):
+ engine = get_engine(core_version)
+ return engine.replace_mora_data(accent_phrases, speaker_id=speaker)
+
+ @app.post(
+ "/mora_length",
+ response_model=List[AccentPhrase],
+ tags=["クエリ編集"],
+ summary="アクセント句から音素長を得る",
+ )
+ def mora_length(
+ accent_phrases: List[AccentPhrase],
+ speaker: int,
+ core_version: Optional[str] = None,
+ ):
+ engine = get_engine(core_version)
+ return engine.replace_phoneme_length(
+ accent_phrases=accent_phrases, speaker_id=speaker
+ )
+
+ @app.post(
+ "/mora_pitch",
+ response_model=List[AccentPhrase],
+ tags=["クエリ編集"],
+ summary="アクセント句から音高を得る",
+ )
+ def mora_pitch(
+ accent_phrases: List[AccentPhrase],
+ speaker: int,
+ core_version: Optional[str] = None,
+ ):
+ engine = get_engine(core_version)
+ return engine.replace_mora_pitch(
+ accent_phrases=accent_phrases, speaker_id=speaker
+ )
+
+ @app.post(
+ "/synthesis",
+ response_class=FileResponse,
+ responses={
+ 200: {
+ "content": {
+ "audio/wav": {"schema": {"type": "string", "format": "binary"}}
+ },
+ }
+ },
+ tags=["音声合成"],
+ summary="音声合成する",
+ )
+ def synthesis(
+ query: AudioQuery,
+ speaker: int,
+ enable_interrogative_upspeak: bool = Query( # noqa: B008
+ default=True,
+ description="疑問系のテキストが与えられたら語尾を自動調整する",
+ ),
+ core_version: Optional[str] = None,
+ ):
+ engine = get_engine(core_version)
+ wave = engine.synthesis(
+ query=query,
+ speaker_id=speaker,
+ enable_interrogative_upspeak=enable_interrogative_upspeak,
+ )
+
+ with NamedTemporaryFile(delete=False) as f:
+ soundfile.write(
+ file=f, data=wave, samplerate=query.outputSamplingRate, format="WAV"
+ )
+
+ return FileResponse(f.name, media_type="audio/wav")
+
+ @app.post(
+ "/multi_synthesis",
+ response_class=FileResponse,
+ responses={
+ 200: {
+ "content": {
+ "application/zip": {
+ "schema": {"type": "string", "format": "binary"}
+ }
+ },
+ }
+ },
+ tags=["音声合成"],
+ summary="複数まとめて音声合成する",
+ )
+ def multi_synthesis(
+ queries: List[AudioQuery],
+ speaker: int,
+ core_version: Optional[str] = None,
+ ):
+ engine = get_engine(core_version)
+ sampling_rate = queries[0].outputSamplingRate
+
+ with NamedTemporaryFile(delete=False) as f:
+
+ with zipfile.ZipFile(f, mode="a") as zip_file:
+
+ for i in range(len(queries)):
+
+ if queries[i].outputSamplingRate != sampling_rate:
+ raise HTTPException(
+ status_code=422, detail="サンプリングレートが異なるクエリがあります"
+ )
+
+ with TemporaryFile() as wav_file:
+
+ wave = engine.synthesis(query=queries[i], speaker_id=speaker)
+ soundfile.write(
+ file=wav_file,
+ data=wave,
+ samplerate=sampling_rate,
+ format="WAV",
+ )
+ wav_file.seek(0)
+ zip_file.writestr(f"{str(i + 1).zfill(3)}.wav", wav_file.read())
+
+ return FileResponse(f.name, media_type="application/zip")
+
+ @app.post(
+ "/synthesis_morphing",
+ response_class=FileResponse,
+ responses={
+ 200: {
+ "content": {
+ "audio/wav": {"schema": {"type": "string", "format": "binary"}}
+ },
+ }
+ },
+ tags=["音声合成"],
+ summary="2人の話者でモーフィングした音声を合成する",
+ )
+ def _synthesis_morphing(
+ query: AudioQuery,
+ base_speaker: int,
+ target_speaker: int,
+ morph_rate: float = Query(..., ge=0.0, le=1.0), # noqa: B008
+ core_version: Optional[str] = None,
+ ):
+ """
+ 指定された2人の話者で音声を合成、指定した割合でモーフィングした音声を得ます。
+ モーフィングの割合は`morph_rate`で指定でき、0.0でベースの話者、1.0でターゲットの話者に近づきます。
+ """
+ engine = get_engine(core_version)
+
+ # The generated morphing parameters are cached.
+ morph_param = synthesis_morphing_parameter(
+ engine=engine,
+ query=query,
+ base_speaker=base_speaker,
+ target_speaker=target_speaker,
+ )
+
+ morph_wave = synthesis_morphing(
+ morph_param=morph_param,
+ morph_rate=morph_rate,
+ output_stereo=query.outputStereo,
+ )
+
+ with NamedTemporaryFile(delete=False) as f:
+ soundfile.write(
+ file=f,
+ data=morph_wave,
+ samplerate=morph_param.fs,
+ format="WAV",
+ )
+
+ return FileResponse(f.name, media_type="audio/wav")
+
+ @app.post(
+ "/connect_waves",
+ response_class=FileResponse,
+ responses={
+ 200: {
+ "content": {
+ "audio/wav": {"schema": {"type": "string", "format": "binary"}}
+ },
+ }
+ },
+ tags=["その他"],
+ summary="base64エンコードされた複数のwavデータを一つに結合する",
+ )
+ def connect_waves(waves: List[str]):
+ """
+ base64エンコードされたwavデータを一纏めにし、wavファイルで返します。
+ """
+ try:
+ waves_nparray, sampling_rate = connect_base64_waves(waves)
+ except ConnectBase64WavesException as err:
+ return HTTPException(status_code=422, detail=str(err))
+
+ with NamedTemporaryFile(delete=False) as f:
+ soundfile.write(
+ file=f,
+ data=waves_nparray,
+ samplerate=sampling_rate,
+ format="WAV",
+ )
+
+ return FileResponse(f.name, media_type="audio/wav")
+
+ @app.get("/presets", response_model=List[Preset], tags=["その他"])
+ def get_presets():
+ """
+ エンジンが保持しているプリセットの設定を返します
+
+ Returns
+ -------
+ presets: List[Preset]
+ プリセットのリスト
+ """
+ presets, err_detail = preset_loader.load_presets()
+ if err_detail:
+ raise HTTPException(status_code=422, detail=err_detail)
+ return presets
+
+ @app.get("/version", tags=["その他"])
+ def version() -> str:
+ return __version__
+
+ @app.get("/core_versions", response_model=List[str], tags=["その他"])
+ def core_versions() -> List[str]:
+ return Response(
+ content=json.dumps(list(synthesis_engines.keys())),
+ media_type="application/json",
+ )
+
+ @app.get("/speakers", response_model=List[Speaker], tags=["その他"])
+ def speakers(
+ core_version: Optional[str] = None,
+ ):
+ engine = get_engine(core_version)
+ return Response(
+ content=engine.speakers,
+ media_type="application/json",
+ )
+
+ @app.get("/speaker_info", response_model=SpeakerInfo, tags=["その他"])
+ def speaker_info(speaker_uuid: str, core_version: Optional[str] = None):
+ """
+ 指定されたspeaker_uuidに関する情報をjson形式で返します。
+ 画像や音声はbase64エンコードされたものが返されます。
+
+ Returns
+ -------
+ ret_data: SpeakerInfo
+ """
+ speakers = json.loads(get_engine(core_version).speakers)
+ for i in range(len(speakers)):
+ if speakers[i]["speaker_uuid"] == speaker_uuid:
+ speaker = speakers[i]
+ break
+ else:
+ raise HTTPException(status_code=404, detail="該当する話者が見つかりません")
+
+ try:
+ policy = (root_dir / f"speaker_info/{speaker_uuid}/policy.md").read_text(
+ "utf-8"
+ )
+ portrait = b64encode_str(
+ (root_dir / f"speaker_info/{speaker_uuid}/portrait.png").read_bytes()
+ )
+ style_infos = []
+ for style in speaker["styles"]:
+ id = style["id"]
+ icon = b64encode_str(
+ (
+ root_dir / f"speaker_info/{speaker_uuid}/icons/{id}.png"
+ ).read_bytes()
+ )
+ voice_samples = [
+ b64encode_str(
+ (
+ root_dir
+ / "speaker_info/{}/voice_samples/{}_{}.wav".format(
+ speaker_uuid, id, str(j + 1).zfill(3)
+ )
+ ).read_bytes()
+ )
+ for j in range(3)
+ ]
+ style_infos.append(
+ {"id": id, "icon": icon, "voice_samples": voice_samples}
+ )
+ except FileNotFoundError:
+ import traceback
+
+ traceback.print_exc()
+ raise HTTPException(status_code=500, detail="追加情報が見つかりませんでした")
+
+ ret_data = {"policy": policy, "portrait": portrait, "style_infos": style_infos}
+ return ret_data
+
+ @app.get("/supported_devices", response_model=SupportedDevicesInfo, tags=["その他"])
+ def supported_devices(
+ core_version: Optional[str] = None,
+ ):
+ supported_devices = get_engine(core_version).supported_devices
+ if supported_devices is None:
+ raise HTTPException(status_code=422, detail="非対応の機能です。")
+ return Response(
+ content=supported_devices,
+ media_type="application/json",
+ )
+
+ return app
+
+
+if __name__ == "__main__":
+ multiprocessing.freeze_support()
+ rospy.init_node('voicevox_server')
+
+ voicelib_dir = [Path(voicevox_lib_dir)]
+ use_gpu = False
+ host = rospy.get_param('~host', "127.0.0.1")
+ port = rospy.get_param('~port', 50021)
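+ # ~cpu_num_threads defaults to the number of available CPU cores.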
+ cpu_num_threads = rospy.get_param('~cpu_num_threads', None)
+ if cpu_num_threads is None:
+ cpu_num_threads = multiprocessing.cpu_count()
+
+ synthesis_engines = make_synthesis_engines(
+ use_gpu=use_gpu,
+ voicelib_dirs=voicelib_dir,
+ cpu_num_threads=cpu_num_threads,
+ )
+ if len(synthesis_engines) == 0:
+ rospy.logerr("No speech synthesis engine is available.")
+ sys.exit(1)
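+ # Requests without an explicit core_version fall back to the newest core.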
+ latest_core_version = str(max([LooseVersion(ver)
+ for ver in synthesis_engines]))
+
+ uvicorn.run(
+ generate_app(synthesis_engines, latest_core_version),
+ host=host,
+ port=port,
+ )
diff --git a/3rdparty/voicevox/package.xml b/3rdparty/voicevox/package.xml
new file mode 100644
index 000000000..5240c3468
--- /dev/null
+++ b/3rdparty/voicevox/package.xml
@@ -0,0 +1,36 @@
+<?xml version="1.0"?>
+<package format="3">
+  <name>voicevox</name>
+  <version>0.0.1</version>
+  <description>VOICEVOX: AI speech synthesis</description>
+  <maintainer>Iori Yanokura</maintainer>
+
+  <license>MIT</license>
+
+  <url type="website">http://ros.org/wiki/voicevox</url>
+
+  <author>Iori Yanokura</author>
+
+  <buildtool_depend>catkin</buildtool_depend>
+  <build_depend>catkin_virtualenv</build_depend>
+
+  <build_depend>mk</build_depend>
+  <build_depend>roslib</build_depend>
+  <build_depend>rospack</build_depend>
+  <build_depend>unzip</build_depend>
+  <build_depend>wget</build_depend>
+
+  <exec_depend>python3</exec_depend>
+  <exec_depend>python3-requests</exec_depend>
+  <exec_depend>sound_play</exec_depend>
+  <exec_depend>unzip</exec_depend>
+  <exec_depend>wget</exec_depend>
+
+  <export>
+    <pip_requirements>requirements.txt</pip_requirements>
+  </export>
+</package>
diff --git a/3rdparty/voicevox/requirements.in b/3rdparty/voicevox/requirements.in
new file mode 100644
index 000000000..c9cfd223a
--- /dev/null
+++ b/3rdparty/voicevox/requirements.in
@@ -0,0 +1,11 @@
+PyYAML
+aiofiles
+appdirs
+fastapi
+git+https://github.com/VOICEVOX/pyopenjtalk@a85521a0a0f298f08d9e9b24987b3c77eb4aaff5#egg=pyopenjtalk
+numpy
+python-multipart
+pyworld
+scipy
+soundfile
+uvicorn