diff --git a/3rdparty/voicevox/.gitignore b/3rdparty/voicevox/.gitignore new file mode 100644 index 000000000..8cb8e60a3 --- /dev/null +++ b/3rdparty/voicevox/.gitignore @@ -0,0 +1,6 @@ +build +dict +lib +node_scripts/voicevox_engine +requirements.txt +!.gitignore diff --git a/3rdparty/voicevox/CMakeLists.txt b/3rdparty/voicevox/CMakeLists.txt new file mode 100644 index 000000000..1e7297b5d --- /dev/null +++ b/3rdparty/voicevox/CMakeLists.txt @@ -0,0 +1,71 @@ +cmake_minimum_required(VERSION 2.8.3) +project(voicevox) + +find_package(catkin REQUIRED + COMPONENTS + catkin_virtualenv +) + +set(INSTALL_DIR ${PROJECT_SOURCE_DIR}) + +catkin_package() + +catkin_generate_virtualenv( + INPUT_REQUIREMENTS requirements.in + PYTHON_INTERPRETER python3 + USE_SYSTEM_PACKAGES FALSE +) + +add_custom_command( + OUTPUT voicevox_model_installed + COMMAND make -f ${PROJECT_SOURCE_DIR}/Makefile.model + MD5SUM_DIR=${PROJECT_SOURCE_DIR}/md5sum + INSTALL_DIR=${INSTALL_DIR} +) + + +add_custom_command( + OUTPUT voicevox_core_installed + COMMAND make -f ${PROJECT_SOURCE_DIR}/Makefile.core + MD5SUM_DIR=${PROJECT_SOURCE_DIR}/md5sum + INSTALL_DIR=${INSTALL_DIR} +) + +add_custom_command( + OUTPUT voicevox_engine_installed + COMMAND make -f ${PROJECT_SOURCE_DIR}/Makefile.engine + MD5SUM_DIR=${PROJECT_SOURCE_DIR}/md5sum + INSTALL_DIR=${INSTALL_DIR} +) + +add_custom_command( + OUTPUT open_jtalk_dic_installed + COMMAND make -f ${PROJECT_SOURCE_DIR}/Makefile.open_jtalk_dic + MD5SUM_DIR=${PROJECT_SOURCE_DIR}/md5sum + INSTALL_DIR=${INSTALL_DIR} +) + +add_custom_target(all_installed ALL DEPENDS + voicevox_model_installed + voicevox_core_installed + voicevox_engine_installed + open_jtalk_dic_installed) + +file(GLOB NODE_SCRIPTS_FILES node_scripts/*.py) +catkin_install_python( + PROGRAMS ${NODE_SCRIPTS_FILES} + DESTINATION ${CATKIN_PACKAGE_BIN_DESTINATION}/node_scripts/ +) +install(DIRECTORY node_scripts/voicevox_engine + DESTINATION ${CATKIN_PACKAGE_SHARE_DESTINATION}/catkin_virtualenv_scripts/ + 
USE_SOURCE_PERMISSIONS) +install(DIRECTORY launch dict + DESTINATION ${CATKIN_PACKAGE_SHARE_DESTINATION} + USE_SOURCE_PERMISSIONS) +install(PROGRAMS bin/text2wave + DESTINATION ${CATKIN_PACKAGE_SHARE_DESTINATION}/bin) + +install(DIRECTORY + ${INSTALL_DIR}/lib + DESTINATION ${CATKIN_PACKAGE_SHARE_DESTINATION} + USE_SOURCE_PERMISSIONS) diff --git a/3rdparty/voicevox/Makefile b/3rdparty/voicevox/Makefile new file mode 100644 index 000000000..a2c90f3bb --- /dev/null +++ b/3rdparty/voicevox/Makefile @@ -0,0 +1,11 @@ +all: + make -f Makefile.core + make -f Makefile.model + make -f Makefile.engine + make -f Makefile.open_jtalk_dic +clean: + make -f Makefile.core clean + make -f Makefile.model clean + make -f Makefile.engine clean + make -f Makefile.open_jtalk_dic clean + rm -rf build diff --git a/3rdparty/voicevox/Makefile.core b/3rdparty/voicevox/Makefile.core new file mode 100644 index 000000000..bac21eb0f --- /dev/null +++ b/3rdparty/voicevox/Makefile.core @@ -0,0 +1,28 @@ +# -*- makefile -*- + +all: installed.voicevox_core + +VERSION = 0.11.4 +FILENAME = core.zip +TARBALL = build/$(FILENAME) +TARBALL_URL = "https://github.com/VOICEVOX/voicevox_core/releases/download/$(VERSION)/core.zip" +SOURCE_DIR = build/core +UNPACK_CMD = unzip +MD5SUM_DIR = $(CURDIR)/md5sum +MD5SUM_FILE = $(MD5SUM_DIR)/$(FILENAME).md5sum +SCRIPT_DIR = $( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd ) +include $(shell rospack find mk)/download_unpack_build.mk +INSTALL_DIR = './' + + +installed.voicevox_core: $(SOURCE_DIR)/unpacked + mkdir -p $(INSTALL_DIR)/lib + cp build/core/lib*.so $(INSTALL_DIR)/lib/ + cp build/core/*.bin $(INSTALL_DIR)/lib/ + cp build/core/metas.json $(INSTALL_DIR)/lib/metas.json + +clean: + rm -rf $(TARBALL) + rm -rf $(SOURCE_DIR) + rm -rf $(INSTALL_DIR)/lib + rm -rf build diff --git a/3rdparty/voicevox/Makefile.engine b/3rdparty/voicevox/Makefile.engine new file mode 100644 index 000000000..b3d6899fa --- /dev/null +++ b/3rdparty/voicevox/Makefile.engine @@ 
-0,0 +1,24 @@ +# -*- makefile -*- + +all: installed.voicevox_engine + +VERSION = 0.11.4 +FILENAME = $(VERSION).tar.gz +TARBALL = build/$(FILENAME) +TARBALL_URL = "https://github.com/VOICEVOX/voicevox_engine/archive/refs/tags/$(FILENAME)" +SOURCE_DIR = build/voicevox_engine-$(VERSION) +UNPACK_CMD = tar xvzf +MD5SUM_DIR = $(CURDIR)/md5sum +MD5SUM_FILE = $(MD5SUM_DIR)/voicevox_engine.tar.gz.md5sum +include $(shell rospack find mk)/download_unpack_build.mk +INSTALL_DIR = './' + + +installed.voicevox_engine: $(SOURCE_DIR)/unpacked + cp -r build/voicevox_engine-$(VERSION) $(INSTALL_DIR)/node_scripts/voicevox_engine + +clean: + rm -rf $(TARBALL) + rm -rf $(SOURCE_DIR) + rm -rf $(INSTALL_DIR)/node_scripts/voicevox_engine + rm -rf build diff --git a/3rdparty/voicevox/Makefile.model b/3rdparty/voicevox/Makefile.model new file mode 100644 index 000000000..004028105 --- /dev/null +++ b/3rdparty/voicevox/Makefile.model @@ -0,0 +1,26 @@ +# -*- makefile -*- + +all: installed.voicevox_model + +VERSION = 1.10.0 +FILENAME = onnxruntime-linux-x64-$(VERSION).tgz +TARBALL = build/$(FILENAME) +TARBALL_URL = "https://github.com/microsoft/onnxruntime/releases/download/v$(VERSION)/$(FILENAME)" +SOURCE_DIR = build/onnxruntime-linux-x64-$(VERSION) +UNPACK_CMD = tar xvzf +MD5SUM_DIR = $(CURDIR)/md5sum +MD5SUM_FILE = $(MD5SUM_DIR)/$(FILENAME).md5sum +SCRIPT_DIR = $( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd ) +include $(shell rospack find mk)/download_unpack_build.mk +INSTALL_DIR = './' + + +installed.voicevox_model: $(SOURCE_DIR)/unpacked + mkdir -p $(INSTALL_DIR)/lib + cp build/onnxruntime-linux-x64-$(VERSION)/lib/* $(INSTALL_DIR)/lib + +clean: + rm -rf $(TARBALL) + rm -rf $(SOURCE_DIR) + rm -rf $(INSTALL_DIR)/lib + rm -rf build diff --git a/3rdparty/voicevox/Makefile.open_jtalk_dic b/3rdparty/voicevox/Makefile.open_jtalk_dic new file mode 100644 index 000000000..646921159 --- /dev/null +++ b/3rdparty/voicevox/Makefile.open_jtalk_dic @@ -0,0 +1,25 @@ +# -*- makefile -*- 
+ +all: installed.open_jtalk_dic + +VERSION = 1.11.1 +FILENAME = open_jtalk_dic_utf_8-1.11.tar.gz +TARBALL = build/$(FILENAME) +TARBALL_URL = "https://github.com/r9y9/open_jtalk/releases/download/v$(VERSION)/$(FILENAME)" +SOURCE_DIR = build/open_jtalk_dic_utf_8-1.11 +UNPACK_CMD = tar xvzf +MD5SUM_DIR = $(CURDIR)/md5sum +MD5SUM_FILE = $(MD5SUM_DIR)/open_jtalk_dic.tar.gz.md5sum +include $(shell rospack find mk)/download_unpack_build.mk +INSTALL_DIR = './' + + +installed.open_jtalk_dic: $(SOURCE_DIR)/unpacked + mkdir -p $(INSTALL_DIR)/dict + cp -r build/open_jtalk_dic_utf_8-1.11 $(INSTALL_DIR)/dict + +clean: + rm -rf $(TARBALL) + rm -rf $(SOURCE_DIR) + rm -rf $(INSTALL_DIR)/dict/open_jtalk_dic_utf_8-1.11 + rm -rf build diff --git a/3rdparty/voicevox/README.md b/3rdparty/voicevox/README.md new file mode 100644 index 000000000..d5602db71 --- /dev/null +++ b/3rdparty/voicevox/README.md @@ -0,0 +1,103 @@ +# voicevox + +ROS Interface for [VOICEVOX](https://voicevox.hiroshiba.jp/) (AI speech synthesis) + +## TERM + +[VOICEVOX](https://voicevox.hiroshiba.jp/) is basically free to use, but please check the terms of use below. + +[TERM](https://voicevox.hiroshiba.jp/term) + +Each voice synthesis character has its own rules. Please use this package according to those terms. + +| Character name | term link | +| ---- | ---- | +| 四国めたん | https://zunko.jp/con_ongen_kiyaku.html | +| ずんだもん | https://zunko.jp/con_ongen_kiyaku.html | +| 春日部つむぎ | https://tsukushinyoki10.wixsite.com/ktsumugiofficial/利用規約 | +| 波音リツ | http://canon-voice.com/kiyaku.html | +| 雨晴はう | https://amehau.com/?page_id=225 | +| 玄野武宏 | https://virvoxproject.wixsite.com/official/voicevoxの利用規約 | +| 白上虎太郎 | https://virvoxproject.wixsite.com/official/voicevoxの利用規約 | +| 青山龍星 | https://virvoxproject.wixsite.com/official/voicevoxの利用規約 | +| 冥鳴ひまり | https://kotoran8zunzun.wixsite.com/my-site/利用規約 | +| 九州そら | https://zunko.jp/con_ongen_kiyaku.html | + +## Installation + +Build this package. 
+ +```bash +cd /path/to/catkin_workspace +catkin build voicevox +``` + +## Usage + +### Launch sound_play with VOICEVOX Text-to-Speech + +```bash +roslaunch voicevox voicevox_texttospeech.launch +``` + + +### Say something + +#### For python users + +```python +import rospy +from sound_play.libsoundplay import SoundClient + +rospy.init_node('say_node') + +client = SoundClient(sound_action='robotsound_jp', sound_topic='robotsound_jp') + +client.say('こんにちは', voice='四国めたん-あまあま') +``` + +You can change the voice by changing the voice_name. +You can also specify the speaker id. +Look at the following tables for further details. + +| speaker_id | voice_name | +| ---- | ---- | +| 0 | 四国めたん-あまあま | +| 1 | ずんだもん-あまあま | +| 2 | 四国めたん-ノーマル | +| 3 | ずんだもん-ノーマル | +| 4 | 四国めたん-セクシー | +| 5 | ずんだもん-セクシー | +| 6 | 四国めたん-ツンツン | +| 7 | ずんだもん-ツンツン | +| 8 | 春日部つむぎ-ノーマル | +| 9 | 波音リツ-ノーマル | +| 10 | 雨晴はう-ノーマル | +| 11 | 玄野武宏-ノーマル | +| 12 | 白上虎太郎-ノーマル | +| 13 | 青山龍星-ノーマル | +| 14 | 冥鳴ひまり-ノーマル | +| 15 | 九州そら-あまあま | +| 16 | 九州そら-ノーマル | +| 17 | 九州そら-セクシー | +| 18 | 九州そら-ツンツン | +| 19 | 九州そら-ささやき | + +#### For roseus users + +``` +$ roseus +(load "package://pr2eus/speak.l") + +(ros::roseus "say_node") + +(speak "JSKへようこそ。" :lang "波音リツ" :wait t :topic-name "robotsound_jp") +``` + +### Tips + +Normally, the server for speech synthesis starts up at `http://localhost:50021`. +You can change the url and port by setting values for `VOICEVOX_TEXTTOSPEECH_URL` and `VOICEVOX_TEXTTOSPEECH_PORT`. + +You can also set the default character by setting `VOICEVOX_DEFAULT_SPEAKER_ID`. +Please refer to [here](#saysomething) for the speaker id. 
diff --git a/3rdparty/voicevox/bin/text2wave b/3rdparty/voicevox/bin/text2wave new file mode 100755 index 000000000..1e3fe7236 --- /dev/null +++ b/3rdparty/voicevox/bin/text2wave @@ -0,0 +1,126 @@ +#!/usr/bin/env python3 +# -*- coding:utf-8 -*- + +import argparse +import os + +import requests + + +speaker_id_to_name = { + '0': '四国めたん-あまあま', + '1': 'ずんだもん-あまあま', + '2': '四国めたん-ノーマル', + '3': 'ずんだもん-ノーマル', + '4': '四国めたん-セクシー', + '5': 'ずんだもん-セクシー', + '6': '四国めたん-ツンツン', + '7': 'ずんだもん-ツンツン', + '8': '春日部つむぎ-ノーマル', + '9': '波音リツ-ノーマル', + '10': '雨晴はう-ノーマル', + '11': '玄野武宏-ノーマル', + '12': '白上虎太郎-ノーマル', + '13': '青山龍星-ノーマル', + '14': '冥鳴ひまり-ノーマル', + '15': '九州そら-あまあま', + '16': '九州そら-ノーマル', + '17': '九州そら-セクシー', + '18': '九州そら-ツンツン', + '19': '九州そら-ささやき', +} + +name_to_speaker_id = { + b: a for a, b in speaker_id_to_name.items() +} + + +DEFAULT_SPEAKER_ID = os.environ.get( + 'VOICEVOX_DEFAULT_SPEAKER_ID', '2') +if not DEFAULT_SPEAKER_ID.isdigit(): + DEFAULT_SPEAKER_ID = name_to_speaker_id[DEFAULT_SPEAKER_ID] +VOICEVOX_TEXTTOSPEECH_URL = os.environ.get( + 'VOICEVOX_TEXTTOSPEECH_URL', 'localhost') +VOICEVOX_TEXTTOSPEECH_PORT = os.environ.get( + 'VOICEVOX_TEXTTOSPEECH_PORT', 50021) + + +def determine_voice_name(voice_name): + if len(voice_name) == 0: + speaker_id = DEFAULT_SPEAKER_ID + else: + if voice_name.isdigit(): + if voice_name in speaker_id_to_name: + speaker_id = voice_name + else: + print( + '[Text2Wave] Invalid speaker_id ({}). Use default voice.' + .format(speaker_id_to_name[DEFAULT_SPEAKER_ID])) + speaker_id = DEFAULT_SPEAKER_ID + else: + candidates = list(filter( + lambda name: name.startswith(voice_name), + name_to_speaker_id)) + if candidates: + speaker_id = name_to_speaker_id[candidates[0]] + else: + print('[Text2Wave] Invalid voice_name ({}). Use default voice.' 
+ .format(speaker_id_to_name[DEFAULT_SPEAKER_ID])) + speaker_id = DEFAULT_SPEAKER_ID + print('[Text2Wave] Speak using voice_name ({})..'.format( + speaker_id_to_name[speaker_id])) + return speaker_id + + +def convert_to_str(x): + if isinstance(x, str): + pass + elif isinstance(x, bytes): + x = x.decode('utf-8') + else: + raise ValueError( + 'Invalid input x type: {}' + .format(type(x))) + return x + + +def request_synthesis( + sentence, output_path, speaker_id='1'): + headers = {'accept': 'application/json'} + + sentence = convert_to_str(sentence) + speaker_id = convert_to_str(speaker_id) + params = { + 'speaker': speaker_id, + 'text': sentence, + } + base_url = 'http://{}:{}'.format( + VOICEVOX_TEXTTOSPEECH_URL, + VOICEVOX_TEXTTOSPEECH_PORT) + url = '{}/audio_query'.format(base_url) + response = requests.post(url, headers=headers, + params=params) + data = response.json() + url = '{}/synthesis'.format(base_url) + response = requests.post(url, headers=headers, + params=params, + json=data) + with open(output_path, 'wb') as f: + f.write(response.content) + + +if __name__ == '__main__': + parser = argparse.ArgumentParser(description='') + parser.add_argument('-eval', '--evaluate') + parser.add_argument('-o', '--output') + parser.add_argument('text') + args = parser.parse_args() + + with open(args.text, 'rb') as f: + speech_text = f.readline() + + speaker_id = determine_voice_name( + args.evaluate.lstrip('(').rstrip(')')) + request_synthesis(speech_text, + args.output, + speaker_id) diff --git a/3rdparty/voicevox/launch/voicevox_texttospeech.launch b/3rdparty/voicevox/launch/voicevox_texttospeech.launch new file mode 100644 index 000000000..15de7c551 --- /dev/null +++ b/3rdparty/voicevox/launch/voicevox_texttospeech.launch @@ -0,0 +1,25 @@ + + + + + + + + + + + + + + + + + diff --git a/3rdparty/voicevox/md5sum/core.zip.md5sum b/3rdparty/voicevox/md5sum/core.zip.md5sum new file mode 100644 index 000000000..f5b5ac439 --- /dev/null +++ 
b/3rdparty/voicevox/md5sum/core.zip.md5sum @@ -0,0 +1 @@ +96149a074d8ee093039321a88e00076d core.zip diff --git a/3rdparty/voicevox/md5sum/onnxruntime-linux-x64-1.10.0.tgz.md5sum b/3rdparty/voicevox/md5sum/onnxruntime-linux-x64-1.10.0.tgz.md5sum new file mode 100644 index 000000000..817b68d89 --- /dev/null +++ b/3rdparty/voicevox/md5sum/onnxruntime-linux-x64-1.10.0.tgz.md5sum @@ -0,0 +1 @@ +9ca61e2009a16cf8a1e9ab9ad0655009 onnxruntime-linux-x64-1.10.0.tgz diff --git a/3rdparty/voicevox/md5sum/open_jtalk_dic.tar.gz.md5sum b/3rdparty/voicevox/md5sum/open_jtalk_dic.tar.gz.md5sum new file mode 100644 index 000000000..8ce4bb07b --- /dev/null +++ b/3rdparty/voicevox/md5sum/open_jtalk_dic.tar.gz.md5sum @@ -0,0 +1 @@ +ba02dac4143492c3790f949be224dfdf open_jtalk_dic_utf_8-1.11.tar.gz diff --git a/3rdparty/voicevox/md5sum/voicevox_engine.tar.gz.md5sum b/3rdparty/voicevox/md5sum/voicevox_engine.tar.gz.md5sum new file mode 100644 index 000000000..5947e3633 --- /dev/null +++ b/3rdparty/voicevox/md5sum/voicevox_engine.tar.gz.md5sum @@ -0,0 +1 @@ +997bf9e915f7d6288c923ab1ff5f4ff6 0.11.4.tar.gz diff --git a/3rdparty/voicevox/node_scripts/server.py b/3rdparty/voicevox/node_scripts/server.py new file mode 100644 index 000000000..add596aff --- /dev/null +++ b/3rdparty/voicevox/node_scripts/server.py @@ -0,0 +1,573 @@ +#!/usr/bin/env python3 + +# This code was created based on the following link's code. 
+# https://github.com/VOICEVOX/voicevox_engine/blob/0.11.4/run.py + +import base64 +from distutils.version import LooseVersion +from functools import lru_cache +import imp +import json +import multiprocessing +import os +import os.path as osp +from pathlib import Path +from tempfile import NamedTemporaryFile +from tempfile import TemporaryFile +from typing import Dict +from typing import List +from typing import Optional +import zipfile + +from fastapi import FastAPI +from fastapi import HTTPException +from fastapi.middleware.cors import CORSMiddleware +from fastapi.params import Query +from fastapi import Response +import rospkg +import rospy +import soundfile +from starlette.responses import FileResponse +import uvicorn + + +PKG_NAME = 'voicevox' +abs_path = osp.dirname(osp.abspath(__file__)) +voicevox_engine = imp.load_package( + 'voicevox_engine', osp.join(abs_path, 'voicevox_engine/voicevox_engine')) +rospack = rospkg.RosPack() +voicevox_dir = rospack.get_path(PKG_NAME) +voicevox_lib_dir = osp.join(voicevox_dir, 'lib') +# set pyopenjtalk's dic.tar.gz file +os.environ['OPEN_JTALK_DICT_DIR'] = osp.join( + voicevox_dir, 'dict', 'open_jtalk_dic_utf_8-1.11') + + +from voicevox_engine import __version__ +from voicevox_engine.kana_parser import create_kana +from voicevox_engine.kana_parser import parse_kana +from voicevox_engine.model import AccentPhrase +from voicevox_engine.model import AudioQuery +from voicevox_engine.model import ParseKanaBadRequest +from voicevox_engine.model import ParseKanaError +from voicevox_engine.model import Speaker +from voicevox_engine.model import SpeakerInfo +from voicevox_engine.model import SupportedDevicesInfo +from voicevox_engine.morphing import \ + synthesis_morphing_parameter as _synthesis_morphing_parameter +from voicevox_engine.morphing import synthesis_morphing +from voicevox_engine.preset import Preset +from voicevox_engine.preset import PresetLoader +from voicevox_engine.synthesis_engine import make_synthesis_engines +from 
voicevox_engine.synthesis_engine import SynthesisEngineBase +from voicevox_engine.user_dict import user_dict_startup_processing +from voicevox_engine.utility import connect_base64_waves +from voicevox_engine.utility import ConnectBase64WavesException +from voicevox_engine.utility import engine_root + + +def b64encode_str(s): + return base64.b64encode(s).decode("utf-8") + + +def generate_app( + synthesis_engines: Dict[str, SynthesisEngineBase], latest_core_version: str +) -> FastAPI: + root_dir = engine_root() + + default_sampling_rate = synthesis_engines[latest_core_version].default_sampling_rate + + app = FastAPI( + title="VOICEVOX ENGINE", + description="VOICEVOXの音声合成エンジンです。", + version=__version__, + ) + + app.add_middleware( + CORSMiddleware, + allow_origins=["*"], + allow_credentials=True, + allow_methods=["*"], + allow_headers=["*"], + ) + + preset_loader = PresetLoader( + preset_path=root_dir / "presets.yaml", + ) + + # キャッシュを有効化 + # モジュール側でlru_cacheを指定するとキャッシュを制御しにくいため、HTTPサーバ側で指定する + # TODO: キャッシュを管理するモジュール側API・HTTP側APIを用意する + synthesis_morphing_parameter = lru_cache(maxsize=4)(_synthesis_morphing_parameter) + + # @app.on_event("startup") + # async def start_catch_disconnection(): + # if args.enable_cancellable_synthesis: + # loop = asyncio.get_event_loop() + # _ = loop.create_task(cancellable_engine.catch_disconnection()) + + @app.on_event("startup") + def apply_user_dict(): + user_dict_startup_processing() + + def get_engine(core_version: Optional[str]) -> SynthesisEngineBase: + if core_version is None: + return synthesis_engines[latest_core_version] + if core_version in synthesis_engines: + return synthesis_engines[core_version] + raise HTTPException(status_code=422, detail="不明なバージョンです") + + @app.post( + "/audio_query", + response_model=AudioQuery, + tags=["クエリ作成"], + summary="音声合成用のクエリを作成する", + ) + def audio_query(text: str, speaker: int, core_version: Optional[str] = None): + """ + クエリの初期値を得ます。ここで得られたクエリはそのまま音声合成に利用できます。各値の意味は`Schemas`を参照してください。 + """ 
+ engine = get_engine(core_version) + accent_phrases = engine.create_accent_phrases(text, speaker_id=speaker) + return AudioQuery( + accent_phrases=accent_phrases, + speedScale=1, + pitchScale=0, + intonationScale=1, + volumeScale=1, + prePhonemeLength=0.1, + postPhonemeLength=0.1, + outputSamplingRate=default_sampling_rate, + outputStereo=False, + kana=create_kana(accent_phrases), + ) + + @app.post( + "/audio_query_from_preset", + response_model=AudioQuery, + tags=["クエリ作成"], + summary="音声合成用のクエリをプリセットを用いて作成する", + ) + def audio_query_from_preset( + text: str, preset_id: int, core_version: Optional[str] = None + ): + """ + クエリの初期値を得ます。ここで得られたクエリはそのまま音声合成に利用できます。各値の意味は`Schemas`を参照してください。 + """ + engine = get_engine(core_version) + presets, err_detail = preset_loader.load_presets() + if err_detail: + raise HTTPException(status_code=422, detail=err_detail) + for preset in presets: + if preset.id == preset_id: + selected_preset = preset + break + else: + raise HTTPException(status_code=422, detail="該当するプリセットIDが見つかりません") + + accent_phrases = engine.create_accent_phrases( + text, speaker_id=selected_preset.style_id + ) + return AudioQuery( + accent_phrases=accent_phrases, + speedScale=selected_preset.speedScale, + pitchScale=selected_preset.pitchScale, + intonationScale=selected_preset.intonationScale, + volumeScale=selected_preset.volumeScale, + prePhonemeLength=selected_preset.prePhonemeLength, + postPhonemeLength=selected_preset.postPhonemeLength, + outputSamplingRate=default_sampling_rate, + outputStereo=False, + kana=create_kana(accent_phrases), + ) + + @app.post( + "/accent_phrases", + response_model=List[AccentPhrase], + tags=["クエリ編集"], + summary="テキストからアクセント句を得る", + responses={ + 400: { + "description": "読み仮名のパースに失敗", + "model": ParseKanaBadRequest, + } + }, + ) + def accent_phrases( + text: str, + speaker: int, + is_kana: bool = False, + core_version: Optional[str] = None, + ): + """ + テキストからアクセント句を得ます。 + 
is_kanaが`true`のとき、テキストは次のようなAquesTalkライクな記法に従う読み仮名として処理されます。デフォルトは`false`です。 + * 全てのカナはカタカナで記述される + * アクセント句は`/`または`、`で区切る。`、`で区切った場合に限り無音区間が挿入される。 + * カナの手前に`_`を入れるとそのカナは無声化される + * アクセント位置を`'`で指定する。全てのアクセント句にはアクセント位置を1つ指定する必要がある。 + * アクセント句末に`?`(全角)を入れることにより疑問文の発音ができる。 + """ + engine = get_engine(core_version) + if is_kana: + try: + accent_phrases = parse_kana(text) + except ParseKanaError as err: + raise HTTPException( + status_code=400, + detail=ParseKanaBadRequest(err).dict(), + ) + accent_phrases = engine.replace_mora_data( + accent_phrases=accent_phrases, speaker_id=speaker + ) + + return accent_phrases + else: + return engine.create_accent_phrases(text, speaker_id=speaker) + + @app.post( + "/mora_data", + response_model=List[AccentPhrase], + tags=["クエリ編集"], + summary="アクセント句から音高・音素長を得る", + ) + def mora_data( + accent_phrases: List[AccentPhrase], + speaker: int, + core_version: Optional[str] = None, + ): + engine = get_engine(core_version) + return engine.replace_mora_data(accent_phrases, speaker_id=speaker) + + @app.post( + "/mora_length", + response_model=List[AccentPhrase], + tags=["クエリ編集"], + summary="アクセント句から音素長を得る", + ) + def mora_length( + accent_phrases: List[AccentPhrase], + speaker: int, + core_version: Optional[str] = None, + ): + engine = get_engine(core_version) + return engine.replace_phoneme_length( + accent_phrases=accent_phrases, speaker_id=speaker + ) + + @app.post( + "/mora_pitch", + response_model=List[AccentPhrase], + tags=["クエリ編集"], + summary="アクセント句から音高を得る", + ) + def mora_pitch( + accent_phrases: List[AccentPhrase], + speaker: int, + core_version: Optional[str] = None, + ): + engine = get_engine(core_version) + return engine.replace_mora_pitch( + accent_phrases=accent_phrases, speaker_id=speaker + ) + + @app.post( + "/synthesis", + response_class=FileResponse, + responses={ + 200: { + "content": { + "audio/wav": {"schema": {"type": "string", "format": "binary"}} + }, + } + }, + tags=["音声合成"], + summary="音声合成する", + ) + def synthesis( + 
query: AudioQuery, + speaker: int, + enable_interrogative_upspeak: bool = Query( # noqa: B008 + default=True, + description="疑問系のテキストが与えられたら語尾を自動調整する", + ), + core_version: Optional[str] = None, + ): + engine = get_engine(core_version) + wave = engine.synthesis( + query=query, + speaker_id=speaker, + enable_interrogative_upspeak=enable_interrogative_upspeak, + ) + + with NamedTemporaryFile(delete=False) as f: + soundfile.write( + file=f, data=wave, samplerate=query.outputSamplingRate, format="WAV" + ) + + return FileResponse(f.name, media_type="audio/wav") + + @app.post( + "/multi_synthesis", + response_class=FileResponse, + responses={ + 200: { + "content": { + "application/zip": { + "schema": {"type": "string", "format": "binary"} + } + }, + } + }, + tags=["音声合成"], + summary="複数まとめて音声合成する", + ) + def multi_synthesis( + queries: List[AudioQuery], + speaker: int, + core_version: Optional[str] = None, + ): + engine = get_engine(core_version) + sampling_rate = queries[0].outputSamplingRate + + with NamedTemporaryFile(delete=False) as f: + + with zipfile.ZipFile(f, mode="a") as zip_file: + + for i in range(len(queries)): + + if queries[i].outputSamplingRate != sampling_rate: + raise HTTPException( + status_code=422, detail="サンプリングレートが異なるクエリがあります" + ) + + with TemporaryFile() as wav_file: + + wave = engine.synthesis(query=queries[i], speaker_id=speaker) + soundfile.write( + file=wav_file, + data=wave, + samplerate=sampling_rate, + format="WAV", + ) + wav_file.seek(0) + zip_file.writestr(f"{str(i + 1).zfill(3)}.wav", wav_file.read()) + + return FileResponse(f.name, media_type="application/zip") + + @app.post( + "/synthesis_morphing", + response_class=FileResponse, + responses={ + 200: { + "content": { + "audio/wav": {"schema": {"type": "string", "format": "binary"}} + }, + } + }, + tags=["音声合成"], + summary="2人の話者でモーフィングした音声を合成する", + ) + def _synthesis_morphing( + query: AudioQuery, + base_speaker: int, + target_speaker: int, + morph_rate: float = Query(..., ge=0.0, 
le=1.0), # noqa: B008 + core_version: Optional[str] = None, + ): + """ + 指定された2人の話者で音声を合成、指定した割合でモーフィングした音声を得ます。 + モーフィングの割合は`morph_rate`で指定でき、0.0でベースの話者、1.0でターゲットの話者に近づきます。 + """ + engine = get_engine(core_version) + + # 生成したパラメータはキャッシュされる + morph_param = synthesis_morphing_parameter( + engine=engine, + query=query, + base_speaker=base_speaker, + target_speaker=target_speaker, + ) + + morph_wave = synthesis_morphing( + morph_param=morph_param, + morph_rate=morph_rate, + output_stereo=query.outputStereo, + ) + + with NamedTemporaryFile(delete=False) as f: + soundfile.write( + file=f, + data=morph_wave, + samplerate=morph_param.fs, + format="WAV", + ) + + return FileResponse(f.name, media_type="audio/wav") + + @app.post( + "/connect_waves", + response_class=FileResponse, + responses={ + 200: { + "content": { + "audio/wav": {"schema": {"type": "string", "format": "binary"}} + }, + } + }, + tags=["その他"], + summary="base64エンコードされた複数のwavデータを一つに結合する", + ) + def connect_waves(waves: List[str]): + """ + base64エンコードされたwavデータを一纏めにし、wavファイルで返します。 + """ + try: + waves_nparray, sampling_rate = connect_base64_waves(waves) + except ConnectBase64WavesException as err: + return HTTPException(status_code=422, detail=str(err)) + + with NamedTemporaryFile(delete=False) as f: + soundfile.write( + file=f, + data=waves_nparray, + samplerate=sampling_rate, + format="WAV", + ) + + return FileResponse(f.name, media_type="audio/wav") + + @app.get("/presets", response_model=List[Preset], tags=["その他"]) + def get_presets(): + """ + エンジンが保持しているプリセットの設定を返します + + Returns + ------- + presets: List[Preset] + プリセットのリスト + """ + presets, err_detail = preset_loader.load_presets() + if err_detail: + raise HTTPException(status_code=422, detail=err_detail) + return presets + + @app.get("/version", tags=["その他"]) + def version() -> str: + return __version__ + + @app.get("/core_versions", response_model=List[str], tags=["その他"]) + def core_versions() -> List[str]: + return Response( + 
content=json.dumps(list(synthesis_engines.keys())), + media_type="application/json", + ) + + @app.get("/speakers", response_model=List[Speaker], tags=["その他"]) + def speakers( + core_version: Optional[str] = None, + ): + engine = get_engine(core_version) + return Response( + content=engine.speakers, + media_type="application/json", + ) + + @app.get("/speaker_info", response_model=SpeakerInfo, tags=["その他"]) + def speaker_info(speaker_uuid: str, core_version: Optional[str] = None): + """ + 指定されたspeaker_uuidに関する情報をjson形式で返します。 + 画像や音声はbase64エンコードされたものが返されます。 + + Returns + ------- + ret_data: SpeakerInfo + """ + speakers = json.loads(get_engine(core_version).speakers) + for i in range(len(speakers)): + if speakers[i]["speaker_uuid"] == speaker_uuid: + speaker = speakers[i] + break + else: + raise HTTPException(status_code=404, detail="該当する話者が見つかりません") + + try: + policy = (root_dir / f"speaker_info/{speaker_uuid}/policy.md").read_text( + "utf-8" + ) + portrait = b64encode_str( + (root_dir / f"speaker_info/{speaker_uuid}/portrait.png").read_bytes() + ) + style_infos = [] + for style in speaker["styles"]: + id = style["id"] + icon = b64encode_str( + ( + root_dir / f"speaker_info/{speaker_uuid}/icons/{id}.png" + ).read_bytes() + ) + voice_samples = [ + b64encode_str( + ( + root_dir + / "speaker_info/{}/voice_samples/{}_{}.wav".format( + speaker_uuid, id, str(j + 1).zfill(3) + ) + ).read_bytes() + ) + for j in range(3) + ] + style_infos.append( + {"id": id, "icon": icon, "voice_samples": voice_samples} + ) + except FileNotFoundError: + import traceback + + traceback.print_exc() + raise HTTPException(status_code=500, detail="追加情報が見つかりませんでした") + + ret_data = {"policy": policy, "portrait": portrait, "style_infos": style_infos} + return ret_data + + @app.get("/supported_devices", response_model=SupportedDevicesInfo, tags=["その他"]) + def supported_devices( + core_version: Optional[str] = None, + ): + supported_devices = get_engine(core_version).supported_devices + if 
supported_devices is None: + raise HTTPException(status_code=422, detail="非対応の機能です。") + return Response( + content=supported_devices, + media_type="application/json", + ) + + return app + + +if __name__ == "__main__": + multiprocessing.freeze_support() + rospy.init_node('voicevox_server') + + voicelib_dir = [Path(voicevox_lib_dir)] + use_gpu = False + host = rospy.get_param('~host', "127.0.0.1") + port = rospy.get_param('~port', 50021) + cpu_num_threads = rospy.get_param('~cpu_num_threads', None) + if cpu_num_threads is None: + cpu_num_threads = multiprocessing.cpu_count() + + synthesis_engines = make_synthesis_engines( + use_gpu=use_gpu, + voicelib_dirs=voicelib_dir, + cpu_num_threads=cpu_num_threads, + ) + if len(synthesis_engines) == 0: + rospy.logerr("音声合成エンジンがありません。") + latest_core_version = str(max([LooseVersion(ver) + for ver in synthesis_engines])) + + uvicorn.run( + generate_app(synthesis_engines, latest_core_version), + host=host, + port=port, + ) diff --git a/3rdparty/voicevox/package.xml b/3rdparty/voicevox/package.xml new file mode 100644 index 000000000..5240c3468 --- /dev/null +++ b/3rdparty/voicevox/package.xml @@ -0,0 +1,36 @@ + + + + voicevox + 0.0.1 + VOICEVOX: AI speech synthesis + Iori Yanokura + + MIT + + http://ros.org/wiki/voicevox + + Iori Yanokura + + catkin + catkin_virtualenv + + mk + roslib + rospack + unzip + wget + + python3 + python3-requests + sound_play + unzip + wget + + + requirements.txt + + + diff --git a/3rdparty/voicevox/requirements.in b/3rdparty/voicevox/requirements.in new file mode 100644 index 000000000..c9cfd223a --- /dev/null +++ b/3rdparty/voicevox/requirements.in @@ -0,0 +1,11 @@ +PyYAML +aiofiles +appdirs +fastapi +git+https://github.com/VOICEVOX/pyopenjtalk@a85521a0a0f298f08d9e9b24987b3c77eb4aaff5#egg=pyopenjtalk +numpy +python-multipart +pyworld +scipy +soundfile +uvicorn