diff --git a/3rdparty/voicevox/.gitignore b/3rdparty/voicevox/.gitignore new file mode 100644 index 000000000..8cb8e60a3 --- /dev/null +++ b/3rdparty/voicevox/.gitignore @@ -0,0 +1,6 @@ +build +dict +lib +node_scripts/voicevox_engine +requirements.txt +!.gitignore diff --git a/3rdparty/voicevox/CMakeLists.txt b/3rdparty/voicevox/CMakeLists.txt new file mode 100644 index 000000000..1e7297b5d --- /dev/null +++ b/3rdparty/voicevox/CMakeLists.txt @@ -0,0 +1,71 @@ +cmake_minimum_required(VERSION 2.8.3) +project(voicevox) + +find_package(catkin REQUIRED + COMPONENTS + catkin_virtualenv +) + +set(INSTALL_DIR ${PROJECT_SOURCE_DIR}) + +catkin_package() + +catkin_generate_virtualenv( + INPUT_REQUIREMENTS requirements.in + PYTHON_INTERPRETER python3 + USE_SYSTEM_PACKAGES FALSE +) + +add_custom_command( + OUTPUT voicevox_model_installed + COMMAND make -f ${PROJECT_SOURCE_DIR}/Makefile.model + MD5SUM_DIR=${PROJECT_SOURCE_DIR}/md5sum + INSTALL_DIR=${INSTALL_DIR} +) + + +add_custom_command( + OUTPUT voicevox_core_installed + COMMAND make -f ${PROJECT_SOURCE_DIR}/Makefile.core + MD5SUM_DIR=${PROJECT_SOURCE_DIR}/md5sum + INSTALL_DIR=${INSTALL_DIR} +) + +add_custom_command( + OUTPUT voicevox_engine_installed + COMMAND make -f ${PROJECT_SOURCE_DIR}/Makefile.engine + MD5SUM_DIR=${PROJECT_SOURCE_DIR}/md5sum + INSTALL_DIR=${INSTALL_DIR} +) + +add_custom_command( + OUTPUT open_jtalk_dic_installed + COMMAND make -f ${PROJECT_SOURCE_DIR}/Makefile.open_jtalk_dic + MD5SUM_DIR=${PROJECT_SOURCE_DIR}/md5sum + INSTALL_DIR=${INSTALL_DIR} +) + +add_custom_target(all_installed ALL DEPENDS + voicevox_model_installed + voicevox_core_installed + voicevox_engine_installed + open_jtalk_dic_installed) + +file(GLOB NODE_SCRIPTS_FILES node_scripts/*.py) +catkin_install_python( + PROGRAMS ${NODE_SCRIPTS_FILES} + DESTINATION ${CATKIN_PACKAGE_BIN_DESTINATION}/node_scripts/ +) +install(DIRECTORY node_scripts/voicevox_engine + DESTINATION ${CATKIN_PACKAGE_SHARE_DESTINATION}/catkin_virtualenv_scripts/ + 
USE_SOURCE_PERMISSIONS) +install(DIRECTORY launch dict + DESTINATION ${CATKIN_PACKAGE_SHARE_DESTINATION} + USE_SOURCE_PERMISSIONS) +install(PROGRAMS bin/text2wave + DESTINATION ${CATKIN_PACKAGE_SHARE_DESTINATION}/bin) + +install(DIRECTORY + ${INSTALL_DIR}/lib + DESTINATION ${CATKIN_PACKAGE_SHARE_DESTINATION} + USE_SOURCE_PERMISSIONS) diff --git a/3rdparty/voicevox/Makefile b/3rdparty/voicevox/Makefile new file mode 100644 index 000000000..a2c90f3bb --- /dev/null +++ b/3rdparty/voicevox/Makefile @@ -0,0 +1,11 @@ +all: + make -f Makefile.core + make -f Makefile.model + make -f Makefile.engine + make -f Makefile.open_jtalk_dic +clean: + make -f Makefile.core clean + make -f Makefile.model clean + make -f Makefile.engine clean + make -f Makefile.open_jtalk_dic clean + rm -rf build diff --git a/3rdparty/voicevox/Makefile.core b/3rdparty/voicevox/Makefile.core new file mode 100644 index 000000000..bac21eb0f --- /dev/null +++ b/3rdparty/voicevox/Makefile.core @@ -0,0 +1,28 @@ +# -*- makefile -*- + +all: installed.voicevox_core + +VERSION = 0.11.4 +FILENAME = core.zip +TARBALL = build/$(FILENAME) +TARBALL_URL = "https://github.com/VOICEVOX/voicevox_core/releases/download/$(VERSION)/core.zip" +SOURCE_DIR = build/core +UNPACK_CMD = unzip +MD5SUM_DIR = $(CURDIR)/md5sum +MD5SUM_FILE = $(MD5SUM_DIR)/$(FILENAME).md5sum +SCRIPT_DIR = $( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd ) +include $(shell rospack find mk)/download_unpack_build.mk +INSTALL_DIR = './' + + +installed.voicevox_core: $(SOURCE_DIR)/unpacked + mkdir -p $(INSTALL_DIR)/lib + cp build/core/lib*.so $(INSTALL_DIR)/lib/ + cp build/core/*.bin $(INSTALL_DIR)/lib/ + cp build/core/metas.json $(INSTALL_DIR)/lib/metas.json + +clean: + rm -rf $(TARBALL) + rm -rf $(SOURCE_DIR) + rm -rf $(INSTALL_DIR)/lib + rm -rf build diff --git a/3rdparty/voicevox/Makefile.engine b/3rdparty/voicevox/Makefile.engine new file mode 100644 index 000000000..b3d6899fa --- /dev/null +++ b/3rdparty/voicevox/Makefile.engine @@ 
-0,0 +1,24 @@ +# -*- makefile -*- + +all: installed.voicevox_engine + +VERSION = 0.11.4 +FILENAME = $(VERSION).tar.gz +TARBALL = build/$(FILENAME) +TARBALL_URL = "https://github.com/VOICEVOX/voicevox_engine/archive/refs/tags/$(FILENAME)" +SOURCE_DIR = build/voicevox_engine-$(VERSION) +UNPACK_CMD = tar xvzf +MD5SUM_DIR = $(CURDIR)/md5sum +MD5SUM_FILE = $(MD5SUM_DIR)/voicevox_engine.tar.gz.md5sum +include $(shell rospack find mk)/download_unpack_build.mk +INSTALL_DIR = './' + + +installed.voicevox_engine: $(SOURCE_DIR)/unpacked + cp -r build/voicevox_engine-$(VERSION) $(INSTALL_DIR)/node_scripts/voicevox_engine + +clean: + rm -rf $(TARBALL) + rm -rf $(SOURCE_DIR) + rm -rf $(INSTALL_DIR)/node_scripts/voicevox_engine + rm -rf build diff --git a/3rdparty/voicevox/Makefile.model b/3rdparty/voicevox/Makefile.model new file mode 100644 index 000000000..004028105 --- /dev/null +++ b/3rdparty/voicevox/Makefile.model @@ -0,0 +1,26 @@ +# -*- makefile -*- + +all: installed.voicevox_model + +VERSION = 1.10.0 +FILENAME = onnxruntime-linux-x64-$(VERSION).tgz +TARBALL = build/$(FILENAME) +TARBALL_URL = "https://github.com/microsoft/onnxruntime/releases/download/v$(VERSION)/$(FILENAME)" +SOURCE_DIR = build/onnxruntime-linux-x64-$(VERSION) +UNPACK_CMD = tar xvzf +MD5SUM_DIR = $(CURDIR)/md5sum +MD5SUM_FILE = $(MD5SUM_DIR)/$(FILENAME).md5sum +SCRIPT_DIR = $( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd ) +include $(shell rospack find mk)/download_unpack_build.mk +INSTALL_DIR = './' + + +installed.voicevox_model: $(SOURCE_DIR)/unpacked + mkdir -p $(INSTALL_DIR)/lib + cp build/onnxruntime-linux-x64-$(VERSION)/lib/* $(INSTALL_DIR)/lib + +clean: + rm -rf $(TARBALL) + rm -rf $(SOURCE_DIR) + rm -rf $(INSTALL_DIR)/lib + rm -rf build diff --git a/3rdparty/voicevox/Makefile.open_jtalk_dic b/3rdparty/voicevox/Makefile.open_jtalk_dic new file mode 100644 index 000000000..646921159 --- /dev/null +++ b/3rdparty/voicevox/Makefile.open_jtalk_dic @@ -0,0 +1,25 @@ +# -*- makefile -*- 
+ +all: installed.open_jtalk_dic + +VERSION = 1.11.1 +FILENAME = open_jtalk_dic_utf_8-1.11.tar.gz +TARBALL = build/$(FILENAME) +TARBALL_URL = "https://github.com/r9y9/open_jtalk/releases/download/v$(VERSION)/$(FILENAME)" +SOURCE_DIR = build/open_jtalk_dic_utf_8-1.11 +UNPACK_CMD = tar xvzf +MD5SUM_DIR = $(CURDIR)/md5sum +MD5SUM_FILE = $(MD5SUM_DIR)/open_jtalk_dic.tar.gz.md5sum +include $(shell rospack find mk)/download_unpack_build.mk +INSTALL_DIR = './' + + +installed.open_jtalk_dic: $(SOURCE_DIR)/unpacked + mkdir -p $(INSTALL_DIR)/dict + cp -r build/open_jtalk_dic_utf_8-1.11 $(INSTALL_DIR)/dict + +clean: + rm -rf $(TARBALL) + rm -rf $(SOURCE_DIR) + rm -rf $(INSTALL_DIR)/dict/open_jtalk_dic_utf_8-1.11 + rm -rf build diff --git a/3rdparty/voicevox/README.md b/3rdparty/voicevox/README.md new file mode 100644 index 000000000..d5602db71 --- /dev/null +++ b/3rdparty/voicevox/README.md @@ -0,0 +1,103 @@ +# voicevox + +ROS Interface for [VOICEVOX](https://voicevox.hiroshiba.jp/) (AI speech synthesis) + +## TERM + +[VOICEVOX](https://voicevox.hiroshiba.jp/) is basically free to use, but please check the terms of use below. + +[TERM](https://voicevox.hiroshiba.jp/term) + +Each voice synthesis character has its own rules. Please use this package according to those terms. + +| Character name | term link | +| ---- | ---- | +| 四国めたん | https://zunko.jp/con_ongen_kiyaku.html | +| ずんだもん | https://zunko.jp/con_ongen_kiyaku.html | +| 春日部つむぎ | https://tsukushinyoki10.wixsite.com/ktsumugiofficial/利用規約 | +| 波音リツ | http://canon-voice.com/kiyaku.html | +| 雨晴はう | https://amehau.com/?page_id=225 | +| 玄野武宏 | https://virvoxproject.wixsite.com/official/voicevoxの利用規約 | +| 白上虎太郎 | https://virvoxproject.wixsite.com/official/voicevoxの利用規約 | +| 青山龍星 | https://virvoxproject.wixsite.com/official/voicevoxの利用規約 | +| 冥鳴ひまり | https://kotoran8zunzun.wixsite.com/my-site/利用規約 | +| 九州そら | https://zunko.jp/con_ongen_kiyaku.html | + +## Installation + +Build this package. 
+ +```bash +cd /path/to/catkin_workspace +catkin build voicevox +``` + +## Usage + +### Launch sound_play with VOICEVOX Text-to-Speech + +```bash +roslaunch voicevox voicevox_texttospeech.launch +``` + + +### Say something + +#### For python users + +```python +import rospy +from sound_play.libsoundplay import SoundClient + +rospy.init_node('say_node') + +client = SoundClient(sound_action='robotsound_jp', sound_topic='robotsound_jp') + +client.say('こんにちは', voice='四国めたん-あまあま') +``` + +You can change the voice by changing the voice_name. +You can also specify the speaker id. +Look at the following tables for further details. + +| speaker_id | voice_name | +| ---- | ---- | +| 0 | 四国めたん-あまあま | +| 1 | ずんだもん-あまあま | +| 2 | 四国めたん-ノーマル | +| 3 | ずんだもん-ノーマル | +| 4 | 四国めたん-セクシー | +| 5 | ずんだもん-セクシー | +| 6 | 四国めたん-ツンツン | +| 7 | ずんだもん-ツンツン | +| 8 | 春日部つむぎ-ノーマル | +| 9 | 波音リツ-ノーマル | +| 10 | 雨晴はう-ノーマル | +| 11 | 玄野武宏-ノーマル | +| 12 | 白上虎太郎-ノーマル | +| 13 | 青山龍星-ノーマル | +| 14 | 冥鳴ひまり-ノーマル | +| 15 | 九州そら-あまあま | +| 16 | 九州そら-ノーマル | +| 17 | 九州そら-セクシー | +| 18 | 九州そら-ツンツン | +| 19 | 九州そら-ささやき | + +#### For roseus users + +``` +$ roseus +(load "package://pr2eus/speak.l") + +(ros::roseus "say_node") + +(speak "JSKへようこそ。" :lang "波音リツ" :wait t :topic-name "robotsound_jp") +``` + +### Tips + +Normally, the server for speech synthesis starts up at `http://localhost:50021`. +You can change the url and port by setting values for `VOICEVOX_TEXTTOSPEECH_URL` and `VOICEVOX_TEXTTOSPEECH_PORT`. + +You can also set the default character by setting `VOICEVOX_DEFAULT_SPEAKER_ID`. +Please refer to [here](#saysomething) for the speaker id. 
diff --git a/3rdparty/voicevox/bin/text2wave b/3rdparty/voicevox/bin/text2wave new file mode 100755 index 000000000..1e3fe7236 --- /dev/null +++ b/3rdparty/voicevox/bin/text2wave @@ -0,0 +1,126 @@ +#!/usr/bin/env python3 +# -*- coding:utf-8 -*- + +import argparse +import os + +import requests + + +speaker_id_to_name = { + '0': '四国めたん-あまあま', + '1': 'ずんだもん-あまあま', + '2': '四国めたん-ノーマル', + '3': 'ずんだもん-ノーマル', + '4': '四国めたん-セクシー', + '5': 'ずんだもん-セクシー', + '6': '四国めたん-ツンツン', + '7': 'ずんだもん-ツンツン', + '8': '春日部つむぎ-ノーマル', + '9': '波音リツ-ノーマル', + '10': '雨晴はう-ノーマル', + '11': '玄野武宏-ノーマル', + '12': '白上虎太郎-ノーマル', + '13': '青山龍星-ノーマル', + '14': '冥鳴ひまり-ノーマル', + '15': '九州そら-あまあま', + '16': '九州そら-ノーマル', + '17': '九州そら-セクシー', + '18': '九州そら-ツンツン', + '19': '九州そら-ささやき', +} + +name_to_speaker_id = { + b: a for a, b in speaker_id_to_name.items() +} + + +DEFAULT_SPEAKER_ID = os.environ.get( + 'VOICEVOX_DEFAULT_SPEAKER_ID', '2') +if not DEFAULT_SPEAKER_ID.isdigit(): + DEFAULT_SPEAKER_ID = name_to_speaker_id[DEFAULT_SPEAKER_ID] +VOICEVOX_TEXTTOSPEECH_URL = os.environ.get( + 'VOICEVOX_TEXTTOSPEECH_URL', 'localhost') +VOICEVOX_TEXTTOSPEECH_PORT = os.environ.get( + 'VOICEVOX_TEXTTOSPEECH_PORT', 50021) + + +def determine_voice_name(voice_name): + if len(voice_name) == 0: + speaker_id = DEFAULT_SPEAKER_ID + else: + if voice_name.isdigit(): + if voice_name in speaker_id_to_name: + speaker_id = voice_name + else: + print( + '[Text2Wave] Invalid speaker_id ({}). Use default voice.' + .format(speaker_id_to_name[DEFAULT_SPEAKER_ID])) + speaker_id = DEFAULT_SPEAKER_ID + else: + candidates = list(filter( + lambda name: name.startswith(voice_name), + name_to_speaker_id)) + if candidates: + speaker_id = name_to_speaker_id[candidates[0]] + else: + print('[Text2Wave] Invalid voice_name ({}). Use default voice.' 
+ .format(speaker_id_to_name[DEFAULT_SPEAKER_ID])) + speaker_id = DEFAULT_SPEAKER_ID + print('[Text2Wave] Speak using voice_name ({})..'.format( + speaker_id_to_name[speaker_id])) + return speaker_id + + +def convert_to_str(x): + if isinstance(x, str): + pass + elif isinstance(x, bytes): + x = x.decode('utf-8') + else: + raise ValueError( + 'Invalid input x type: {}' + .format(type(x))) + return x + + +def request_synthesis( + sentence, output_path, speaker_id='1'): + headers = {'accept': 'application/json'} + + sentence = convert_to_str(sentence) + speaker_id = convert_to_str(speaker_id) + params = { + 'speaker': speaker_id, + 'text': sentence, + } + base_url = 'http://{}:{}'.format( + VOICEVOX_TEXTTOSPEECH_URL, + VOICEVOX_TEXTTOSPEECH_PORT) + url = '{}/audio_query'.format(base_url) + response = requests.post(url, headers=headers, + params=params) + data = response.json() + url = '{}/synthesis'.format(base_url) + response = requests.post(url, headers=headers, + params=params, + json=data) + with open(output_path, 'wb') as f: + f.write(response.content) + + +if __name__ == '__main__': + parser = argparse.ArgumentParser(description='') + parser.add_argument('-eval', '--evaluate') + parser.add_argument('-o', '--output') + parser.add_argument('text') + args = parser.parse_args() + + with open(args.text, 'rb') as f: + speech_text = f.readline() + + speaker_id = determine_voice_name( + args.evaluate.lstrip('(').rstrip(')')) + request_synthesis(speech_text, + args.output, + speaker_id) diff --git a/3rdparty/voicevox/launch/voicevox_texttospeech.launch b/3rdparty/voicevox/launch/voicevox_texttospeech.launch new file mode 100644 index 000000000..15de7c551 --- /dev/null +++ b/3rdparty/voicevox/launch/voicevox_texttospeech.launch @@ -0,0 +1,25 @@ + + + + + + + + + + + + + + + + + diff --git a/3rdparty/voicevox/md5sum/core.zip.md5sum b/3rdparty/voicevox/md5sum/core.zip.md5sum new file mode 100644 index 000000000..f5b5ac439 --- /dev/null +++ 
b/3rdparty/voicevox/md5sum/core.zip.md5sum @@ -0,0 +1 @@ +96149a074d8ee093039321a88e00076d core.zip diff --git a/3rdparty/voicevox/md5sum/onnxruntime-linux-x64-1.10.0.tgz.md5sum b/3rdparty/voicevox/md5sum/onnxruntime-linux-x64-1.10.0.tgz.md5sum new file mode 100644 index 000000000..817b68d89 --- /dev/null +++ b/3rdparty/voicevox/md5sum/onnxruntime-linux-x64-1.10.0.tgz.md5sum @@ -0,0 +1 @@ +9ca61e2009a16cf8a1e9ab9ad0655009 onnxruntime-linux-x64-1.10.0.tgz diff --git a/3rdparty/voicevox/md5sum/open_jtalk_dic.tar.gz.md5sum b/3rdparty/voicevox/md5sum/open_jtalk_dic.tar.gz.md5sum new file mode 100644 index 000000000..8ce4bb07b --- /dev/null +++ b/3rdparty/voicevox/md5sum/open_jtalk_dic.tar.gz.md5sum @@ -0,0 +1 @@ +ba02dac4143492c3790f949be224dfdf open_jtalk_dic_utf_8-1.11.tar.gz diff --git a/3rdparty/voicevox/md5sum/voicevox_engine.tar.gz.md5sum b/3rdparty/voicevox/md5sum/voicevox_engine.tar.gz.md5sum new file mode 100644 index 000000000..5947e3633 --- /dev/null +++ b/3rdparty/voicevox/md5sum/voicevox_engine.tar.gz.md5sum @@ -0,0 +1 @@ +997bf9e915f7d6288c923ab1ff5f4ff6 0.11.4.tar.gz diff --git a/3rdparty/voicevox/node_scripts/server.py b/3rdparty/voicevox/node_scripts/server.py new file mode 100644 index 000000000..add596aff --- /dev/null +++ b/3rdparty/voicevox/node_scripts/server.py @@ -0,0 +1,573 @@ +#!/usr/bin/env python3 + +# This code was created based on the following link's code. 
+# https://github.com/VOICEVOX/voicevox_engine/blob/0.11.4/run.py + +import base64 +from distutils.version import LooseVersion +from functools import lru_cache +import imp +import json +import multiprocessing +import os +import os.path as osp +from pathlib import Path +from tempfile import NamedTemporaryFile +from tempfile import TemporaryFile +from typing import Dict +from typing import List +from typing import Optional +import zipfile + +from fastapi import FastAPI +from fastapi import HTTPException +from fastapi.middleware.cors import CORSMiddleware +from fastapi.params import Query +from fastapi import Response +import rospkg +import rospy +import soundfile +from starlette.responses import FileResponse +import uvicorn + + +PKG_NAME = 'voicevox' +abs_path = osp.dirname(osp.abspath(__file__)) +voicevox_engine = imp.load_package( + 'voicevox_engine', osp.join(abs_path, 'voicevox_engine/voicevox_engine')) +rospack = rospkg.RosPack() +voicevox_dir = rospack.get_path(PKG_NAME) +voicevox_lib_dir = osp.join(voicevox_dir, 'lib') +# set pyopenjtalk's dic.tar.gz file +os.environ['OPEN_JTALK_DICT_DIR'] = osp.join( + voicevox_dir, 'dict', 'open_jtalk_dic_utf_8-1.11') + + +from voicevox_engine import __version__ +from voicevox_engine.kana_parser import create_kana +from voicevox_engine.kana_parser import parse_kana +from voicevox_engine.model import AccentPhrase +from voicevox_engine.model import AudioQuery +from voicevox_engine.model import ParseKanaBadRequest +from voicevox_engine.model import ParseKanaError +from voicevox_engine.model import Speaker +from voicevox_engine.model import SpeakerInfo +from voicevox_engine.model import SupportedDevicesInfo +from voicevox_engine.morphing import \ + synthesis_morphing_parameter as _synthesis_morphing_parameter +from voicevox_engine.morphing import synthesis_morphing +from voicevox_engine.preset import Preset +from voicevox_engine.preset import PresetLoader +from voicevox_engine.synthesis_engine import make_synthesis_engines +from 
voicevox_engine.synthesis_engine import SynthesisEngineBase +from voicevox_engine.user_dict import user_dict_startup_processing +from voicevox_engine.utility import connect_base64_waves +from voicevox_engine.utility import ConnectBase64WavesException +from voicevox_engine.utility import engine_root + + +def b64encode_str(s): + return base64.b64encode(s).decode("utf-8") + + +def generate_app( + synthesis_engines: Dict[str, SynthesisEngineBase], latest_core_version: str +) -> FastAPI: + root_dir = engine_root() + + default_sampling_rate = synthesis_engines[latest_core_version].default_sampling_rate + + app = FastAPI( + title="VOICEVOX ENGINE", + description="VOICEVOXの音声合成エンジンです。", + version=__version__, + ) + + app.add_middleware( + CORSMiddleware, + allow_origins=["*"], + allow_credentials=True, + allow_methods=["*"], + allow_headers=["*"], + ) + + preset_loader = PresetLoader( + preset_path=root_dir / "presets.yaml", + ) + + # キャッシュを有効化 + # モジュール側でlru_cacheを指定するとキャッシュを制御しにくいため、HTTPサーバ側で指定する + # TODO: キャッシュを管理するモジュール側API・HTTP側APIを用意する + synthesis_morphing_parameter = lru_cache(maxsize=4)(_synthesis_morphing_parameter) + + # @app.on_event("startup") + # async def start_catch_disconnection(): + # if args.enable_cancellable_synthesis: + # loop = asyncio.get_event_loop() + # _ = loop.create_task(cancellable_engine.catch_disconnection()) + + @app.on_event("startup") + def apply_user_dict(): + user_dict_startup_processing() + + def get_engine(core_version: Optional[str]) -> SynthesisEngineBase: + if core_version is None: + return synthesis_engines[latest_core_version] + if core_version in synthesis_engines: + return synthesis_engines[core_version] + raise HTTPException(status_code=422, detail="不明なバージョンです") + + @app.post( + "/audio_query", + response_model=AudioQuery, + tags=["クエリ作成"], + summary="音声合成用のクエリを作成する", + ) + def audio_query(text: str, speaker: int, core_version: Optional[str] = None): + """ + クエリの初期値を得ます。ここで得られたクエリはそのまま音声合成に利用できます。各値の意味は`Schemas`を参照してください。 + """ 
+ engine = get_engine(core_version) + accent_phrases = engine.create_accent_phrases(text, speaker_id=speaker) + return AudioQuery( + accent_phrases=accent_phrases, + speedScale=1, + pitchScale=0, + intonationScale=1, + volumeScale=1, + prePhonemeLength=0.1, + postPhonemeLength=0.1, + outputSamplingRate=default_sampling_rate, + outputStereo=False, + kana=create_kana(accent_phrases), + ) + + @app.post( + "/audio_query_from_preset", + response_model=AudioQuery, + tags=["クエリ作成"], + summary="音声合成用のクエリをプリセットを用いて作成する", + ) + def audio_query_from_preset( + text: str, preset_id: int, core_version: Optional[str] = None + ): + """ + クエリの初期値を得ます。ここで得られたクエリはそのまま音声合成に利用できます。各値の意味は`Schemas`を参照してください。 + """ + engine = get_engine(core_version) + presets, err_detail = preset_loader.load_presets() + if err_detail: + raise HTTPException(status_code=422, detail=err_detail) + for preset in presets: + if preset.id == preset_id: + selected_preset = preset + break + else: + raise HTTPException(status_code=422, detail="該当するプリセットIDが見つかりません") + + accent_phrases = engine.create_accent_phrases( + text, speaker_id=selected_preset.style_id + ) + return AudioQuery( + accent_phrases=accent_phrases, + speedScale=selected_preset.speedScale, + pitchScale=selected_preset.pitchScale, + intonationScale=selected_preset.intonationScale, + volumeScale=selected_preset.volumeScale, + prePhonemeLength=selected_preset.prePhonemeLength, + postPhonemeLength=selected_preset.postPhonemeLength, + outputSamplingRate=default_sampling_rate, + outputStereo=False, + kana=create_kana(accent_phrases), + ) + + @app.post( + "/accent_phrases", + response_model=List[AccentPhrase], + tags=["クエリ編集"], + summary="テキストからアクセント句を得る", + responses={ + 400: { + "description": "読み仮名のパースに失敗", + "model": ParseKanaBadRequest, + } + }, + ) + def accent_phrases( + text: str, + speaker: int, + is_kana: bool = False, + core_version: Optional[str] = None, + ): + """ + テキストからアクセント句を得ます。 + 
is_kanaが`true`のとき、テキストは次のようなAquesTalkライクな記法に従う読み仮名として処理されます。デフォルトは`false`です。 + * 全てのカナはカタカナで記述される + * アクセント句は`/`または`、`で区切る。`、`で区切った場合に限り無音区間が挿入される。 + * カナの手前に`_`を入れるとそのカナは無声化される + * アクセント位置を`'`で指定する。全てのアクセント句にはアクセント位置を1つ指定する必要がある。 + * アクセント句末に`?`(全角)を入れることにより疑問文の発音ができる。 + """ + engine = get_engine(core_version) + if is_kana: + try: + accent_phrases = parse_kana(text) + except ParseKanaError as err: + raise HTTPException( + status_code=400, + detail=ParseKanaBadRequest(err).dict(), + ) + accent_phrases = engine.replace_mora_data( + accent_phrases=accent_phrases, speaker_id=speaker + ) + + return accent_phrases + else: + return engine.create_accent_phrases(text, speaker_id=speaker) + + @app.post( + "/mora_data", + response_model=List[AccentPhrase], + tags=["クエリ編集"], + summary="アクセント句から音高・音素長を得る", + ) + def mora_data( + accent_phrases: List[AccentPhrase], + speaker: int, + core_version: Optional[str] = None, + ): + engine = get_engine(core_version) + return engine.replace_mora_data(accent_phrases, speaker_id=speaker) + + @app.post( + "/mora_length", + response_model=List[AccentPhrase], + tags=["クエリ編集"], + summary="アクセント句から音素長を得る", + ) + def mora_length( + accent_phrases: List[AccentPhrase], + speaker: int, + core_version: Optional[str] = None, + ): + engine = get_engine(core_version) + return engine.replace_phoneme_length( + accent_phrases=accent_phrases, speaker_id=speaker + ) + + @app.post( + "/mora_pitch", + response_model=List[AccentPhrase], + tags=["クエリ編集"], + summary="アクセント句から音高を得る", + ) + def mora_pitch( + accent_phrases: List[AccentPhrase], + speaker: int, + core_version: Optional[str] = None, + ): + engine = get_engine(core_version) + return engine.replace_mora_pitch( + accent_phrases=accent_phrases, speaker_id=speaker + ) + + @app.post( + "/synthesis", + response_class=FileResponse, + responses={ + 200: { + "content": { + "audio/wav": {"schema": {"type": "string", "format": "binary"}} + }, + } + }, + tags=["音声合成"], + summary="音声合成する", + ) + def synthesis( + 
query: AudioQuery, + speaker: int, + enable_interrogative_upspeak: bool = Query( # noqa: B008 + default=True, + description="疑問系のテキストが与えられたら語尾を自動調整する", + ), + core_version: Optional[str] = None, + ): + engine = get_engine(core_version) + wave = engine.synthesis( + query=query, + speaker_id=speaker, + enable_interrogative_upspeak=enable_interrogative_upspeak, + ) + + with NamedTemporaryFile(delete=False) as f: + soundfile.write( + file=f, data=wave, samplerate=query.outputSamplingRate, format="WAV" + ) + + return FileResponse(f.name, media_type="audio/wav") + + @app.post( + "/multi_synthesis", + response_class=FileResponse, + responses={ + 200: { + "content": { + "application/zip": { + "schema": {"type": "string", "format": "binary"} + } + }, + } + }, + tags=["音声合成"], + summary="複数まとめて音声合成する", + ) + def multi_synthesis( + queries: List[AudioQuery], + speaker: int, + core_version: Optional[str] = None, + ): + engine = get_engine(core_version) + sampling_rate = queries[0].outputSamplingRate + + with NamedTemporaryFile(delete=False) as f: + + with zipfile.ZipFile(f, mode="a") as zip_file: + + for i in range(len(queries)): + + if queries[i].outputSamplingRate != sampling_rate: + raise HTTPException( + status_code=422, detail="サンプリングレートが異なるクエリがあります" + ) + + with TemporaryFile() as wav_file: + + wave = engine.synthesis(query=queries[i], speaker_id=speaker) + soundfile.write( + file=wav_file, + data=wave, + samplerate=sampling_rate, + format="WAV", + ) + wav_file.seek(0) + zip_file.writestr(f"{str(i + 1).zfill(3)}.wav", wav_file.read()) + + return FileResponse(f.name, media_type="application/zip") + + @app.post( + "/synthesis_morphing", + response_class=FileResponse, + responses={ + 200: { + "content": { + "audio/wav": {"schema": {"type": "string", "format": "binary"}} + }, + } + }, + tags=["音声合成"], + summary="2人の話者でモーフィングした音声を合成する", + ) + def _synthesis_morphing( + query: AudioQuery, + base_speaker: int, + target_speaker: int, + morph_rate: float = Query(..., ge=0.0, 
le=1.0), # noqa: B008 + core_version: Optional[str] = None, + ): + """ + 指定された2人の話者で音声を合成、指定した割合でモーフィングした音声を得ます。 + モーフィングの割合は`morph_rate`で指定でき、0.0でベースの話者、1.0でターゲットの話者に近づきます。 + """ + engine = get_engine(core_version) + + # 生成したパラメータはキャッシュされる + morph_param = synthesis_morphing_parameter( + engine=engine, + query=query, + base_speaker=base_speaker, + target_speaker=target_speaker, + ) + + morph_wave = synthesis_morphing( + morph_param=morph_param, + morph_rate=morph_rate, + output_stereo=query.outputStereo, + ) + + with NamedTemporaryFile(delete=False) as f: + soundfile.write( + file=f, + data=morph_wave, + samplerate=morph_param.fs, + format="WAV", + ) + + return FileResponse(f.name, media_type="audio/wav") + + @app.post( + "/connect_waves", + response_class=FileResponse, + responses={ + 200: { + "content": { + "audio/wav": {"schema": {"type": "string", "format": "binary"}} + }, + } + }, + tags=["その他"], + summary="base64エンコードされた複数のwavデータを一つに結合する", + ) + def connect_waves(waves: List[str]): + """ + base64エンコードされたwavデータを一纏めにし、wavファイルで返します。 + """ + try: + waves_nparray, sampling_rate = connect_base64_waves(waves) + except ConnectBase64WavesException as err: + return HTTPException(status_code=422, detail=str(err)) + + with NamedTemporaryFile(delete=False) as f: + soundfile.write( + file=f, + data=waves_nparray, + samplerate=sampling_rate, + format="WAV", + ) + + return FileResponse(f.name, media_type="audio/wav") + + @app.get("/presets", response_model=List[Preset], tags=["その他"]) + def get_presets(): + """ + エンジンが保持しているプリセットの設定を返します + + Returns + ------- + presets: List[Preset] + プリセットのリスト + """ + presets, err_detail = preset_loader.load_presets() + if err_detail: + raise HTTPException(status_code=422, detail=err_detail) + return presets + + @app.get("/version", tags=["その他"]) + def version() -> str: + return __version__ + + @app.get("/core_versions", response_model=List[str], tags=["その他"]) + def core_versions() -> List[str]: + return Response( + 
content=json.dumps(list(synthesis_engines.keys())), + media_type="application/json", + ) + + @app.get("/speakers", response_model=List[Speaker], tags=["その他"]) + def speakers( + core_version: Optional[str] = None, + ): + engine = get_engine(core_version) + return Response( + content=engine.speakers, + media_type="application/json", + ) + + @app.get("/speaker_info", response_model=SpeakerInfo, tags=["その他"]) + def speaker_info(speaker_uuid: str, core_version: Optional[str] = None): + """ + 指定されたspeaker_uuidに関する情報をjson形式で返します。 + 画像や音声はbase64エンコードされたものが返されます。 + + Returns + ------- + ret_data: SpeakerInfo + """ + speakers = json.loads(get_engine(core_version).speakers) + for i in range(len(speakers)): + if speakers[i]["speaker_uuid"] == speaker_uuid: + speaker = speakers[i] + break + else: + raise HTTPException(status_code=404, detail="該当する話者が見つかりません") + + try: + policy = (root_dir / f"speaker_info/{speaker_uuid}/policy.md").read_text( + "utf-8" + ) + portrait = b64encode_str( + (root_dir / f"speaker_info/{speaker_uuid}/portrait.png").read_bytes() + ) + style_infos = [] + for style in speaker["styles"]: + id = style["id"] + icon = b64encode_str( + ( + root_dir / f"speaker_info/{speaker_uuid}/icons/{id}.png" + ).read_bytes() + ) + voice_samples = [ + b64encode_str( + ( + root_dir + / "speaker_info/{}/voice_samples/{}_{}.wav".format( + speaker_uuid, id, str(j + 1).zfill(3) + ) + ).read_bytes() + ) + for j in range(3) + ] + style_infos.append( + {"id": id, "icon": icon, "voice_samples": voice_samples} + ) + except FileNotFoundError: + import traceback + + traceback.print_exc() + raise HTTPException(status_code=500, detail="追加情報が見つかりませんでした") + + ret_data = {"policy": policy, "portrait": portrait, "style_infos": style_infos} + return ret_data + + @app.get("/supported_devices", response_model=SupportedDevicesInfo, tags=["その他"]) + def supported_devices( + core_version: Optional[str] = None, + ): + supported_devices = get_engine(core_version).supported_devices + if 
supported_devices is None: + raise HTTPException(status_code=422, detail="非対応の機能です。") + return Response( + content=supported_devices, + media_type="application/json", + ) + + return app + + +if __name__ == "__main__": + multiprocessing.freeze_support() + rospy.init_node('voicevox_server') + + voicelib_dir = [Path(voicevox_lib_dir)] + use_gpu = False + host = rospy.get_param('~host', "127.0.0.1") + port = rospy.get_param('~port', 50021) + cpu_num_threads = rospy.get_param('~cpu_num_threads', None) + if cpu_num_threads is None: + cpu_num_threads = multiprocessing.cpu_count() + + synthesis_engines = make_synthesis_engines( + use_gpu=use_gpu, + voicelib_dirs=voicelib_dir, + cpu_num_threads=cpu_num_threads, + ) + if len(synthesis_engines) == 0: + rospy.logerr("音声合成エンジンがありません。") + latest_core_version = str(max([LooseVersion(ver) + for ver in synthesis_engines])) + + uvicorn.run( + generate_app(synthesis_engines, latest_core_version), + host=host, + port=port, + ) diff --git a/3rdparty/voicevox/package.xml b/3rdparty/voicevox/package.xml new file mode 100644 index 000000000..5240c3468 --- /dev/null +++ b/3rdparty/voicevox/package.xml @@ -0,0 +1,36 @@ + + + + voicevox + 0.0.1 + VOICEVOX: AI speech synthesis + Iori Yanokura + + MIT + + http://ros.org/wiki/voicevox + + Iori Yanokura + + catkin + catkin_virtualenv + + mk + roslib + rospack + unzip + wget + + python3 + python3-requests + sound_play + unzip + wget + + + requirements.txt + + + diff --git a/3rdparty/voicevox/requirements.in b/3rdparty/voicevox/requirements.in new file mode 100644 index 000000000..c9cfd223a --- /dev/null +++ b/3rdparty/voicevox/requirements.in @@ -0,0 +1,11 @@ +PyYAML +aiofiles +appdirs +fastapi +git+https://github.com/VOICEVOX/pyopenjtalk@a85521a0a0f298f08d9e9b24987b3c77eb4aaff5#egg=pyopenjtalk +numpy +python-multipart +pyworld +scipy +soundfile +uvicorn