Added viseme support for TTS, allowing enclosure to display visemes (#…

…357) * Added viseme support for TTS, allowing enclosure to display visemes as appropriate * Enclosure version bump
MycroftAI · Sep 5, 2016 · c653c43 · c653c43
1 parent 340542d
commit c653c43
Show file tree

Hide file tree

Showing 10 changed files with 140 additions and 11 deletions.
diff --git a/mycroft/client/enclosure/api.py b/mycroft/client/enclosure/api.py
@@ -96,9 +96,15 @@ def mouth_listen(self):
     def mouth_smile(self):
         self.client.emit(Message("enclosure.mouth.smile"))
 
+    def mouth_viseme(self, visCode):
+        """Ask the enclosure mouth to show the visemes given in visCode.
+
+        Emits an "enclosure.mouth.viseme" message whose metadata carries
+        visCode under the 'code' key.
+        """
+        payload = {'code': visCode}
+        message = Message("enclosure.mouth.viseme", metadata=payload)
+        self.client.emit(message)
+
     def mouth_text(self, text=""):
         self.client.emit(
-            Message("enclosure.mouth.text", metadata={'text': text}))
+            Message("enclosure.mouth.text", metadata={
+                   'text': text}))
 
     def weather_display(self, img_code, temp):
         self.client.emit(

diff --git a/mycroft/client/enclosure/mouth.py b/mycroft/client/enclosure/mouth.py
@@ -17,6 +17,7 @@
 
 
 from mycroft.util.log import getLogger
+import time
 
 __author__ = 'jdorleans'
 
@@ -41,6 +42,7 @@ def __init_events(self):
         self.client.on('enclosure.mouth.think', self.think)
         self.client.on('enclosure.mouth.listen', self.listen)
         self.client.on('enclosure.mouth.smile', self.smile)
+        self.client.on('enclosure.mouth.viseme', self.viseme)
         self.client.on('enclosure.mouth.text', self.text)
 
     def reset(self, event=None):
@@ -58,6 +60,22 @@ def listen(self, event=None):
     def smile(self, event=None):
         self.writer.write("mouth.smile")
 
+    def viseme(self, event=None):
+        """Show a timed sequence of visemes on the mouth.
+
+        event.metadata["code"] is a string of viseme codes and cumulative
+        durations (seconds from the start of the sequence),
+        ex:  '0:0.34,1:1.23,0:1.32,'
+
+        Each viseme with a code from 0 to 6 is written to the enclosure,
+        then this method sleeps until that viseme's cumulative time has
+        passed, so the call blocks for the length of the sequence.
+        Entries that are empty, lack a duration, or carry an unknown code
+        or a non-numeric duration are skipped instead of raising.
+        """
+        if not (event and event.metadata):
+            return
+
+        valid_codes = ("0", "1", "2", "3", "4", "5", "6")
+        vis_cmds = event.metadata.get("code") or ''
+        time_start = time.time()
+        for pair in vis_cmds.split(","):
+            vis_dur = pair.split(":")
+            # exact match on the code: a string range check such as
+            # "0" <= code <= "6" would also accept "10" or "5x"
+            if len(vis_dur) < 2 or vis_dur[0] not in valid_codes:
+                continue
+            try:
+                end_time = float(vis_dur[1])
+            except ValueError:
+                # validate before writing so a bad duration never leaves
+                # a half-processed viseme on the display
+                continue
+            self.writer.write("mouth.viseme=" + vis_dur[0])
+            remaining = end_time - (time.time() - time_start)
+            if remaining > 0:
+                time.sleep(remaining)
+
     def text(self, event=None):
         text = ""
         if event and event.metadata:

diff --git a/mycroft/client/enclosure/version.txt b/mycroft/client/enclosure/version.txt
@@ -1 +1 @@
-0.1.12
+0.1.13
diff --git a/mycroft/client/speech/main.py b/mycroft/client/speech/main.py
@@ -64,7 +64,7 @@ def mute_and_speak(utterance):
     try:
         logger.info("Speak: " + utterance)
         loop.mute()
-        tts.execute(utterance)
+        tts.execute(utterance, client)
     finally:
         loop.unmute()
         mutex.release()

diff --git a/mycroft/tts/__init__.py b/mycroft/tts/__init__.py
@@ -41,7 +41,7 @@ def __init__(self, lang, voice, filename='/tmp/tts.wav'):
         self.filename = filename
 
     @abc.abstractmethod
-    def execute(self, sentence):
+    def execute(self, sentence, client):
         pass
 
 

diff --git a/mycroft/tts/espeak_tts.py b/mycroft/tts/espeak_tts.py
@@ -29,7 +29,7 @@ class ESpeak(TTS):
     def __init__(self, lang, voice):
         super(ESpeak, self).__init__(lang, voice)
 
-    def execute(self, sentence):
+    def execute(self, sentence, client):
         subprocess.call(
             ['espeak', '-v', self.lang + '+' + self.voice, sentence])
 

diff --git a/mycroft/tts/google_tts.py b/mycroft/tts/google_tts.py
@@ -30,7 +30,7 @@ class GoogleTTS(TTS):
     def __init__(self, lang, voice):
         super(GoogleTTS, self).__init__(lang, voice)
 
-    def execute(self, sentence):
+    def execute(self, sentence, client):
         tts = gTTS(text=sentence, lang=self.lang)
         tts.save(self.filename)
         play_wav(self.filename)

diff --git a/mycroft/tts/mimic_tts.py b/mycroft/tts/mimic_tts.py
@@ -15,14 +15,17 @@
 # You should have received a copy of the GNU General Public License
 # along with Mycroft Core.  If not, see <http://www.gnu.org/licenses/>.
 
-
 import subprocess
 from os.path import join
 import re
+import random
+import os
+import time
 
 from mycroft import MYCROFT_ROOT_PATH
 from mycroft.tts import TTS, TTSValidator
 from mycroft.configuration import ConfigurationManager
+from mycroft.client.enclosure.api import EnclosureAPI
 
 __author__ = 'jdorleans'
 
@@ -32,13 +35,115 @@
 BIN = config.get(
     "mimic.path", join(MYCROFT_ROOT_PATH, 'mimic', 'bin', 'mimic'))
 
+# Mapping based on Jeffers phoneme to viseme map, seen in table 1 from:
+# http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.221.6377&rep=rep1&type=pdf
+#
+# Mycroft unit visemes based on images found at:
+#   http://www.web3.lu/wp-content/uploads/2014/09/visemes.jpg
+# and mapping was created partially based on the "12 mouth shapes"
+# visuals seen at:
+#   https://wolfpaulus.com/journal/software/lipsynchronization/
+# with final viseme group to image mapping by Steve Penrod
+
 
 class Mimic(TTS):
+
     def __init__(self, lang, voice):
         super(Mimic, self).__init__(lang, voice)
 
-    def execute(self, sentence):
-        subprocess.call([BIN, '-voice', self.voice, '-t', sentence])
+    def PhonemeToViseme(self, pho):
+        """Return the viseme code (a one-character string) for a phoneme.
+
+        Phonemes are listed per Jeffers group; several groups share the
+        same viseme.  Any phoneme not listed maps to '4'.
+        """
+        jeffers_groups = (
+            # /A group
+            ('5', ('v', 'f')),
+            # /B group
+            ('2', ('uh', 'w', 'uw', 'er', 'r', 'ow')),
+            # /C group
+            ('4', ('b', 'p', 'm')),
+            # /D group
+            ('1', ('aw',)),
+            # /E group
+            ('3', ('th', 'dh')),
+            # /F group
+            ('3', ('zh', 'ch', 'sh', 'jh')),
+            # /G group
+            ('6', ('oy', 'ao')),
+            # /H group
+            ('3', ('z', 's')),
+            # /I group
+            ('0', ('ae', 'eh', 'ey', 'ah', 'ih', 'y', 'iy', 'aa', 'ay',
+                   'ax', 'hh')),
+            # /J group
+            ('3', ('n', 't', 'd', 'l')),
+            # /K group
+            ('3', ('g', 'ng', 'k')),
+            # blank mouth
+            ('4', ('pau',)),
+        )
+
+        # flatten the groups into a phoneme -> viseme lookup table
+        viseme_for = {}
+        for viseme, phonemes in jeffers_groups:
+            for phoneme in phonemes:
+                viseme_for[phoneme] = viseme
+
+        # 4 is default if pho not found
+        return viseme_for.get(pho, '4')
+
+    def execute(self, sentence, client):
+        """Speak sentence with mimic while animating the enclosure mouth.
+
+        mimic is run once to render the speech to a WAV file and to
+        output the phoneme:duration pairs for it.  The phonemes are
+        converted to viseme codes and sent to the enclosure, then the
+        WAV is played with aplay.  The WAV file is removed afterwards,
+        including when mimic or aplay fails.
+
+        Args:
+            sentence: text to speak
+            client: client handed to EnclosureAPI to emit the enclosure
+                messages
+
+        Raises:
+            subprocess.CalledProcessError: if mimic exits with an error
+        """
+        enclosure = EnclosureAPI(client)
+        wav_file = "/tmp/mimic.wav"
+
+        # blink 50% of the time before speaking (only shows up if the
+        # mimic TTS generation takes fairly long)
+        if random.random() < 0.5:
+            enclosure.eyes_blink("b")
+
+        try:
+            # invoke mimic, creating WAV and outputting phoneme:duration
+            # pairs
+            out_mimic = subprocess.check_output(
+                [BIN, '-voice', self.voice, '-t', sentence, '-psdur',
+                 '-o', wav_file])
+            if not isinstance(out_mimic, str):
+                # check_output hands back bytes on Python 3
+                out_mimic = out_mimic.decode('utf-8')
+
+            # convert phonemes to visemes; split on any whitespace so a
+            # trailing newline does not end up inside a duration, and
+            # skip anything that is not a phoneme:duration pair
+            vis_codes = []
+            for pair in out_mimic.split():
+                pho_dur = pair.split(":")
+                if len(pho_dur) != 2:
+                    continue
+                vis_codes.append(
+                    self.PhonemeToViseme(pho_dur[0]) + ":" +
+                    pho_dur[1] + ",")
+
+            # play WAV and walk thru visemes while it plays
+            enclosure.mouth_viseme("".join(vis_codes))
+            subprocess.call(['aplay', wav_file])
+        finally:
+            # delete WAV; it may not exist if mimic failed before
+            # writing it
+            try:
+                os.remove(wav_file)
+            except OSError:
+                pass
+
+        # after speaking, blink 20% of the time
+        if random.random() < 0.2:
+            enclosure.eyes_blink("b")
 
 
 class MimicValidator(TTSValidator):

diff --git a/mycroft/tts/remote_tts.py b/mycroft/tts/remote_tts.py
@@ -44,7 +44,7 @@ def __init__(self, lang, voice, url, api_path):
         self.url = remove_last_slash(url)
         self.session = FuturesSession()
 
-    def execute(self, sentence):
+    def execute(self, sentence, client):
         phrases = self.__get_phrases(sentence)
 
         if len(phrases) > 0:

diff --git a/mycroft/tts/spdsay_tts.py b/mycroft/tts/spdsay_tts.py
@@ -29,7 +29,7 @@ class SpdSay(TTS):
     def __init__(self, lang, voice):
         super(SpdSay, self).__init__(lang, voice)
 
-    def execute(self, sentence):
+    def execute(self, sentence, client):
         subprocess.call(
             ['spd-say', '-l', self.lang, '-t', self.voice, sentence])