Support setting voice inside the TTS stream (#8)

* Support setting voice inside the TTS stream

  This change supports switching the voice on the fly in response to the STREAMING_SET_VOICE event. The event value is stored in the TTS machine context (`currentVoice`). If the value is empty (""), the TTS voice falls back to the agenda voice and then to the default voice.

* Provide a type for the external (input/output) events of SpeechState
* Version 2.4.0
vladmaraev · Aug 16, 2024 · be7eed4 · be7eed4
1 parent e268656
commit be7eed4
Show file tree

Hide file tree

Showing 5 changed files with 55 additions and 28 deletions.
diff --git a/package.json b/package.json
@@ -1,6 +1,6 @@
 {
   "name": "speechstate",
-  "version": "2.3.0",
+  "version": "2.4.0",
   "license": "GPL-3.0",
   "homepage": "http://localhost/speechstate",
   "main": "./dist/index.js",

diff --git a/src/index.ts b/src/index.ts
@@ -6,3 +6,5 @@ export {
   Agenda,
   RecogniseParameters,
 } from "./types";
+
+export type { SpeechStateExternalEvent } from "./types";
diff --git a/src/speechstate.ts b/src/speechstate.ts
@@ -8,44 +8,20 @@ import {
 import { ttsMachine } from "./tts";
 import { asrMachine } from "./asr";
 
-import { Settings, Agenda, Hypothesis, RecogniseParameters } from "./types";
+import { Settings, SpeechStateEvent } from "./types";
 interface SSContext {
   settings: Settings;
   audioContext?: AudioContext;
   asrRef?: any;
   ttsRef?: any;
 }
 
-/** events sent to the spawned `speechstate` machine **/
-type SSEventExtIn =
-  | { type: "PREPARE" }
-  | { type: "CONTROL" }
-  | { type: "STOP" }
-  | { type: "SPEAK"; value: Agenda }
-  | { type: "LISTEN"; value: RecogniseParameters };
-
-/** for sendParent, not type-checked */
-type SSEventExtOut =
-  | { type: "ASR_NOINPUT" }
-  | { type: "ASRTTS_READY" }
-  | { type: "ASR_STARTED" }
-  | { type: "TTS_STARTED" }
-  | { type: "SPEAK_COMPLETE" }
-  | { type: "RECOGNISED"; value: Hypothesis[]; nluValue?: any };
-
-type SSEventIntIn =
-  | { type: "TTS_READY" }
-  | { type: "ASR_READY" }
-  | { type: "TTS_ERROR" };
-
-type SSEvent = SSEventIntIn | SSEventExtIn | SSEventExtOut;
-
 const speechstate = createMachine(
   {
     types: {} as {
       input: Settings;
       context: SSContext;
-      events: SSEvent;
+      events: SpeechStateEvent;
     },
     context: ({ input }) => ({
       settings: input,

diff --git a/src/tts.ts b/src/tts.ts
@@ -24,6 +24,7 @@ interface TTSContext extends TTSInit {
   wsaUtt?: MySpeechSynthesisUtterance;
   agenda?: Agenda;
   buffer?: string;
+  currentVoice?: string;
   utteranceFromStream?: string;
 }
 
@@ -48,6 +49,7 @@ type TTSEvent =
   | { type: "SPEAK"; value: Agenda }
   | { type: "TTS_STARTED" }
   | { type: "STREAMING_CHUNK"; value: string }
+  | { type: "STREAMING_SET_VOICE"; value: string }
   | { type: "STREAMING_DONE" }
   | { type: "SPEAK_COMPLETE" };
 
@@ -72,6 +74,17 @@ export const ttsMachine = setup({
           context.buffer.substring(spaceIndex),
       };
     }),
+    assignCurrentVoice: assign(
+      ({
+        event,
+      }: {
+        event: { type: "STREAMING_SET_VOICE"; value: string };
+      }) => {
+        return {
+          currentVoice: event.value,
+        };
+      },
+    ),
   },
   actors: {
     getToken: getToken,
@@ -90,6 +103,10 @@ export const ttsMachine = setup({
           console.log("received streaming chunk:", event);
           sendBack({ type: "STREAMING_CHUNK", value: event.data });
         });
+        eventSource.addEventListener("STREAMING_SET_VOICE", (event) => {
+          console.log("received streaming voice set command:", event);
+          sendBack({ type: "STREAMING_SET_VOICE", value: event.data });
+        });
       },
     ),
     ponyfill: fromCallback<null, TTSPonyfillInput>(({ sendBack, input }) => {
@@ -228,6 +245,11 @@ export const ttsMachine = setup({
           states: {
             Buffer: {
               initial: "BufferIdle",
+              on: {
+                STREAMING_SET_VOICE: {
+                  actions: "assignCurrentVoice",
+                },
+              },
               states: {
                 BufferIdle: {
                   id: "BufferIdle",
@@ -389,7 +411,9 @@ export const ttsMachine = setup({
                           wsaUtt: context.wsaUtt,
                           ttsLexicon: context.ttsLexicon,
                           voice:
-                            context.agenda.voice || context.ttsDefaultVoice,
+                            context.currentVoice ||
+                            context.agenda.voice ||
+                            context.ttsDefaultVoice,
                           utterance: context.utteranceFromStream,
                         }),
                       },

diff --git a/src/types.ts b/src/types.ts
@@ -46,3 +46,28 @@ export interface RecogniseParameters {
   hints?: string[];
   nlu?: boolean | AzureLanguageCredentials;
 }
+
+/** events sent to the spawned `speechstate` machine **/
+type SSEventExtIn =
+  | { type: "PREPARE" }
+  | { type: "CONTROL" }
+  | { type: "STOP" }
+  | { type: "SPEAK"; value: Agenda }
+  | { type: "LISTEN"; value: RecogniseParameters };
+
+/** for sendParent, not type-checked */
+type SSEventExtOut =
+  | { type: "ASR_NOINPUT" }
+  | { type: "ASRTTS_READY" }
+  | { type: "ASR_STARTED" }
+  | { type: "TTS_STARTED" }
+  | { type: "SPEAK_COMPLETE" }
+  | { type: "RECOGNISED"; value: Hypothesis[]; nluValue?: any };
+
+type SSEventIntIn =
+  | { type: "TTS_READY" }
+  | { type: "ASR_READY" }
+  | { type: "TTS_ERROR" };
+
+export type SpeechStateExternalEvent = SSEventExtIn | SSEventExtOut;
+export type SpeechStateEvent = SSEventIntIn | SpeechStateExternalEvent;