From 76b2680b6e2074e78392eb1e8a1f18a88a40a1f2 Mon Sep 17 00:00:00 2001
From: jonathan
Date: Fri, 21 Jul 2023 13:43:54 +0000
Subject: [PATCH 1/4] feat: add gladia service

---
Reviewer notes (text in this section is commentary, not part of the commit):
 - websocketUrl is only assigned in generateWebsocketUrl(), which only
   initStreamingSession() calls; sendSingleRequest() reads websocketUrl
   directly.
 - The WebSocketClient instances started in sendSingleRequest() and in the
   GladiaWebsocketStreamingSession constructor are never stopped in this patch.
 - The IllegalArgumentException message in sendSingleRequest() concatenates
   its three parts without separating spaces.
 - sendRequest() and end() dereference `session`, which is null until
   onConnect() runs and again after onClose(); the resulting exception is
   caught and logged by the surrounding catch (Exception e).

 .../GladiaTranscriptionService.java           | 419 ++++++++++++++++++
 1 file changed, 419 insertions(+)
 create mode 100644 src/main/java/org/jitsi/jigasi/transcription/GladiaTranscriptionService.java

diff --git a/src/main/java/org/jitsi/jigasi/transcription/GladiaTranscriptionService.java b/src/main/java/org/jitsi/jigasi/transcription/GladiaTranscriptionService.java
new file mode 100644
index 000000000..337925d26
--- /dev/null
+++ b/src/main/java/org/jitsi/jigasi/transcription/GladiaTranscriptionService.java
@@ -0,0 +1,419 @@
+/*
+ * Jigasi, the JItsi GAteway to SIP.
+ *
+ * Copyright @ 2023 - present 8x8, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.jitsi.jigasi.transcription;
+
+import org.eclipse.jetty.websocket.api.*;
+import org.eclipse.jetty.websocket.api.annotations.*;
+import org.eclipse.jetty.websocket.client.*;
+import org.json.*;
+import org.jitsi.jigasi.*;
+import org.jitsi.utils.logging.*;
+
+import javax.media.format.*;
+import java.io.*;
+import java.net.*;
+import java.nio.*;
+import java.time.*;
+import java.util.*;
+import java.util.concurrent.*;
+import java.util.function.*;
+import com.google.gson.Gson;
+
+
+/**
+ * Implements a TranscriptionService which uses Gladia transcription services
+ * <p>
+ */
+public class GladiaTranscriptionService
+    implements TranscriptionService
+{
+
+    /**
+     * The logger for this class
+     */
+    private final static Logger logger
+        = Logger.getLogger(GladiaTranscriptionService.class);
+
+    private static Gson gson = new Gson();
+
+
+    /**
+     * The config key of the websocket to the speech-to-text service.
+     */
+    public final static String WEBSOCKET_URL
+        = "org.jitsi.jigasi.transcription.gladia.websocket_url";
+
+    private final static String X_GLADIA_KEY = "org.jitsi.jigasi.transcription.gladia.api_key";
+
+
+    public final static String DEFAULT_WEBSOCKET_URL = "wss://api.gladia.io/audio/text/audio-transcription";
+
+    private final static String EOF_MESSAGE = "{\"eof\" : 1}";
+
+    /**
+     * The config value of the websocket to the speech-to-text service.
+     */
+    private String websocketUrlConfig;
+
+    /**
+     * The URL of the websocket to the speech-to-text service.
+     */
+    private String websocketUrl;
+
+    private String apiKey;
+    /**
+     * Assigns the websocketUrl to use to websocketUrl by reading websocketUrlConfig;
+     */
+    private void generateWebsocketUrl(Participant participant)
+        throws org.json.simple.parser.ParseException
+    {
+        websocketUrl = websocketUrlConfig;
+        return;
+    }
+
+    /**
+     * Create a TranscriptionService which will send audio to the Gladia service
+     * platform to get a transcription
+     */
+    public GladiaTranscriptionService()
+    {
+        websocketUrlConfig = JigasiBundleActivator.getConfigurationService()
+                .getString(WEBSOCKET_URL, DEFAULT_WEBSOCKET_URL);
+        logger.info("" + websocketUrlConfig);
+        apiKey= JigasiBundleActivator.getConfigurationService()
+                .getString(X_GLADIA_KEY);
+    }
+
+    /**
+     * No configuration required yet
+     */
+    public boolean isConfiguredProperly()
+    {
+        return true;
+    }
+
+    /**
+     * If the websocket url is a JSON, language routing is supported
+     */
+    public boolean supportsLanguageRouting()
+    {
+        return websocketUrlConfig.trim().startsWith("{");
+    }
+
+    /**
+     * Sends audio as an array of bytes to Gladia service
+     *
+     * @param request the TranscriptionRequest which holds the audio to be sent
+     * @param resultConsumer a Consumer which will handle the
+     *                       TranscriptionResult
+     */
+    @Override
+    public void sendSingleRequest(final TranscriptionRequest request,
+                                  final Consumer<TranscriptionResult> resultConsumer)
+    {
+        // Try to create the client, which can throw an IOException
+        try
+        {
+            // Set the sampling rate and encoding of the audio
+            AudioFormat format = request.getFormat();
+            if (!format.getEncoding().equals("LINEAR"))
+            {
+                throw new IllegalArgumentException("Given AudioFormat" +
+                        "has unexpected" +
+                        "encoding");
+            }
+            Instant timeRequestReceived = Instant.now();
+
+            WebSocketClient ws = new WebSocketClient();
+            GladiaWebsocketSession socket = new GladiaWebsocketSession(request);
+            ws.start();
+            ws.connect(socket, new URI(websocketUrl));
+            socket.awaitClose();
+            resultConsumer.accept(
+                    new TranscriptionResult(
+                            null,
+                            UUID.randomUUID(),
+                            timeRequestReceived,
+                            false,
+                            request.getLocale().toLanguageTag(),
+                            0,
+                            new TranscriptionAlternative(socket.getResult())));
+        }
+        catch (Exception e)
+        {
+            logger.error("Error sending single req", e);
+        }
+    }
+
+    @Override
+    public StreamingRecognitionSession initStreamingSession(Participant participant)
+        throws UnsupportedOperationException
+    {
+        try
+        {
+            generateWebsocketUrl(participant);
+            GladiaWebsocketStreamingSession streamingSession = new GladiaWebsocketStreamingSession(
+                    participant.getDebugName());
+            streamingSession.transcriptionTag = participant.getTranslationLanguage();
+            if (streamingSession.transcriptionTag == null)
+            {
+                streamingSession.transcriptionTag = participant.getSourceLanguage();
+            }
+            return streamingSession;
+        }
+        catch (Exception e)
+        {
+            throw new UnsupportedOperationException("Failed to create streaming session", e);
+        }
+    }
+
+    @Override
+    public boolean supportsFragmentTranscription()
+    {
+        return true;
+    }
+
+    @Override
+    public boolean supportsStreamRecognition()
+    {
+        return true;
+    }
+
+    /**
+     * A Transcription session for transcribing streams, handles
+     * the lifecycle of websocket
+     */
+    @WebSocket
+    public class GladiaWebsocketStreamingSession
+        implements StreamingRecognitionSession
+    {
+        private Session session;
+        /* The name of the participant */
+        private final String debugName;
+        /* The sample rate of the audio stream we collect from the first request */
+        private Integer sampleRate = -1;
+        /* Last returned result so we do not return the same string twice */
+        private String lastResult = "";
+        /* Transcription language requested by the user who requested the transcription */
+        private String transcriptionTag = "en-US";
+
+        /**
+         * List of TranscriptionListeners which will be notified when a
+         * result comes in
+         */
+        private final List<TranscriptionListener> listeners = new ArrayList<>();
+
+        /**
+         * Latest assigned UUID to a transcription result.
+         * A new one has to be generated whenever a definitive result is received.
+         */
+        private UUID uuid = UUID.randomUUID();
+
+        GladiaWebsocketStreamingSession(String debugName)
+            throws Exception
+        {
+            this.debugName = debugName;
+            WebSocketClient ws = new WebSocketClient();
+            ws.start();
+            ws.connect(this, new URI(websocketUrl));
+        }
+
+        @OnWebSocketClose
+        public void onClose(int statusCode, String reason)
+        {
+            this.session = null;
+        }
+
+        @OnWebSocketConnect
+        public void onConnect(Session session)
+        {
+            this.session = session;
+        }
+
+        @OnWebSocketMessage
+        public void onMessage(String msg)
+        {
+            boolean partial = true;
+            String result = "";
+            if (logger.isDebugEnabled())
+                logger.debug(debugName + "Recieved response: " + msg);
+            JSONObject obj = new JSONObject(msg);
+            boolean hasType = obj.has("type");
+            if (hasType)
+            {
+                String type = obj.getString("type");
+                if (type.equals("final"))
+                {
+                    partial = false;
+                }
+                result = obj.getString("transcription");
+            }
+
+            if (!result.isEmpty() && (!partial || !result.equals(lastResult)))
+            {
+                lastResult = result;
+                for (TranscriptionListener l : listeners)
+                {
+                    l.notify(new TranscriptionResult(
+                            null,
+                            uuid,
+                            // this time needs to be the one when the audio was sent
+                            // the results need to be matched with the time when we sent the audio, so we have
+                            // the real time when this transcription was started
+                            Instant.now(),
+                            partial,
+                            transcriptionTag,
+                            1.0,
+                            new TranscriptionAlternative(result)));
+                }
+            }
+
+            if (!partial)
+            {
+                this.uuid = UUID.randomUUID();
+            }
+        }
+
+        @OnWebSocketError
+        public void onError(Throwable cause)
+        {
+            logger.error("Error while streaming audio data to transcription service" , cause);
+        }
+
+        public void sendRequest(TranscriptionRequest request)
+        {
+            try
+            {
+                if (sampleRate < 0)
+                {
+                    sampleRate = (int) request.getFormat().getSampleRate();
+                }
+                ByteBuffer audioBuffer = ByteBuffer.wrap(request.getAudio());
+                String encodedAudioData = Base64.getEncoder().encodeToString(audioBuffer.array());
+                Map<String, Object> message = new HashMap<>();
+                message.put("x_gladia_key", apiKey);
+                message.put("sample_rate", sampleRate);
+                message.put("frames", encodedAudioData);
+                // message.put("reinject_context", "true");
+                // message.put("language", "english");
+                session.getRemote().sendString(gson.toJson(message));
+            }
+            catch (Exception e)
+            {
+                logger.error("Error to send websocket request for participant " + debugName, e);
+            }
+        }
+
+        public void addTranscriptionListener(TranscriptionListener listener)
+        {
+            listeners.add(listener);
+        }
+
+        public void end()
+        {
+            try
+            {
+                session.getRemote().sendString(EOF_MESSAGE);
+            }
+            catch (Exception e)
+            {
+                logger.error("Error to finalize websocket connection for participant " + debugName, e);
+            }
+        }
+
+        public boolean ended()
+        {
+            return session == null;
+        }
+    }
+
+    /**
+     * Session to send websocket data and recieve results. Non-streaming version
+     */
+    @WebSocket
+    public class GladiaWebsocketSession
+    {
+        /* Signal for the end of operation */
+        private final CountDownLatch closeLatch;
+
+        /* Request we need to process */
+        private final TranscriptionRequest request;
+
+        /* Collect results*/
+        private StringBuilder result;
+
+        GladiaWebsocketSession(TranscriptionRequest request)
+        {
+            this.closeLatch = new CountDownLatch(1);
+            this.request = request;
+            this.result = new StringBuilder();
+        }
+
+        @OnWebSocketClose
+        public void onClose(int statusCode, String reason)
+        {
+            this.closeLatch.countDown(); // trigger latch
+        }
+
+        @OnWebSocketConnect
+        public void onConnect(Session session)
+        {
+            try
+            {
+                AudioFormat format = request.getFormat();
+                ByteBuffer audioBuffer = ByteBuffer.wrap(request.getAudio());
+                String encodedAudioData = Base64.getEncoder().encodeToString(audioBuffer.array());
+                Map<String, Object> message = new HashMap<>();
+                message.put("x_gladia_key", apiKey);
+                message.put("sample_rate", (int) format.getSampleRate());
+                message.put("frames", encodedAudioData);
+
+                session.getRemote().sendString(gson.toJson(message));
+            }
+            catch (IOException e)
+            {
+                logger.error("Error to transcribe audio", e);
+            }
+        }
+
+        @OnWebSocketMessage
+        public void onMessage(String msg)
+        {
+            result.append(msg);
+            result.append('\n');
+        }
+
+        @OnWebSocketError
+        public void onError(Throwable cause)
+        {
+            logger.error("Websocket connection error", cause);
+        }
+
+        public String getResult()
+        {
+            return result.toString();
+        }
+
+        void awaitClose()
+            throws InterruptedException
+        {
+            closeLatch.await();
+        }
+    }
+
+}

From 66931bc37495329c058299f6fc28549eafdb0414 Mon Sep 17 00:00:00 2001
From: jonathan
Date: Tue, 12 Sep 2023 18:59:19 +0000
Subject: [PATCH 2/4] feat: gladia logger

---
Reviewer notes (text in this section is commentary, not part of the commit):
 - onMessage(): the utterance type is now tested with
   FINAL_KEY.equals(obj.getString(TYPE_KEY)) instead of `==`. On String
   operands `==` compares object identity, and a value parsed out of the
   JSON message is not guaranteed to be the same object as the FINAL_KEY
   constant; with `==` `partial` could stay true for "final" messages and
   the result UUID would then never be renewed.
 - The class javadoc added here ("local ... websocket transcription
   service", the alphacep URL, the @author list) and the copyright year
   change look carried over from another service implementation --
   TODO confirm before merging.
 - Every received message and every transcription is now logged at info
   level (previously debug, guarded by isDebugEnabled()).

 .../GladiaTranscriptionService.java           | 66 +++++++++++++------
 1 file changed, 45 insertions(+), 21 deletions(-)

diff --git a/src/main/java/org/jitsi/jigasi/transcription/GladiaTranscriptionService.java b/src/main/java/org/jitsi/jigasi/transcription/GladiaTranscriptionService.java
index 337925d26..b1c99fe09 100644
--- a/src/main/java/org/jitsi/jigasi/transcription/GladiaTranscriptionService.java
+++ b/src/main/java/org/jitsi/jigasi/transcription/GladiaTranscriptionService.java
@@ -1,7 +1,7 @@
 /*
  * Jigasi, the JItsi GAteway to SIP.
  *
- * Copyright @ 2023 - present 8x8, Inc.
+ * Copyright @ 2018 - present 8x8, Inc.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -36,8 +36,15 @@
 
 
 /**
- * Implements a TranscriptionService which uses Gladia transcription services
+ * Implements a TranscriptionService which uses local
+ * Gladia websocket transcription service.
  * <p>
+ * See https://github.com/alphacep/Gladia-server for
+ * information about server
+ *
+ * @author Nik Vaessen
+ * @author Damian Minkov
+ * @author Nickolay V. Shmyrev
  */
 public class GladiaTranscriptionService
     implements TranscriptionService
@@ -51,6 +58,13 @@ public class GladiaTranscriptionService
 
     private static Gson gson = new Gson();
 
+    public final static String EMPTY_STRING = "";
+
+    public final static String TRANSCRIPTION_KEY = "transcription";
+
+    public final static String TYPE_KEY = "type";
+
+    public final static String FINAL_KEY = "final";
 
     /**
      * The config key of the websocket to the speech-to-text service.
@@ -208,7 +222,7 @@ public class GladiaWebsocketStreamingSession
         /* The sample rate of the audio stream we collect from the first request */
         private Integer sampleRate = -1;
         /* Last returned result so we do not return the same string twice */
-        private String lastResult = "";
+        private String lastTranscription = "";
         /* Transcription language requested by the user who requested the transcription */
         private String transcriptionTag = "en-US";
 
@@ -248,25 +262,34 @@ public void onConnect(Session session)
         @OnWebSocketMessage
         public void onMessage(String msg)
         {
-            boolean partial = true;
-            String result = "";
-            if (logger.isDebugEnabled())
-                logger.debug(debugName + "Recieved response: " + msg);
+            // log json message
+            logger.info(debugName + "on message: " + msg);
+
+            // create the json object
             JSONObject obj = new JSONObject(msg);
-            boolean hasType = obj.has("type");
-            if (hasType)
+
+            // retrieve the transcription of the utterance
+            boolean hasTranscription = obj.has(TRANSCRIPTION_KEY);
+            String transcription = null;
+            if (hasTranscription)
             {
-                String type = obj.getString("type");
-                if (type.equals("final"))
-                {
-                    partial = false;
-                }
-                result = obj.getString("transcription");
-            }
+                transcription = obj.getString(TRANSCRIPTION_KEY);
+            }
+
+
+            // retrieve the type of utterance
+            Boolean partial = true;
+            boolean hasType = obj.has(TYPE_KEY);
+            if (hasType && FINAL_KEY.equals(obj.getString(TYPE_KEY)))
+            {
+                partial = false;
+            }
 
-            if (!result.isEmpty() && (!partial || !result.equals(lastResult)))
+            // notify the listeners
+            if (transcription != null && (!partial || !transcription.equals(lastTranscription)))
             {
-                lastResult = result;
+                logger.info("transcription: " + transcription);
+                lastTranscription = transcription;
                 for (TranscriptionListener l : listeners)
                 {
                     l.notify(new TranscriptionResult(
@@ -279,10 +302,11 @@ public void onMessage(String msg)
                             partial,
                             transcriptionTag,
                             1.0,
-                            new TranscriptionAlternative(result)));
+                            new TranscriptionAlternative(transcription)));
                 }
             }
 
+            // if final, renew the id
             if (!partial)
             {
                 this.uuid = UUID.randomUUID();
@@ -299,6 +323,7 @@ public void sendRequest(TranscriptionRequest request)
         {
             try
             {
+                // actual code that sends the audio data
                 if (sampleRate < 0)
                 {
                     sampleRate = (int) request.getFormat().getSampleRate();
@@ -309,8 +334,7 @@ public void sendRequest(TranscriptionRequest request)
                 message.put("x_gladia_key", apiKey);
                 message.put("sample_rate", sampleRate);
                 message.put("frames", encodedAudioData);
-                // message.put("reinject_context", "true");
-                // message.put("language", "english");
+                message.put("reinject_context", "true");
                 session.getRemote().sendString(gson.toJson(message));
             }
             catch (Exception e)

From cea3f0bc2a619d6a8569527cc53e9704b392a0aa Mon Sep 17 00:00:00 2001
From: jonathan
Date: Thu, 14 Sep 2023 12:51:02 +0000
Subject: [PATCH 3/4] fix: toto

---
 .../GladiaTranscriptionService.java           | 62 +++++++++++++------
 1 file changed, 44 insertions(+), 18 deletions(-)

diff --git a/src/main/java/org/jitsi/jigasi/transcription/GladiaTranscriptionService.java b/src/main/java/org/jitsi/jigasi/transcription/GladiaTranscriptionService.java
index b1c99fe09..1d4d7ab1a 100644
--- a/src/main/java/org/jitsi/jigasi/transcription/GladiaTranscriptionService.java
+++ b/src/main/java/org/jitsi/jigasi/transcription/GladiaTranscriptionService.java
@@ -64,8 +64,12 @@ public class GladiaTranscriptionService
 
     public final static String TYPE_KEY = "type";
 
+    public final static String TYPE_DURATION = "duration";
+
     public final static String FINAL_KEY = "final";
 
+    public final static float MINIMUM_AUDIO_DURATION = 3;
+
     /**
      * The config key of the websocket to the speech-to-text service.
      */
@@ -276,7 +280,6 @@ public void onMessage(String msg)
                 transcription = obj.getString(TRANSCRIPTION_KEY);
             }
 
-
             // retrieve the type of utterance
             Boolean partial = true;
             boolean hasType = obj.has(TYPE_KEY);
@@ -284,26 +287,49 @@ public void onMessage(String msg)
             {
                 partial = false;
             }
+
+            // retrieve the duration of utterance
+            float duration = 0;
+            boolean hasDuration = obj.has(TYPE_DURATION);
+            if (hasDuration)
+            {
+                duration = obj.getFloat(TYPE_DURATION);
+            }
 
+            // no transcription, no work
+            if (!hasTranscription)
+            {
+                return;
+            }
+
+            // same transcription, no work
+            if (partial && transcription.equals(lastTranscription))
+            {
+                return;
+            }
+
+            // partial less than 3s, no work
+            if (partial && duration > MINIMUM_AUDIO_DURATION)
+            {
+                return;
+            }
+
             // notify the listeners
-            if (transcription != null && (!partial || !transcription.equals(lastTranscription)))
+            logger.info("transcription: " + transcription);
+            lastTranscription = transcription;
+            for (TranscriptionListener l : listeners)
             {
-                logger.info("transcription: " + transcription);
-                lastTranscription = transcription;
-                for (TranscriptionListener l : listeners)
-                {
-                    l.notify(new TranscriptionResult(
-                            null,
-                            uuid,
-                            // this time needs to be the one when the audio was sent
-                            // the results need to be matched with the time when we sent the audio, so we have
-                            // the real time when this transcription was started
-                            Instant.now(),
-                            partial,
-                            transcriptionTag,
-                            1.0,
-                            new TranscriptionAlternative(transcription)));
-                }
+                l.notify(new TranscriptionResult(
+                        null,
+                        uuid,
+                        // this time needs to be the one when the audio was sent
+                        // the results need to be matched with the time when we sent the audio, so we have
+                        // the real time when this transcription was started
+                        Instant.now(),
+                        partial,
+                        transcriptionTag,
+                        1.0,
+                        new TranscriptionAlternative(transcription)));
             }
 
             // if final, renew the id

From 9513300dbcba84ce83d8a99ffed0a037fd3a971a Mon Sep 17 00:00:00 2001
From: jonathan
Date: Thu, 14 Sep 2023 13:13:14 +0000
Subject: [PATCH 4/4] fix: toto

---
 .../jitsi/jigasi/transcription/GladiaTranscriptionService.java  | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/main/java/org/jitsi/jigasi/transcription/GladiaTranscriptionService.java b/src/main/java/org/jitsi/jigasi/transcription/GladiaTranscriptionService.java
index 1d4d7ab1a..fa9614e56 100644
--- a/src/main/java/org/jitsi/jigasi/transcription/GladiaTranscriptionService.java
+++ b/src/main/java/org/jitsi/jigasi/transcription/GladiaTranscriptionService.java
@@ -309,7 +309,7 @@ public void onMessage(String msg)
             }
 
             // partial less than 3s, no work
-            if (partial && duration > MINIMUM_AUDIO_DURATION)
+            if (partial && duration < MINIMUM_AUDIO_DURATION)
             {
                 return;
             }