From 10107d482e3dd41057c5c9c930f6710aabf674c5 Mon Sep 17 00:00:00 2001
From: Roy Shilkrot <roy.shil@gmail.com>
Date: Fri, 13 Sep 2024 21:06:49 -0400
Subject: [PATCH 01/20] chore: Update ONNX Runtime version to 1.19.2 and adjust
 corresponding hashes

---
 cmake/FetchOnnxruntime.cmake | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/cmake/FetchOnnxruntime.cmake b/cmake/FetchOnnxruntime.cmake
index 0ed2975..940bada 100644
--- a/cmake/FetchOnnxruntime.cmake
+++ b/cmake/FetchOnnxruntime.cmake
@@ -8,7 +8,7 @@ set(CUSTOM_ONNXRUNTIME_HASH
     ""
     CACHE STRING "Hash of a downloaded ONNX Runtime tarball")
 
-set(Onnxruntime_VERSION "1.17.1")
+set(Onnxruntime_VERSION "1.19.2")
 
 if(CUSTOM_ONNXRUNTIME_URL STREQUAL "")
   set(USE_PREDEFINED_ONNXRUNTIME ON)
@@ -25,17 +25,17 @@ if(USE_PREDEFINED_ONNXRUNTIME)
 
   if(APPLE)
     set(Onnxruntime_URL "${Onnxruntime_BASEURL}/onnxruntime-osx-universal2-${Onnxruntime_VERSION}.tgz")
-    set(Onnxruntime_HASH SHA256=9FA57FA6F202A373599377EF75064AE568FDA8DA838632B26A86024C7378D306)
+    set(Onnxruntime_HASH SHA256=b0289ddbc32f76e5d385abc7b74cc7c2c51cdf2285b7d118bf9d71206e5aee3a)
   elseif(MSVC)
     set(Onnxruntime_URL "${Onnxruntime_BASEURL}/onnxruntime-win-x64-${Onnxruntime_VERSION}.zip")
-    set(OOnnxruntime_HASH SHA256=4802AF9598DB02153D7DA39432A48823FF69B2FB4B59155461937F20782AA91C)
+    set(Onnxruntime_HASH SHA256=dc4f841e511977c0a4f02e5066c3d9a58427644010ab4f89b918614a1cd4c2b0) # NOTE(review): same digest as the linux-aarch64 entry below - confirm it matches the win-x64 zip
   else()
     if(CMAKE_SYSTEM_PROCESSOR STREQUAL "aarch64")
       set(Onnxruntime_URL "${Onnxruntime_BASEURL}/onnxruntime-linux-aarch64-${Onnxruntime_VERSION}.tgz")
-      set(Onnxruntime_HASH SHA256=70B6F536BB7AB5961D128E9DBD192368AC1513BFFB74FE92F97AAC342FBD0AC1)
+      set(Onnxruntime_HASH SHA256=dc4f841e511977c0a4f02e5066c3d9a58427644010ab4f89b918614a1cd4c2b0)
     else()
       set(Onnxruntime_URL "${Onnxruntime_BASEURL}/onnxruntime-linux-x64-gpu-${Onnxruntime_VERSION}.tgz")
-      set(Onnxruntime_HASH SHA256=613C53745EA4960ED368F6B3AB673558BB8561C84A8FA781B4EA7FB4A4340BE4)
+      set(Onnxruntime_HASH SHA256=4d1c10f0b410b67261302c6e18bb1b05ba924ca9081e3a26959e0d12ab69f534)
     endif()
   endif()
 else()

From 6a5e1e7b787cc49dc7a84b9a018275c2bd9354b0 Mon Sep 17 00:00:00 2001
From: Roy Shilkrot <roy.shil@gmail.com>
Date: Tue, 1 Oct 2024 02:24:45 -0400
Subject: [PATCH 02/20] refactor: Add stenographer options and resample utils

This commit adds the following changes:
- Added a new section for stenographer options in the filter properties
- Implemented resample utilities for handling audio data

These changes enable the use of stenographer functionality and provide support for resampling audio data.
---
 CMakeLists.txt                               |  24 +-
 data/locale/en-US.ini                        |   3 +-
 src/stenographer/stenographer.cpp            | 220 ++++++++++++++++++
 src/stenographer/stenographer.h              |  33 +++
 src/stenographer/stenographer_interface.html | 223 +++++++++++++++++++
 src/transcription-filter-callbacks.cpp       |   6 +-
 src/transcription-filter-data.h              |   4 +
 src/transcription-filter-properties.cpp      |   9 +
 src/transcription-filter.cpp                 |  45 +++-
 src/transcription-utils.cpp                  |  46 ++--
 src/whisper-utils/resample-utils.cpp         |  98 ++++++++
 src/whisper-utils/resample-utils.h           |  10 +
 src/whisper-utils/vad-processing.cpp         |  96 +-------
 13 files changed, 702 insertions(+), 115 deletions(-)
 create mode 100644 src/stenographer/stenographer.cpp
 create mode 100644 src/stenographer/stenographer.h
 create mode 100644 src/stenographer/stenographer_interface.html
 create mode 100644 src/whisper-utils/resample-utils.cpp
 create mode 100644 src/whisper-utils/resample-utils.h

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 9233158..f29aee0 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -101,6 +101,32 @@ include(cmake/BuildICU.cmake)
 target_link_libraries(${CMAKE_PROJECT_NAME} PRIVATE ICU)
 target_include_directories(${CMAKE_PROJECT_NAME} SYSTEM PUBLIC ${ICU_INCLUDE_DIR})
 
+# Add the include directory of the prebuilt dependencies under .deps/ to the plugin target.
+# The directory name is built from the "dependencies.prebuilt.version" entry of buildspec.json
+# and a per-platform architecture suffix.
+if(NOT buildspec)
+  file(READ "${CMAKE_CURRENT_SOURCE_DIR}/buildspec.json" buildspec)
+endif()
+# Quote the JSON text: an unquoted expansion would be split into several arguments at any ';'.
+string(
+  JSON
+  version
+  GET
+  "${buildspec}"
+  dependencies
+  prebuilt
+  version)
+# NOTE(review): arch stays unset on platforms that are neither MSVC nor Apple, so deps_root
+# ends with a dangling '-' there; confirm no prebuilt headers are expected on those platforms.
+if(MSVC)
+  set(arch ${CMAKE_GENERATOR_PLATFORM})
+elseif(APPLE)
+  set(arch universal)
+endif()
+set(deps_root "${CMAKE_CURRENT_SOURCE_DIR}/.deps/obs-deps-${version}-${arch}")
+message(STATUS "deps_root: ${deps_root}")
+target_include_directories(${CMAKE_PROJECT_NAME} PRIVATE "${deps_root}/include")
+
 target_sources(
   ${CMAKE_PROJECT_NAME}
   PRIVATE src/plugin-main.c
@@ -120,12 +146,14 @@ target_sources(
           src/whisper-utils/silero-vad-onnx.cpp
           src/whisper-utils/token-buffer-thread.cpp
           src/whisper-utils/vad-processing.cpp
+          src/whisper-utils/resample-utils.cpp
           src/translation/language_codes.cpp
           src/translation/translation.cpp
           src/translation/translation-utils.cpp
           src/ui/filter-replace-utils.cpp
           src/translation/translation-language-utils.cpp
-          src/ui/filter-replace-dialog.cpp)
+          src/ui/filter-replace-dialog.cpp
+          src/stenographer/stenographer.cpp)
 
 set_target_properties_plugin(${CMAKE_PROJECT_NAME} PROPERTIES OUTPUT_NAME ${_name})
 
diff --git a/data/locale/en-US.ini b/data/locale/en-US.ini
index 9ef4d18..94eb613 100644
--- a/data/locale/en-US.ini
+++ b/data/locale/en-US.ini
@@ -87,4 +87,5 @@ Active_VAD="Active VAD"
 Hybrid_VAD="Hybrid VAD"
 translate_only_full_sentences="Translate only full sentences"
 duration_filter_threshold="Duration filter"
-segment_duration="Segment duration"
\ No newline at end of file
+segment_duration="Segment duration"
+stenographer_parameters="Stenographer Options"
diff --git a/src/stenographer/stenographer.cpp b/src/stenographer/stenographer.cpp
new file mode 100644
index 0000000..b8af2c6
--- /dev/null
+++ b/src/stenographer/stenographer.cpp
@@ -0,0 +1,282 @@
+#include <util/base.h>
+
+#include "stenographer.h"
+#include "plugin-support.h"
+#include "whisper-utils/resample-utils.h"
+#include "transcription-utils.h"
+
+#define ASIO_STANDALONE
+#define _WEBSOCKETPP_CPP11_TYPE_TRAITS_
+
+#include <websocketpp/config/asio_no_tls.hpp>
+#include <websocketpp/server.hpp>
+#include <nlohmann/json.hpp>
+#include <algorithm>
+#include <atomic>
+#include <chrono>
+#include <cstdint>
+#include <cstring>
+#include <exception>
+#include <future>
+#include <mutex>
+#include <queue>
+#include <string>
+#include <thread>
+#include <utility>
+#include <vector>
+
+using json = nlohmann::json;
+typedef websocketpp::server<websocketpp::config::asio> server;
+
+// Header of a mono, 16-bit, 16 kHz PCM WAV file. sendAudioData() copies this struct
+// byte-for-byte in front of every audio chunk, so it must not contain any padding.
+struct WAVHeader {
+	char riff[4] = {'R', 'I', 'F', 'F'};
+	uint32_t overall_size = 0;
+	char wave[4] = {'W', 'A', 'V', 'E'};
+	char fmt_chunk_marker[4] = {'f', 'm', 't', ' '};
+	uint32_t length_of_fmt = 16;
+	uint16_t format_type = 1;
+	uint16_t channels = 1;
+	uint32_t sample_rate = 16000;
+	uint32_t byterate = 0;
+	uint16_t block_align = 0;
+	uint16_t bits_per_sample = 16;
+	char data_chunk_header[4] = {'d', 'a', 't', 'a'};
+	uint32_t data_size = 0;
+};
+static_assert(sizeof(WAVHeader) == 44, "WAVHeader must be exactly 44 bytes (no padding)");
+
+// Private implementation of TranscriptionHandler: owns the websocket server, the two
+// worker tasks and the buffer in which outgoing audio is accumulated.
+class TranscriptionHandler::Impl {
+public:
+	using MessageCallback = TranscriptionHandler::MessageCallback;
+
+	// gf_ is the filter whose audio is streamed to the client; callback is invoked for every
+	// caption message received from the client. Nothing runs until start() is called.
+	explicit Impl(transcription_filter_data *gf_, MessageCallback callback)
+		: gf(gf_),
+		  messageCallback(std::move(callback)),
+		  running(false)
+	{
+		wsServer.init_asio();
+
+		// remember the most recently opened connection; all outgoing data is sent to it
+		wsServer.set_open_handler([this](websocketpp::connection_hdl hdl) {
+			std::lock_guard<std::mutex> lock(connectionMutex);
+			connection = hdl;
+		});
+
+		wsServer.set_message_handler(
+			[this](websocketpp::connection_hdl, server::message_ptr msg) {
+				handleIncomingMessage(msg->get_payload());
+			});
+
+		// fields derived from the fixed sample format declared in WAVHeader
+		wavHeader.byterate =
+			wavHeader.sample_rate * wavHeader.channels * wavHeader.bits_per_sample / 8;
+		wavHeader.block_align =
+			static_cast<uint16_t>(wavHeader.channels * wavHeader.bits_per_sample / 8);
+	}
+
+	// std::async futures block in their destructor until the task ends, and the tasks only
+	// end once stop() has been called - so stop here instead of hanging on destruction.
+	~Impl() { stop(); }
+
+	// Start the websocket server (port 9002) and the audio streaming loop, each on its own
+	// asynchronous task. Calling start() while already running does nothing.
+	void start()
+	{
+		// exchange() makes the check-and-set a single atomic step
+		if (running.exchange(true))
+			return;
+
+		serverThread = std::async(std::launch::async, [this]() {
+			// an exception escaping this task would be stored in the future and never
+			// reported, e.g. when the port is already in use - log it instead
+			try {
+				wsServer.listen(9002);
+				wsServer.start_accept();
+				wsServer.run();
+			} catch (const std::exception &e) {
+				obs_log(LOG_ERROR, "Stenographer websocket server failed: %s",
+					e.what());
+			}
+		});
+
+		processingThread =
+			std::async(std::launch::async, [this]() { processAudioQueue(); });
+	}
+
+	// Stop the server and wait for both tasks to finish. Does nothing when not running.
+	void stop()
+	{
+		if (!running.exchange(false))
+			return;
+
+		wsServer.stop();
+		if (serverThread.valid())
+			serverThread.wait();
+		if (processingThread.valid())
+			processingThread.wait();
+	}
+
+private:
+	transcription_filter_data *gf;
+	server wsServer;
+	websocketpp::connection_hdl connection;
+	MessageCallback messageCallback;
+	std::mutex connectionMutex; // guards connection, wavHeader and audioBuffer
+	std::atomic<bool> running;
+	std::future<void> serverThread;
+	std::future<void> processingThread;
+	WAVHeader wavHeader;
+	std::vector<int16_t> audioBuffer;
+
+	// Parse one JSON caption message from the client and forward it to the callback.
+	// Expected keys: "type", "text", "start_timestamp", "end_timestamp".
+	void handleIncomingMessage(const std::string &message)
+	{
+		try {
+			const json j = json::parse(message);
+			const std::string type = j.at("type").get<std::string>();
+			const std::string text = j.at("text").get<std::string>();
+			const uint64_t start_timestamp = j.at("start_timestamp").get<uint64_t>();
+			const uint64_t end_timestamp = j.at("end_timestamp").get<uint64_t>();
+
+			if (messageCallback)
+				messageCallback(type, text, start_timestamp, end_timestamp);
+		} catch (const json::exception &e) {
+			// json::exception is the common base of parse, type and out-of-range errors
+			obs_log(LOG_ERROR, "Failed to parse JSON message: %s", e.what());
+		}
+	}
+
+	// Send a text frame to the current client, if any. Failures are logged, not thrown.
+	void sendText(const std::string &payload)
+	{
+		std::lock_guard<std::mutex> lock(connectionMutex);
+		if (!connection.lock())
+			return;
+
+		websocketpp::lib::error_code ec;
+		wsServer.send(connection, payload, websocketpp::frame::opcode::text, ec);
+		if (ec)
+			obs_log(LOG_WARNING, "Failed to send text frame: %s",
+				ec.message().c_str());
+	}
+
+	// Body of the streaming task: pull resampled audio from the filter, convert it to
+	// 16-bit PCM, send it to the client and expire stale captions. Runs until stop().
+	void processAudioQueue()
+	{
+		while (running) {
+			// get data from buffer and resample
+			uint64_t start_timestamp_offset_ns = 0;
+			uint64_t end_timestamp_offset_ns = 0;
+
+			const int ret = get_data_from_buf_and_resample(
+				gf, start_timestamp_offset_ns, end_timestamp_offset_ns);
+			if (ret != 0) {
+				std::this_thread::sleep_for(std::chrono::milliseconds(10));
+				continue;
+			}
+
+			std::vector<float> audio_input(gf->resampled_buffer.size / sizeof(float));
+			circlebuf_pop_front(&gf->resampled_buffer, audio_input.data(),
+					    audio_input.size() * sizeof(float));
+
+			// clamp before scaling: a sample outside [-1, 1] would not fit in int16_t
+			std::vector<int16_t> pcmData(audio_input.size());
+			std::transform(audio_input.begin(), audio_input.end(), pcmData.begin(),
+				       [](float sample) {
+					       const float clamped =
+						       std::max(-1.0f, std::min(1.0f, sample));
+					       return static_cast<int16_t>(clamped * 32767.0f);
+				       });
+
+			if (!pcmData.empty()) {
+				// NOTE(review): these are nanosecond offsets, while the callback side
+				// stores the echoed values in *_ms fields - confirm the intended unit
+				const json timestampInfo = {{"start_timestamp",
+							     start_timestamp_offset_ns},
+							    {"end_timestamp", end_timestamp_offset_ns}};
+				sendText(timestampInfo.dump());
+				sendAudioData(pcmData);
+			} else {
+				std::this_thread::sleep_for(std::chrono::milliseconds(10));
+			}
+
+			if (!gf->cleared_last_sub) {
+				// clear the current sub once it has been shown longer than max_sub_duration
+				const uint64_t now = now_ms();
+				if ((now - gf->last_sub_render_time) > gf->max_sub_duration) {
+					// clear the current sub, call the callback with an empty string
+					// (%llu with a cast: %lu is only 32 bits wide on 64-bit Windows)
+					obs_log(gf->log_level,
+						"Clearing current subtitle. now: %llu ms, last: %llu ms",
+						static_cast<unsigned long long>(now),
+						static_cast<unsigned long long>(
+							gf->last_sub_render_time));
+					clear_current_caption(gf);
+				}
+			}
+		}
+	}
+
+	// Append samples to the staging buffer and, once at least 8000 samples (0.5 s at
+	// 16 kHz) are buffered, send them to the client as one self-contained WAV file.
+	// Samples are dropped while no client is connected.
+	void sendAudioData(const std::vector<int16_t> &audioData)
+	{
+		std::lock_guard<std::mutex> lock(connectionMutex);
+		if (!connection.lock())
+			return;
+
+		audioBuffer.insert(audioBuffer.end(), audioData.begin(), audioData.end());
+		if (audioBuffer.size() < 8000)
+			return;
+
+		wavHeader.data_size =
+			static_cast<uint32_t>(audioBuffer.size() * sizeof(int16_t));
+		// the RIFF chunk size excludes the 8 bytes of the "RIFF" tag and the size itself
+		wavHeader.overall_size =
+			static_cast<uint32_t>(wavHeader.data_size + sizeof(WAVHeader) - 8);
+
+		std::vector<uint8_t> wavData(sizeof(WAVHeader) + wavHeader.data_size);
+		std::memcpy(wavData.data(), &wavHeader, sizeof(WAVHeader));
+		std::memcpy(wavData.data() + sizeof(WAVHeader), audioBuffer.data(),
+			    wavHeader.data_size);
+
+		// the error_code overload reports a closed connection instead of throwing
+		websocketpp::lib::error_code ec;
+		wsServer.send(connection, wavData.data(), wavData.size(),
+			      websocketpp::frame::opcode::binary, ec);
+		if (ec)
+			obs_log(LOG_WARNING, "Failed to send audio frame: %s",
+				ec.message().c_str());
+
+		audioBuffer.clear();
+	}
+};
+
+TranscriptionHandler::TranscriptionHandler(transcription_filter_data *gf_, MessageCallback callback)
+	: pimpl(std::make_unique<Impl>(gf_, std::move(callback)))
+{
+}
+
+TranscriptionHandler::~TranscriptionHandler() = default;
+
+TranscriptionHandler::TranscriptionHandler(TranscriptionHandler &&) noexcept = default;
+TranscriptionHandler &TranscriptionHandler::operator=(TranscriptionHandler &&) noexcept = default;
+
+void TranscriptionHandler::start()
+{
+	pimpl->start();
+}
+
+void TranscriptionHandler::stop()
+{
+	pimpl->stop();
+}
diff --git a/src/stenographer/stenographer.h b/src/stenographer/stenographer.h
new file mode 100644
index 0000000..cb7a26e
--- /dev/null
+++ b/src/stenographer/stenographer.h
@@ -0,0 +1,39 @@
+#pragma once
+
+#include <cstdint>
+#include <functional>
+#include <memory>
+#include <string>
+
+// Forward declaration
+struct transcription_filter_data;
+
+// Lets a human stenographer caption the audio of a transcription filter: start() launches a
+// websocket server that streams the audio to a client and passes every caption message the
+// client sends back to the MessageCallback.
+class TranscriptionHandler {
+public:
+	// Invoked for each caption message with its type, text and start/end timestamps.
+	using MessageCallback =
+		std::function<void(const std::string &type, const std::string &text,
+				   uint64_t start_timestamp, uint64_t end_timestamp)>;
+
+	explicit TranscriptionHandler(transcription_filter_data *gf_, MessageCallback callback);
+	~TranscriptionHandler();
+
+	// Disable copy
+	TranscriptionHandler(const TranscriptionHandler &) = delete;
+	TranscriptionHandler &operator=(const TranscriptionHandler &) = delete;
+
+	// Enable move
+	TranscriptionHandler(TranscriptionHandler &&) noexcept;
+	TranscriptionHandler &operator=(TranscriptionHandler &&) noexcept;
+
+	// Start / stop the server and the audio streaming loop.
+	void start();
+	void stop();
+
+private:
+	class Impl;
+	std::unique_ptr<Impl> pimpl;
+};
diff --git a/src/stenographer/stenographer_interface.html b/src/stenographer/stenographer_interface.html
new file mode 100644
index 0000000..0b5eea8
--- /dev/null
+++ b/src/stenographer/stenographer_interface.html
@@ -0,0 +1,223 @@
+<!DOCTYPE html>
+<html lang="en">
+<head>
+    <meta charset="UTF-8">
+    <meta name="viewport" content="width=device-width, initial-scale=1.0">
+    <title>Stenographer Interface</title>
+    <style>
+        body { font-family: Arial, sans-serif; }
+        #captionInput { width: 100%; height: 100px; }
+        #visualizer { width: 100%; height: 100px; background-color: #f0f0f0; }
+        .status-connected { color: green; }
+        .status-disconnected, .status-error { color: red; }
+        .status-connecting { color: orange; }
+    </style>
+</head>
+<body>
+    <h1>Stenographer Interface</h1>
+    <button id="startButton">Start Audio</button>
+    <canvas id="visualizer"></canvas>
+    <textarea id="captionInput" placeholder="Enter captions here..."></textarea>
+    <div id="connectionStatus">Connection Status: <span id="statusText" class="status-disconnected">Disconnected</span></div>
+    <div id="audioStatus">Audio Status: Not started</div>
+
+    <script>
+        const WS_URL = 'ws://localhost:9002';
+        let ws;
+        const captionInput = document.getElementById('captionInput');
+        const audioStatus = document.getElementById('audioStatus');
+        const connectionStatus = document.getElementById('statusText');
+        const startButton = document.getElementById('startButton');
+        const visualizer = document.getElementById('visualizer');
+        const visualizerContext = visualizer.getContext('2d');
+        let audioContext;
+        let audioQueue = [];
+        let isAudioInitialized = false;
+        let analyser;
+        let reconnectInterval = 1000; // Start with 1 second interval
+        const MAX_RECONNECT_INTERVAL = 30000; // Max 30 seconds
+
+        // Set up the canvas
+        visualizer.width = visualizer.offsetWidth;
+        visualizer.height = visualizer.offsetHeight;
+
+        let currentStartTimestamp = 0;
+
+        function connectWebSocket() {
+            ws = new WebSocket(WS_URL);
+
+            ws.onopen = () => {
+                console.log('WebSocket connection established');
+                updateConnectionStatus('Connected');
+                reconnectInterval = 1000; // Reset reconnect interval on successful connection
+            };
+
+            ws.onmessage = (event) => {
+                if (event.data instanceof Blob) {
+                    if (isAudioInitialized) {
+                        handleAudioData(event.data);
+                    } else {
+                        audioQueue.push(event.data);
+                    }
+                } else {
+                    // This is our timestamp information
+                    try {
+                        const timestampInfo = JSON.parse(event.data);
+                        currentStartTimestamp = timestampInfo.start_timestamp;
+                    } catch (error) {
+                        console.error('Error parsing timestamp information:', error);
+                    }
+                }
+            };
+
+            ws.onclose = (event) => {
+                console.log('WebSocket connection closed:', event.code, event.reason);
+                updateConnectionStatus('Disconnected');
+                scheduleReconnection();
+            };
+
+            ws.onerror = (error) => {
+                console.error('WebSocket error:', error);
+                updateConnectionStatus('Error');
+            };
+        }
+
+        function updateConnectionStatus(status) {
+            connectionStatus.textContent = status;
+            connectionStatus.className = `status-${status.toLowerCase()}`;
+            if (status === 'Disconnected' || status === 'Error') {
+                startButton.disabled = true;
+                audioStatus.textContent = 'Audio Status: Stopped (Connection lost)';
+            } else if (status === 'Connected' && !isAudioInitialized) {
+                startButton.disabled = false;
+                audioStatus.textContent = 'Audio Status: Ready (Click "Start Audio" to begin)';
+            }
+        }
+
+        function scheduleReconnection() {
+            console.log(`Attempting to reconnect in ${reconnectInterval / 1000} seconds`);
+            updateConnectionStatus('Connecting');
+            setTimeout(() => {
+                connectWebSocket();
+            }, reconnectInterval);
+
+            // Exponential backoff for reconnection attempts
+            reconnectInterval = Math.min(reconnectInterval * 2, MAX_RECONNECT_INTERVAL);
+        }
+
+        startButton.addEventListener('click', initializeAudio);
+
+        function initializeAudio() {
+            if (!audioContext && ws.readyState === WebSocket.OPEN) {
+                audioContext = new (window.AudioContext || window.webkitAudioContext)();
+                analyser = audioContext.createAnalyser();
+                analyser.fftSize = 256;
+                audioContext.resume().then(() => {
+                    isAudioInitialized = true;
+                    audioStatus.textContent = 'Audio Status: Playing';
+                    startButton.disabled = true;
+                    processAudioQueue();
+                    visualizeAudio();
+                });
+            }
+        }
+
+        function processAudioQueue() {
+            while (audioQueue.length > 0) {
+                handleAudioData(audioQueue.shift());
+            }
+        }
+
+        captionInput.addEventListener('input', () => {
+            sendCaptionUpdate('partial');
+        });
+
+        captionInput.addEventListener('keydown', (event) => {
+            if (event.key === 'Enter') {
+                event.preventDefault();
+                sendCaptionUpdate('sentence');
+                captionInput.value = '';
+            }
+        });
+
+        function sendCaptionUpdate(type) {
+            if (ws.readyState === WebSocket.OPEN) {
+                const now = Date.now(); // Keep in milliseconds
+                const message = JSON.stringify({
+                    type: type,
+                    text: captionInput.value,
+                    start_timestamp: currentStartTimestamp,
+                    end_timestamp: type === 'sentence' ? now : 0 // 0 for partial sentences
+                });
+                ws.send(message);
+                
+                if (type === 'sentence') {
+                    currentStartTimestamp = now; // Reset start timestamp for next sentence
+                }
+            } else {
+                console.warn('Cannot send caption update: WebSocket is not connected');
+            }
+        }
+
+        async function handleAudioData(blob) {
+            const arrayBuffer = await blob.arrayBuffer();
+            audioContext.decodeAudioData(arrayBuffer, (audioBuffer) => {
+                playAudio(audioBuffer);
+            }, (error) => {
+                console.error('Error decoding audio data:', error);
+            });
+        }
+
+        function playAudio(audioBuffer) {
+            const source = audioContext.createBufferSource();
+            source.buffer = audioBuffer;
+            source.connect(analyser);
+            analyser.connect(audioContext.destination);
+            source.start();
+        }
+
+        function visualizeAudio() {
+            const bufferLength = analyser.frequencyBinCount;
+            const dataArray = new Uint8Array(bufferLength);
+
+            function draw() {
+                requestAnimationFrame(draw);
+
+                analyser.getByteTimeDomainData(dataArray);
+
+                visualizerContext.fillStyle = 'rgb(200, 200, 200)';
+                visualizerContext.fillRect(0, 0, visualizer.width, visualizer.height);
+
+                visualizerContext.lineWidth = 2;
+                visualizerContext.strokeStyle = 'rgb(0, 0, 0)';
+
+                visualizerContext.beginPath();
+
+                const sliceWidth = visualizer.width * 1.0 / bufferLength;
+                let x = 0;
+
+                for (let i = 0; i < bufferLength; i++) {
+                    const v = dataArray[i] / 128.0;
+                    const y = v * visualizer.height / 2;
+
+                    if (i === 0) {
+                        visualizerContext.moveTo(x, y);
+                    } else {
+                        visualizerContext.lineTo(x, y);
+                    }
+
+                    x += sliceWidth;
+                }
+
+                visualizerContext.lineTo(visualizer.width, visualizer.height / 2);
+                visualizerContext.stroke();
+            }
+
+            draw();
+        }
+
+        // Initial connection
+        connectWebSocket();
+    </script>
+</body>
+</html>
\ No newline at end of file
diff --git a/src/transcription-filter-callbacks.cpp b/src/transcription-filter-callbacks.cpp
index ff204b4..049c0b3 100644
--- a/src/transcription-filter-callbacks.cpp
+++ b/src/transcription-filter-callbacks.cpp
@@ -208,7 +208,9 @@ void set_text_callback(struct transcription_filter_data *gf,
 		str_copy = fix_utf8(str_copy);
 	} else {
 		// only remove leading and trailing non-alphanumeric characters if the output is English
+		obs_log(gf->log_level, "before: %s", str_copy.c_str());
 		str_copy = remove_leading_trailing_nonalpha(str_copy);
+		obs_log(gf->log_level, "after: %s", str_copy.c_str());
 	}
 
 	// if suppression is enabled, check if the text is in the suppression list
@@ -411,7 +413,9 @@ void enable_callback(void *data_, calldata_t *cd)
 		obs_log(gf_->log_level, "enable_callback: enable");
 		gf_->active = true;
 		reset_caption_state(gf_);
-		update_whisper_model(gf_);
+		if (!gf_->stenographer_enabled) {
+			update_whisper_model(gf_);
+		}
 	} else {
 		obs_log(gf_->log_level, "enable_callback: disable");
 		gf_->active = false;
diff --git a/src/transcription-filter-data.h b/src/transcription-filter-data.h
index e8990be..5e7125e 100644
--- a/src/transcription-filter-data.h
+++ b/src/transcription-filter-data.h
@@ -19,6 +19,7 @@
 #include "whisper-utils/silero-vad-onnx.h"
 #include "whisper-utils/whisper-processing.h"
 #include "whisper-utils/token-buffer-thread.h"
+#include "stenographer/stenographer.h"
 
 #define MAX_PREPROC_CHANNELS 10
 
@@ -128,6 +129,9 @@ struct transcription_filter_data {
 	TokenBufferSegmentation buffered_output_output_type =
 		TokenBufferSegmentation::SEGMENTATION_TOKEN;
 
+	bool stenographer_enabled = false;
+	TranscriptionHandler *transcription_handler = nullptr;
+
 	// ctor
 	transcription_filter_data() : whisper_buf_mutex(), whisper_ctx_mutex(), wshiper_thread_cv()
 	{
diff --git a/src/transcription-filter-properties.cpp b/src/transcription-filter-properties.cpp
index a2c9da1..08ed7b1 100644
--- a/src/transcription-filter-properties.cpp
+++ b/src/transcription-filter-properties.cpp
@@ -504,6 +504,14 @@ void add_general_group_properties(obs_properties_t *ppts)
 	}
 }
 
+void add_stenographer_group_properties(obs_properties_t *ppts)
+{
+	// add group for stenographer options
+	obs_properties_t *stenographer_group = obs_properties_create();
+	obs_properties_add_group(ppts, "stenographer_group", MT_("stenographer_parameters"),
+				 OBS_GROUP_CHECKABLE, stenographer_group);
+}
+
 void add_partial_group_properties(obs_properties_t *ppts)
 {
 	// add a group for partial transcription
@@ -544,6 +552,7 @@ obs_properties_t *transcription_filter_properties(void *data)
 	add_advanced_group_properties(ppts, gf);
 	add_logging_group_properties(ppts);
 	add_partial_group_properties(ppts);
+	add_stenographer_group_properties(ppts);
 	add_whisper_params_group_properties(ppts);
 
 	// Add a informative text about the plugin
diff --git a/src/transcription-filter.cpp b/src/transcription-filter.cpp
index 65ae072..df1675b 100644
--- a/src/transcription-filter.cpp
+++ b/src/transcription-filter.cpp
@@ -80,7 +80,7 @@ struct obs_audio_data *transcription_filter_filter_audio(void *data, struct obs_
 		return audio;
 	}
 
-	if (gf->whisper_context == nullptr) {
+	if (gf->whisper_context == nullptr && !gf->stenographer_enabled) {
 		// Whisper not initialized, just pass through
 		return audio;
 	}
@@ -103,6 +103,8 @@ struct obs_audio_data *transcription_filter_filter_audio(void *data, struct obs_
 			circlebuf_push_back(&gf->input_buffers[c], audio->data[c],
 					    audio->frames * sizeof(float));
 		}
+		obs_log(gf->log_level, "currently %zu bytes in the audio input buffer",
+			gf->input_buffers[0].size);
 		// push audio packet info (timestamp/frame count) to info circlebuf
 		struct transcription_filter_audio_info info = {0};
 		info.frames = audio->frames; // number of frames in this packet
@@ -164,6 +166,10 @@ void transcription_filter_destroy(void *data)
 	if (gf->translation_monitor.isEnabled()) {
 		gf->translation_monitor.stopThread();
 	}
+	if (gf->transcription_handler != nullptr) {
+		gf->transcription_handler->stop();
+		delete gf->transcription_handler;
+	}
 
 	bfree(gf);
 }
@@ -404,7 +410,11 @@ void transcription_filter_update(void *data, obs_data_t *s)
 		}
 	}
 
-	if (gf->context != nullptr && obs_source_enabled(gf->context)) {
+	// check if stenographer is enabled
+	bool new_stenographer_enabled = obs_data_get_bool(s, "stenographer_group");
+
+	if (!new_stenographer_enabled && gf->context != nullptr &&
+	    obs_source_enabled(gf->context)) {
 		if (gf->initial_creation) {
 			obs_log(LOG_INFO, "Initial filter creation and source enabled");
 
@@ -424,6 +434,37 @@ void transcription_filter_update(void *data, obs_data_t *s)
 			}
 		}
 	}
+
+	if (new_stenographer_enabled != gf->stenographer_enabled) {
+		gf->stenographer_enabled = new_stenographer_enabled;
+		if (gf->stenographer_enabled) {
+			obs_log(gf->log_level, "Stenographer enabled");
+			shutdown_whisper_thread(gf); // stop whisper
+			gf->transcription_handler = new TranscriptionHandler(
+				gf, [gf](const std::string &type, const std::string &text,
+					 uint64_t start_timestamp, uint64_t end_timestamp) {
+					// send_caption_to_source(gf->text_source_name, text, gf);
+					DetectionResultWithText result;
+					result.text = text;
+					result.result =
+						(type == "partial")
+							? DetectionResult::DETECTION_RESULT_PARTIAL
+							: DetectionResult::DETECTION_RESULT_SPEECH;
+					result.start_timestamp_ms = start_timestamp;
+					result.end_timestamp_ms = end_timestamp;
+					set_text_callback(gf, result);
+				});
+			gf->transcription_handler->start();
+		} else {
+			obs_log(gf->log_level, "Stenographer disabled");
+			if (gf->transcription_handler) {
+				gf->transcription_handler->stop();
+				delete gf->transcription_handler;
+				gf->transcription_handler = nullptr;
+			}
+			update_whisper_model(gf); // restart whisper
+		}
+	}
 }
 
 void *transcription_filter_create(obs_data_t *settings, obs_source_t *filter)
diff --git a/src/transcription-utils.cpp b/src/transcription-utils.cpp
index 727d3df..48c284a 100644
--- a/src/transcription-utils.cpp
+++ b/src/transcription-utils.cpp
@@ -82,6 +82,36 @@ std::string fix_utf8(const std::string &str)
 #endif
 }
 
+/**
+ * @brief Strips leading and trailing whitespace and punctuation from a string.
+ *
+ * Characters are dropped from both ends for as long as std::isspace() or
+ * std::ispunct() accepts them; everything between the first and the last
+ * remaining character is kept unchanged, including inner spaces and punctuation.
+ *
+ * @param str The string to trim; it is not modified.
+ * @return A copy of str without the leading/trailing whitespace and punctuation.
+ */
+std::string trim(const std::string &str)
+{
+	// a character survives when it is neither whitespace nor punctuation
+	const auto survives = [](unsigned char ch) {
+		return !(std::isspace(ch) || std::ispunct(ch));
+	};
+
+	// one past the last surviving character; equals begin() when none survives
+	const std::string::const_iterator tail =
+		std::find_if(str.rbegin(), str.rend(), survives).base();
+
+	// first surviving character, searched only in front of the tail cut
+	const std::string::const_iterator head =
+		std::find_if(str.begin(), tail, survives);
+
+	// copy just the surviving range instead of erasing from a full copy
+	std::string trimmed(head, tail);
+	return trimmed;
+}
+
 /*
 * Remove leading and trailing non-alphabetic characters from a string.
 * This function is used to remove leading and trailing spaces, newlines, tabs or punctuation.
@@ -111,21 +141,7 @@ std::string remove_leading_trailing_nonalpha(const std::string &str)
 			return "";
 		}
 	}
-	std::string str_copy = str;
-	// remove trailing spaces, newlines, tabs or punctuation
-	auto last_non_space =
-		std::find_if(str_copy.rbegin(), str_copy.rend(), [](unsigned char ch) {
-			return !std::isspace(ch) || !std::ispunct(ch);
-		}).base();
-	str_copy.erase(last_non_space, str_copy.end());
-	// remove leading spaces, newlines, tabs or punctuation
-	auto first_non_space = std::find_if(str_copy.begin(), str_copy.end(),
-					    [](unsigned char ch) {
-						    return !std::isspace(ch) || !std::ispunct(ch);
-					    }) +
-			       1;
-	str_copy.erase(str_copy.begin(), first_non_space);
-	return str_copy;
+	return trim(str);
 }
 
 std::vector<std::string> split(const std::string &string, char delimiter)
diff --git a/src/whisper-utils/resample-utils.cpp b/src/whisper-utils/resample-utils.cpp
new file mode 100644
index 0000000..7533b61
--- /dev/null
+++ b/src/whisper-utils/resample-utils.cpp
@@ -0,0 +1,118 @@
+#include <util/profiler.hpp>
+
+#include "resample-utils.h"
+
+/**
+ * Pop buffered input audio (at most 10 seconds' worth of frames), resample it
+ * with gf->resampler_to_whisper and append channel 0 of the result to
+ * gf->resampled_buffer.
+ *
+ * @param gf filter state. The info/input buffers are consumed while holding
+ *        gf->whisper_buf_mutex; gf->last_num_frames is set to the frame count.
+ * @param start_timestamp_offset_ns in/out. If 0 on entry it is set from the
+ *        first info popped; if it is later than the end timestamp it is
+ *        recomputed as end timestamp minus the duration of the popped frames.
+ * @param end_timestamp_offset_ns out. Timestamp of the last info read from the
+ *        info buffer plus the duration of that info's frames.
+ * @return 1 if input buffer 0 is empty (nothing is consumed), otherwise 0.
+ */
+int get_data_from_buf_and_resample(transcription_filter_data *gf,
+				   uint64_t &start_timestamp_offset_ns,
+				   uint64_t &end_timestamp_offset_ns)
+{
+	uint32_t num_frames_from_infos = 0;
+
+	{
+		// scoped lock the buffer mutex
+		std::lock_guard<std::mutex> lock(gf->whisper_buf_mutex);
+
+		if (gf->input_buffers[0].size == 0) {
+			return 1;
+		}
+
+		obs_log(gf->log_level,
+			"segmentation: currently %lu bytes in the audio input buffer",
+			gf->input_buffers[0].size);
+
+		// max number of frames is 10 seconds worth of audio
+		const size_t max_num_frames = gf->sample_rate * 10;
+
+		// pop all infos from the info buffer and mark the beginning timestamp from the first
+		// info as the beginning timestamp of the segment
+		struct transcription_filter_audio_info info_from_buf = {0};
+		const size_t size_of_audio_info = sizeof(transcription_filter_audio_info);
+		while (gf->info_buffer.size >= size_of_audio_info) {
+			circlebuf_pop_front(&gf->info_buffer, &info_from_buf, size_of_audio_info);
+			num_frames_from_infos += info_from_buf.frames;
+			if (start_timestamp_offset_ns == 0) {
+				start_timestamp_offset_ns = info_from_buf.timestamp_offset_ns;
+			}
+			// Check if we're within the needed segment length
+			if (num_frames_from_infos > max_num_frames) {
+				// too big, push the last info into the buffer's front where it was
+				num_frames_from_infos -= info_from_buf.frames;
+				circlebuf_push_front(&gf->info_buffer, &info_from_buf,
+						     size_of_audio_info);
+				break;
+			}
+		}
+		// Convert frames to nanoseconds in 64 bits. num_frames_from_infos is 32-bit
+		// (info_from_buf.frames presumably is too), so multiplying it by the int
+		// literal 1000000000 would wrap around after only a few frames.
+		const uint64_t ns_per_second = 1000000000ULL;
+
+		// calculate the end timestamp from the info plus the number of frames in the packet
+		end_timestamp_offset_ns =
+			info_from_buf.timestamp_offset_ns +
+			(uint64_t)info_from_buf.frames * ns_per_second / gf->sample_rate;
+
+		if (start_timestamp_offset_ns > end_timestamp_offset_ns) {
+			// this may happen when the incoming media has a timestamp reset
+			// in this case, we should figure out the start timestamp from the end timestamp
+			// and the number of frames
+			start_timestamp_offset_ns =
+				end_timestamp_offset_ns -
+				(uint64_t)num_frames_from_infos * ns_per_second / gf->sample_rate;
+		}
+
+		for (size_t c = 0; c < gf->channels; c++) {
+			// zero the rest of copy_buffers
+			memset(gf->copy_buffers[c], 0, gf->frames * sizeof(float));
+		}
+
+		/* Pop from input circlebuf */
+		for (size_t c = 0; c < gf->channels; c++) {
+			// Push the new data to copy_buffers[c]
+			circlebuf_pop_front(&gf->input_buffers[c], gf->copy_buffers[c],
+					    num_frames_from_infos * sizeof(float));
+		}
+	}
+
+	obs_log(gf->log_level, "found %d frames from info buffer.", num_frames_from_infos);
+	gf->last_num_frames = num_frames_from_infos;
+
+	{
+		// resample to 16kHz
+		float *resampled_16khz[MAX_PREPROC_CHANNELS];
+		uint32_t resampled_16khz_frames;
+		uint64_t ts_offset;
+		{
+			ProfileScope("resample");
+			audio_resampler_resample(gf->resampler_to_whisper,
+						 (uint8_t **)resampled_16khz,
+						 &resampled_16khz_frames, &ts_offset,
+						 (const uint8_t **)gf->copy_buffers,
+						 (uint32_t)num_frames_from_infos);
+		}
+
+		circlebuf_push_back(&gf->resampled_buffer, resampled_16khz[0],
+				    resampled_16khz_frames * sizeof(float));
+		obs_log(gf->log_level,
+			"resampled: %d channels, %d frames, %f ms, current size: %lu bytes",
+			(int)gf->channels, (int)resampled_16khz_frames,
+			(float)resampled_16khz_frames / WHISPER_SAMPLE_RATE * 1000.0f,
+			gf->resampled_buffer.size);
+	}
+
+	return 0;
+}
diff --git a/src/whisper-utils/resample-utils.h b/src/whisper-utils/resample-utils.h
new file mode 100644
index 0000000..c2d2872
--- /dev/null
+++ b/src/whisper-utils/resample-utils.h
@@ -0,0 +1,10 @@
+#ifndef RESAMPLE_UTILS_H
+#define RESAMPLE_UTILS_H
+
+#include "transcription-filter-data.h"
+
+int get_data_from_buf_and_resample(transcription_filter_data *gf,
+				   uint64_t &start_timestamp_offset_ns,
+				   uint64_t &end_timestamp_offset_ns);
+
+#endif
diff --git a/src/whisper-utils/vad-processing.cpp b/src/whisper-utils/vad-processing.cpp
index 0e9c744..d0a9266 100644
--- a/src/whisper-utils/vad-processing.cpp
+++ b/src/whisper-utils/vad-processing.cpp
@@ -4,107 +4,13 @@
 #include "transcription-filter-data.h"
 
 #include "vad-processing.h"
+#include "resample-utils.h"
 
 #ifdef _WIN32
 #define NOMINMAX
 #include <Windows.h>
 #endif
 
-int get_data_from_buf_and_resample(transcription_filter_data *gf,
-				   uint64_t &start_timestamp_offset_ns,
-				   uint64_t &end_timestamp_offset_ns)
-{
-	uint32_t num_frames_from_infos = 0;
-
-	{
-		// scoped lock the buffer mutex
-		std::lock_guard<std::mutex> lock(gf->whisper_buf_mutex);
-
-		if (gf->input_buffers[0].size == 0) {
-			return 1;
-		}
-
-		obs_log(gf->log_level,
-			"segmentation: currently %lu bytes in the audio input buffer",
-			gf->input_buffers[0].size);
-
-		// max number of frames is 10 seconds worth of audio
-		const size_t max_num_frames = gf->sample_rate * 10;
-
-		// pop all infos from the info buffer and mark the beginning timestamp from the first
-		// info as the beginning timestamp of the segment
-		struct transcription_filter_audio_info info_from_buf = {0};
-		const size_t size_of_audio_info = sizeof(transcription_filter_audio_info);
-		while (gf->info_buffer.size >= size_of_audio_info) {
-			circlebuf_pop_front(&gf->info_buffer, &info_from_buf, size_of_audio_info);
-			num_frames_from_infos += info_from_buf.frames;
-			if (start_timestamp_offset_ns == 0) {
-				start_timestamp_offset_ns = info_from_buf.timestamp_offset_ns;
-			}
-			// Check if we're within the needed segment length
-			if (num_frames_from_infos > max_num_frames) {
-				// too big, push the last info into the buffer's front where it was
-				num_frames_from_infos -= info_from_buf.frames;
-				circlebuf_push_front(&gf->info_buffer, &info_from_buf,
-						     size_of_audio_info);
-				break;
-			}
-		}
-		// calculate the end timestamp from the info plus the number of frames in the packet
-		end_timestamp_offset_ns = info_from_buf.timestamp_offset_ns +
-					  info_from_buf.frames * 1000000000 / gf->sample_rate;
-
-		if (start_timestamp_offset_ns > end_timestamp_offset_ns) {
-			// this may happen when the incoming media has a timestamp reset
-			// in this case, we should figure out the start timestamp from the end timestamp
-			// and the number of frames
-			start_timestamp_offset_ns =
-				end_timestamp_offset_ns -
-				num_frames_from_infos * 1000000000 / gf->sample_rate;
-		}
-
-		for (size_t c = 0; c < gf->channels; c++) {
-			// zero the rest of copy_buffers
-			memset(gf->copy_buffers[c], 0, gf->frames * sizeof(float));
-		}
-
-		/* Pop from input circlebuf */
-		for (size_t c = 0; c < gf->channels; c++) {
-			// Push the new data to copy_buffers[c]
-			circlebuf_pop_front(&gf->input_buffers[c], gf->copy_buffers[c],
-					    num_frames_from_infos * sizeof(float));
-		}
-	}
-
-	obs_log(gf->log_level, "found %d frames from info buffer.", num_frames_from_infos);
-	gf->last_num_frames = num_frames_from_infos;
-
-	{
-		// resample to 16kHz
-		float *resampled_16khz[MAX_PREPROC_CHANNELS];
-		uint32_t resampled_16khz_frames;
-		uint64_t ts_offset;
-		{
-			ProfileScope("resample");
-			audio_resampler_resample(gf->resampler_to_whisper,
-						 (uint8_t **)resampled_16khz,
-						 &resampled_16khz_frames, &ts_offset,
-						 (const uint8_t **)gf->copy_buffers,
-						 (uint32_t)num_frames_from_infos);
-		}
-
-		circlebuf_push_back(&gf->resampled_buffer, resampled_16khz[0],
-				    resampled_16khz_frames * sizeof(float));
-		obs_log(gf->log_level,
-			"resampled: %d channels, %d frames, %f ms, current size: %lu bytes",
-			(int)gf->channels, (int)resampled_16khz_frames,
-			(float)resampled_16khz_frames / WHISPER_SAMPLE_RATE * 1000.0f,
-			gf->resampled_buffer.size);
-	}
-
-	return 0;
-}
-
 vad_state vad_based_segmentation(transcription_filter_data *gf, vad_state last_vad_state)
 {
 	// get data from buffer and resample

From b3a0316ccd8fda5a9411118a2c04869e11c5eb5b Mon Sep 17 00:00:00 2001
From: Roy Shilkrot <roy.shil@gmail.com>
Date: Sun, 6 Oct 2024 14:27:00 -0400
Subject: [PATCH 03/20] refactor: Add stenographer delay option

Add a `stenographer_delay` integer slider (1,000–12,000 ms in 100 ms steps) to the stenographer group properties so users can choose how long the audio is delayed. The default delay is 10,000 milliseconds, and the value is read into `gf->stenographer_delay` when the stenographer is enabled.
---
 data/locale/en-US.ini                   | 1 +
 src/transcription-filter-data.h         | 1 +
 src/transcription-filter-properties.cpp | 6 ++++++
 src/transcription-filter.cpp            | 2 +-
 4 files changed, 9 insertions(+), 1 deletion(-)

diff --git a/data/locale/en-US.ini b/data/locale/en-US.ini
index 94eb613..97d0835 100644
--- a/data/locale/en-US.ini
+++ b/data/locale/en-US.ini
@@ -89,3 +89,4 @@ translate_only_full_sentences="Translate only full sentences"
 duration_filter_threshold="Duration filter"
 segment_duration="Segment duration"
 stenographer_parameters="Stenographer Options"
+stenographer_delay="Audio Delay"
\ No newline at end of file
diff --git a/src/transcription-filter-data.h b/src/transcription-filter-data.h
index 5e7125e..031af38 100644
--- a/src/transcription-filter-data.h
+++ b/src/transcription-filter-data.h
@@ -131,6 +131,7 @@ struct transcription_filter_data {
 
 	bool stenographer_enabled = false;
 	TranscriptionHandler *transcription_handler = nullptr;
+	int stenographer_delay = 0;
 
 	// ctor
 	transcription_filter_data() : whisper_buf_mutex(), whisper_ctx_mutex(), wshiper_thread_cv()
diff --git a/src/transcription-filter-properties.cpp b/src/transcription-filter-properties.cpp
index 08ed7b1..c639b03 100644
--- a/src/transcription-filter-properties.cpp
+++ b/src/transcription-filter-properties.cpp
@@ -510,6 +510,10 @@ void add_stenographer_group_properties(obs_properties_t *ppts)
 	obs_properties_t *stenographer_group = obs_properties_create();
 	obs_properties_add_group(ppts, "stenographer_group", MT_("stenographer_parameters"),
 				 OBS_GROUP_CHECKABLE, stenographer_group);
+
+	// add delay amount for partial transcription
+	obs_properties_add_int_slider(stenographer_group, "stenographer_delay", MT_("stenographer_delay"),
+				      1000, 12000, 100);
 }
 
 void add_partial_group_properties(obs_properties_t *ppts)
@@ -603,6 +607,8 @@ void transcription_filter_defaults(obs_data_t *s)
 	obs_data_set_default_double(s, "sentence_psum_accept_thresh", 0.4);
 	obs_data_set_default_bool(s, "partial_group", false);
 	obs_data_set_default_int(s, "partial_latency", 1100);
+	obs_data_set_default_bool(s, "stenographer_group", false);
+	obs_data_set_default_int(s, "stenographer_delay", 10000);
 
 	// translation options
 	obs_data_set_default_double(s, "translation_sampling_temperature", 0.1);
diff --git a/src/transcription-filter.cpp b/src/transcription-filter.cpp
index df1675b..fc0a1fe 100644
--- a/src/transcription-filter.cpp
+++ b/src/transcription-filter.cpp
@@ -440,10 +440,10 @@ void transcription_filter_update(void *data, obs_data_t *s)
 		if (gf->stenographer_enabled) {
 			obs_log(gf->log_level, "Stenographer enabled");
 			shutdown_whisper_thread(gf); // stop whisper
+			gf->stenographer_delay = (int)obs_data_get_int(s, "stenographer_delay");
 			gf->transcription_handler = new TranscriptionHandler(
 				gf, [gf](const std::string &type, const std::string &text,
 					 uint64_t start_timestamp, uint64_t end_timestamp) {
-					// send_caption_to_source(gf->text_source_name, text, gf);
 					DetectionResultWithText result;
 					result.text = text;
 					result.result =

From db155bb466500cc6b783efd17167a245ed781220 Mon Sep 17 00:00:00 2001
From: Roy Shilkrot <roy.shil@gmail.com>
Date: Mon, 7 Oct 2024 12:22:56 -0400
Subject: [PATCH 04/20] refactor: Add resample-utils.cpp and update
 stenographer delay

This commit adds src/whisper-utils/resample-utils.cpp to the test target's source list in CMakeLists.txt and adds a clear_current_caption helper to the offline test. It also introduces per-channel stenographer delay buffers in transcription-filter-data.h and uses them in transcription_filter_filter_audio to delay the audio when the stenographer is enabled. The in-struct default of stenographer_delay is now 1000 milliseconds; the settings default is unchanged.
---
 CMakeLists.txt                          |  1 +
 src/stenographer/stenographer.cpp       |  4 ++--
 src/tests/localvocal-offline-test.cpp   | 15 +++++++++++++++
 src/transcription-filter-data.h         |  3 ++-
 src/transcription-filter-properties.cpp |  4 ++--
 src/transcription-filter.cpp            | 25 +++++++++++++++++++++++++
 6 files changed, 47 insertions(+), 5 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index f29aee0..fb11342 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -168,6 +168,7 @@ if(ENABLE_TESTS)
             src/whisper-utils/silero-vad-onnx.cpp
             src/whisper-utils/token-buffer-thread.cpp
             src/whisper-utils/vad-processing.cpp
+            src/whisper-utils/resample-utils.cpp
             src/translation/language_codes.cpp
             src/translation/translation.cpp
             src/ui/filter-replace-utils.cpp
diff --git a/src/stenographer/stenographer.cpp b/src/stenographer/stenographer.cpp
index b8af2c6..269dee1 100644
--- a/src/stenographer/stenographer.cpp
+++ b/src/stenographer/stenographer.cpp
@@ -122,7 +122,7 @@ class TranscriptionHandler::Impl {
 	void processAudioQueue()
 	{
 		while (running) {
-			// get data from buffer and resample
+			// get data from buffer and resample to 16kHz
 			uint64_t start_timestamp_offset_ns = 0;
 			uint64_t end_timestamp_offset_ns = 0;
 
@@ -217,4 +217,4 @@ void TranscriptionHandler::start()
 void TranscriptionHandler::stop()
 {
 	pimpl->stop();
-}
\ No newline at end of file
+}
diff --git a/src/tests/localvocal-offline-test.cpp b/src/tests/localvocal-offline-test.cpp
index 3c0f4a4..5635045 100644
--- a/src/tests/localvocal-offline-test.cpp
+++ b/src/tests/localvocal-offline-test.cpp
@@ -328,6 +328,21 @@ void set_text_callback(struct transcription_filter_data *gf,
 	}
 };
 
+void clear_current_caption(transcription_filter_data *gf_)
+{
+	if (gf_->captions_monitor.isEnabled()) {
+		gf_->captions_monitor.clear();
+		gf_->translation_monitor.clear();
+	}
+	// reset translation context
+	gf_->last_text_for_translation = "";
+	gf_->last_text_translation = "";
+	gf_->translation_ctx.last_input_tokens.clear();
+	gf_->translation_ctx.last_translation_tokens.clear();
+	gf_->last_transcription_sentence.clear();
+	gf_->cleared_last_sub = true;
+}
+
 void release_context(transcription_filter_data *gf)
 {
 	obs_log(LOG_INFO, "destroy");
diff --git a/src/transcription-filter-data.h b/src/transcription-filter-data.h
index 031af38..a721b8c 100644
--- a/src/transcription-filter-data.h
+++ b/src/transcription-filter-data.h
@@ -131,7 +131,8 @@ struct transcription_filter_data {
 
 	bool stenographer_enabled = false;
 	TranscriptionHandler *transcription_handler = nullptr;
-	int stenographer_delay = 0;
+	int stenographer_delay = 1000;
+	std::deque<float> stenographer_delay_buffers[MAX_PREPROC_CHANNELS];
 
 	// ctor
 	transcription_filter_data() : whisper_buf_mutex(), whisper_ctx_mutex(), wshiper_thread_cv()
diff --git a/src/transcription-filter-properties.cpp b/src/transcription-filter-properties.cpp
index c639b03..cc90365 100644
--- a/src/transcription-filter-properties.cpp
+++ b/src/transcription-filter-properties.cpp
@@ -512,8 +512,8 @@ void add_stenographer_group_properties(obs_properties_t *ppts)
 				 OBS_GROUP_CHECKABLE, stenographer_group);
 
 	// add delay amount for partial transcription
-	obs_properties_add_int_slider(stenographer_group, "stenographer_delay", MT_("stenographer_delay"),
-				      1000, 12000, 100);
+	obs_properties_add_int_slider(stenographer_group, "stenographer_delay",
+				      MT_("stenographer_delay"), 1000, 12000, 100);
 }
 
 void add_partial_group_properties(obs_properties_t *ppts)
diff --git a/src/transcription-filter.cpp b/src/transcription-filter.cpp
index fc0a1fe..93e3b9b 100644
--- a/src/transcription-filter.cpp
+++ b/src/transcription-filter.cpp
@@ -114,6 +114,31 @@ struct obs_audio_data *transcription_filter_filter_audio(void *data, struct obs_
 		gf->wshiper_thread_cv.notify_one();
 	}
 
+	if (gf->stenographer_enabled) {
+		// Stenographer mode - apply delay.
+		// Store the audio data in a buffer and process it after the delay.
+		// push the data to the back of gf->stenographer_delay_buffer
+		for (size_t c = 0; c < gf->channels; c++) {
+			for (size_t i = 0; i < audio->frames; i++) {
+				gf->stenographer_delay_buffers[c].push_back(audio->data[c][i]);
+			}
+		}
+
+		// If the buffer is larger than the delay, emit the oldest data
+		// Take from the buffer as much as requested by the incoming audio data
+		size_t delay_frames = gf->sample_rate * gf->stenographer_delay / 1000;
+		if (gf->stenographer_delay_buffers[0].size() >= delay_frames) {
+			// Replace data on the audio buffer with the delayed data
+			for (size_t c = 0; c < gf->channels; c++) {
+				for (size_t i = 0; i < audio->frames; i++) {
+					audio->data[c][i] =
+						gf->stenographer_delay_buffers[c].front();
+					gf->stenographer_delay_buffers[c].pop_front();
+				}
+			}
+		}
+	}
+
 	return audio;
 }
 

From fcb79efa1ccc08487c35f4348a40dc53d3abe7f6 Mon Sep 17 00:00:00 2001
From: Roy Shilkrot <roy.shil@gmail.com>
Date: Mon, 7 Oct 2024 15:40:33 -0400
Subject: [PATCH 05/20] refactor: Update stenographer delay variable name

Update the variable name from "stenographer_delay" to "stenographer_delay_ms" in the transcription filter code. This change reflects the unit of the delay value in milliseconds. The code has been modified in the "transcription-filter-data.h" and "transcription-filter.cpp" files.
---
 src/transcription-filter-data.h |  2 +-
 src/transcription-filter.cpp    | 43 +++++++++++++++++++++++++--------
 2 files changed, 34 insertions(+), 11 deletions(-)

diff --git a/src/transcription-filter-data.h b/src/transcription-filter-data.h
index a721b8c..76d39e8 100644
--- a/src/transcription-filter-data.h
+++ b/src/transcription-filter-data.h
@@ -131,7 +131,7 @@ struct transcription_filter_data {
 
 	bool stenographer_enabled = false;
 	TranscriptionHandler *transcription_handler = nullptr;
-	int stenographer_delay = 1000;
+	int stenographer_delay_ms = 1000;
 	std::deque<float> stenographer_delay_buffers[MAX_PREPROC_CHANNELS];
 
 	// ctor
diff --git a/src/transcription-filter.cpp b/src/transcription-filter.cpp
index 93e3b9b..25cbba9 100644
--- a/src/transcription-filter.cpp
+++ b/src/transcription-filter.cpp
@@ -119,22 +119,45 @@ struct obs_audio_data *transcription_filter_filter_audio(void *data, struct obs_
 		// Store the audio data in a buffer and process it after the delay.
 		// push the data to the back of gf->stenographer_delay_buffer
 		for (size_t c = 0; c < gf->channels; c++) {
-			for (size_t i = 0; i < audio->frames; i++) {
-				gf->stenographer_delay_buffers[c].push_back(audio->data[c][i]);
-			}
+			// take a audio->frames * sizeof(float) bytes chunk from audio->data[c] and push it
+			// to the back of the buffer as a float
+			std::vector<float> audio_data_chunk(
+				(float *)audio->data[c], ((float *)audio->data[c]) + audio->frames);
+			gf->stenographer_delay_buffers[c].insert(
+				gf->stenographer_delay_buffers[c].end(), audio_data_chunk.begin(),
+				audio_data_chunk.end());
 		}
 
 		// If the buffer is larger than the delay, emit the oldest data
 		// Take from the buffer as much as requested by the incoming audio data
-		size_t delay_frames = gf->sample_rate * gf->stenographer_delay / 1000;
+		size_t delay_frames = (size_t)((float)gf->sample_rate *
+					       (float)gf->stenographer_delay_ms / 1000.0f) +
+				      audio->frames;
 		if (gf->stenographer_delay_buffers[0].size() >= delay_frames) {
+			obs_log(LOG_INFO,
+				"Stenographer delay buffer filled %lu/%lu. Sending %lu frames",
+				gf->stenographer_delay_buffers[0].size(), delay_frames,
+				audio->frames);
 			// Replace data on the audio buffer with the delayed data
 			for (size_t c = 0; c < gf->channels; c++) {
-				for (size_t i = 0; i < audio->frames; i++) {
-					audio->data[c][i] =
-						gf->stenographer_delay_buffers[c].front();
-					gf->stenographer_delay_buffers[c].pop_front();
-				}
+				// Take the oldest audio->frames from the buffer and put it in the audio buffer
+				// as bytes
+				std::vector<float> audio_data_chunk(
+					gf->stenographer_delay_buffers[c].begin(),
+					gf->stenographer_delay_buffers[c].begin() + audio->frames);
+				memcpy(audio->data[c], audio_data_chunk.data(),
+				       audio->frames * sizeof(float));
+				// Remove the oldest audio->frames from the buffer
+				gf->stenographer_delay_buffers[c].erase(
+					gf->stenographer_delay_buffers[c].begin(),
+					gf->stenographer_delay_buffers[c].begin() + audio->frames);
+			}
+		} else {
+			obs_log(LOG_INFO, "Stenographer delay buffer not filled yet %lu/%lu",
+				gf->stenographer_delay_buffers[0].size(), delay_frames);
+			// Fill the audio buffer with silence
+			for (size_t c = 0; c < gf->channels; c++) {
+				memset(audio->data[c], 0, audio->frames * sizeof(float));
 			}
 		}
 	}
@@ -465,7 +488,7 @@ void transcription_filter_update(void *data, obs_data_t *s)
 		if (gf->stenographer_enabled) {
 			obs_log(gf->log_level, "Stenographer enabled");
 			shutdown_whisper_thread(gf); // stop whisper
-			gf->stenographer_delay = (int)obs_data_get_int(s, "stenographer_delay");
+			gf->stenographer_delay_ms = (int)obs_data_get_int(s, "stenographer_delay");
 			gf->transcription_handler = new TranscriptionHandler(
 				gf, [gf](const std::string &type, const std::string &text,
 					 uint64_t start_timestamp, uint64_t end_timestamp) {

From 49538e7f8882da5332523121e6e6443e9a294daf Mon Sep 17 00:00:00 2001
From: Roy Shilkrot <roy.shil@gmail.com>
Date: Mon, 7 Oct 2024 15:47:10 -0400
Subject: [PATCH 06/20] refactor: Update stenographer interface buttons and add
 pause/resume functionality

---
 src/stenographer/stenographer_interface.html | 76 ++++++++++++++++++--
 1 file changed, 71 insertions(+), 5 deletions(-)

diff --git a/src/stenographer/stenographer_interface.html b/src/stenographer/stenographer_interface.html
index 0b5eea8..d50b2dc 100644
--- a/src/stenographer/stenographer_interface.html
+++ b/src/stenographer/stenographer_interface.html
@@ -15,11 +15,23 @@
 </head>
 <body>
     <h1>Stenographer Interface</h1>
-    <button id="startButton">Start Audio</button>
+    <button id="startAudioBtn">Start Audio</button>
+    <button id="pauseResumeBtn">Pause Audio</button>
+
+    <div id="connectionStatus">
+        Connection Status: <span id="statusText" class="status-disconnected">Disconnected</span>
+    </div>
+
+    <div id="audioStatus">
+        Audio Status: <span id="audioStatusText">Not started</span>
+    </div>
+
+    <div>
+        <p>Hotkey: Press Alt+P to pause/resume audio</p>
+    </div>
+
     <canvas id="visualizer"></canvas>
     <textarea id="captionInput" placeholder="Enter captions here..."></textarea>
-    <div id="connectionStatus">Connection Status: <span id="statusText" class="status-disconnected">Disconnected</span></div>
-    <div id="audioStatus">Audio Status: Not started</div>
 
     <script>
         const WS_URL = 'ws://localhost:9002';
@@ -43,6 +55,25 @@ <h1>Stenographer Interface</h1>
 
         let currentStartTimestamp = 0;
 
+        let isAudioPlaying = false;
+        let isPaused = false;
+
+        const startAudioBtn = document.getElementById('startAudioBtn');
+        const pauseResumeBtn = document.getElementById('pauseResumeBtn');
+        const statusText = document.getElementById('statusText');
+        const audioStatusText = document.getElementById('audioStatusText');
+
+        startAudioBtn.addEventListener('click', toggleAudio);
+        pauseResumeBtn.addEventListener('click', togglePause);
+
+        // Add event listener for the Alt+P key combination
+        document.addEventListener('keydown', (event) => {
+            if (event.altKey && (event.key === 'p' || event.key === 'P')) {
+                event.preventDefault(); // Prevent default Alt+P behavior
+                togglePause();
+            }
+        });
+
         function connectWebSocket() {
             ws = new WebSocket(WS_URL);
 
@@ -150,7 +181,7 @@ <h1>Stenographer Interface</h1>
                     end_timestamp: type === 'sentence' ? now : 0 // 0 for partial sentences
                 });
                 ws.send(message);
-                
+
                 if (type === 'sentence') {
                     currentStartTimestamp = now; // Reset start timestamp for next sentence
                 }
@@ -216,8 +247,43 @@ <h1>Stenographer Interface</h1>
             draw();
         }
 
+        function toggleAudio() {
+            if (!isAudioPlaying) {
+                // Logic to start audio (replace with actual WebSocket logic)
+                isAudioPlaying = true;
+                statusText.textContent = 'Connected';
+                statusText.className = 'status-connected';
+                audioStatusText.textContent = 'Playing';
+                startAudioBtn.textContent = 'Stop Audio';
+            } else {
+                // Logic to stop audio
+                isAudioPlaying = false;
+                isPaused = false;
+                statusText.textContent = 'Disconnected';
+                statusText.className = 'status-disconnected';
+                audioStatusText.textContent = 'Not started';
+                startAudioBtn.textContent = 'Start Audio';
+                pauseResumeBtn.textContent = 'Pause Audio';
+            }
+        }
+
+        function togglePause() {
+            if (!isAudioPlaying) return;
+
+            isPaused = !isPaused;
+            if (isPaused) {
+                // Logic to pause audio
+                audioStatusText.textContent = 'Paused';
+                pauseResumeBtn.textContent = 'Resume Audio';
+            } else {
+                // Logic to resume audio
+                audioStatusText.textContent = 'Playing';
+                pauseResumeBtn.textContent = 'Pause Audio';
+            }
+        }
+
         // Initial connection
         connectWebSocket();
     </script>
 </body>
-</html>
\ No newline at end of file
+</html>

From 37f84399041f5c43024db5188c7ab2e81c16e4f8 Mon Sep 17 00:00:00 2001
From: Roy Shilkrot <roy.shil@gmail.com>
Date: Mon, 7 Oct 2024 22:58:13 -0400
Subject: [PATCH 07/20] Revert "refactor: Update stenographer interface buttons
 and add pause/resume functionality"

This reverts commit 49538e7f8882da5332523121e6e6443e9a294daf.
---
 src/stenographer/stenographer_interface.html | 76 ++------------------
 1 file changed, 5 insertions(+), 71 deletions(-)

diff --git a/src/stenographer/stenographer_interface.html b/src/stenographer/stenographer_interface.html
index d50b2dc..0b5eea8 100644
--- a/src/stenographer/stenographer_interface.html
+++ b/src/stenographer/stenographer_interface.html
@@ -15,23 +15,11 @@
 </head>
 <body>
     <h1>Stenographer Interface</h1>
-    <button id="startAudioBtn">Start Audio</button>
-    <button id="pauseResumeBtn">Pause Audio</button>
-
-    <div id="connectionStatus">
-        Connection Status: <span id="statusText" class="status-disconnected">Disconnected</span>
-    </div>
-
-    <div id="audioStatus">
-        Audio Status: <span id="audioStatusText">Not started</span>
-    </div>
-
-    <div>
-        <p>Hotkey: Press Alt+P to pause/resume audio</p>
-    </div>
-
+    <button id="startButton">Start Audio</button>
     <canvas id="visualizer"></canvas>
     <textarea id="captionInput" placeholder="Enter captions here..."></textarea>
+    <div id="connectionStatus">Connection Status: <span id="statusText" class="status-disconnected">Disconnected</span></div>
+    <div id="audioStatus">Audio Status: Not started</div>
 
     <script>
         const WS_URL = 'ws://localhost:9002';
@@ -55,25 +43,6 @@ <h1>Stenographer Interface</h1>
 
         let currentStartTimestamp = 0;
 
-        let isAudioPlaying = false;
-        let isPaused = false;
-
-        const startAudioBtn = document.getElementById('startAudioBtn');
-        const pauseResumeBtn = document.getElementById('pauseResumeBtn');
-        const statusText = document.getElementById('statusText');
-        const audioStatusText = document.getElementById('audioStatusText');
-
-        startAudioBtn.addEventListener('click', toggleAudio);
-        pauseResumeBtn.addEventListener('click', togglePause);
-
-        // Add event listener for the Alt+P key combination
-        document.addEventListener('keydown', (event) => {
-            if (event.altKey && (event.key === 'p' || event.key === 'P')) {
-                event.preventDefault(); // Prevent default Alt+P behavior
-                togglePause();
-            }
-        });
-
         function connectWebSocket() {
             ws = new WebSocket(WS_URL);
 
@@ -181,7 +150,7 @@ <h1>Stenographer Interface</h1>
                     end_timestamp: type === 'sentence' ? now : 0 // 0 for partial sentences
                 });
                 ws.send(message);
-
+                
                 if (type === 'sentence') {
                     currentStartTimestamp = now; // Reset start timestamp for next sentence
                 }
@@ -247,43 +216,8 @@ <h1>Stenographer Interface</h1>
             draw();
         }
 
-        function toggleAudio() {
-            if (!isAudioPlaying) {
-                // Logic to start audio (replace with actual WebSocket logic)
-                isAudioPlaying = true;
-                statusText.textContent = 'Connected';
-                statusText.className = 'status-connected';
-                audioStatusText.textContent = 'Playing';
-                startAudioBtn.textContent = 'Stop Audio';
-            } else {
-                // Logic to stop audio
-                isAudioPlaying = false;
-                isPaused = false;
-                statusText.textContent = 'Disconnected';
-                statusText.className = 'status-disconnected';
-                audioStatusText.textContent = 'Not started';
-                startAudioBtn.textContent = 'Start Audio';
-                pauseResumeBtn.textContent = 'Pause Audio';
-            }
-        }
-
-        function togglePause() {
-            if (!isAudioPlaying) return;
-
-            isPaused = !isPaused;
-            if (isPaused) {
-                // Logic to pause audio
-                audioStatusText.textContent = 'Paused';
-                pauseResumeBtn.textContent = 'Resume Audio';
-            } else {
-                // Logic to resume audio
-                audioStatusText.textContent = 'Playing';
-                pauseResumeBtn.textContent = 'Pause Audio';
-            }
-        }
-
         // Initial connection
         connectWebSocket();
     </script>
 </body>
-</html>
+</html>
\ No newline at end of file

From bca37a7b16e62b0b37d85be29344762806e38bf8 Mon Sep 17 00:00:00 2001
From: Roy Shilkrot <roy.shil@gmail.com>
Date: Mon, 7 Oct 2024 23:37:16 -0400
Subject: [PATCH 08/20] refactor: Clear stenographer delay buffers when
 resetting caption state

This commit modifies the `reset_caption_state` function in `transcription-filter-callbacks.cpp` to clear the `stenographer_delay_buffers` when resetting the caption state. This ensures that the buffers are empty and ready for new data.

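Additionally, the comment on the `channels` field in `transcription-filter-data.h` now states that it is the number of channels in the input, and `enable_callback` calls `reset_caption_state` once before branching on `enabled` instead of once in each branch.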

These changes improve the functionality and maintainability of the code.
---
 src/transcription-filter-callbacks.cpp | 4 ++--
 src/transcription-filter-data.h        | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/transcription-filter-callbacks.cpp b/src/transcription-filter-callbacks.cpp
index 049c0b3..6a8169a 100644
--- a/src/transcription-filter-callbacks.cpp
+++ b/src/transcription-filter-callbacks.cpp
@@ -352,6 +352,7 @@ void reset_caption_state(transcription_filter_data *gf_)
 			if (gf_->input_buffers[c].data != nullptr) {
 				circlebuf_free(&gf_->input_buffers[c]);
 			}
+			gf_->stenographer_delay_buffers[c].clear();
 		}
 		if (gf_->info_buffer.data != nullptr) {
 			circlebuf_free(&gf_->info_buffer);
@@ -409,17 +410,16 @@ void enable_callback(void *data_, calldata_t *cd)
 {
 	transcription_filter_data *gf_ = static_cast<struct transcription_filter_data *>(data_);
 	bool enable = calldata_bool(cd, "enabled");
+	reset_caption_state(gf_);
 	if (enable) {
 		obs_log(gf_->log_level, "enable_callback: enable");
 		gf_->active = true;
-		reset_caption_state(gf_);
 		if (!gf_->stenographer_enabled) {
 			update_whisper_model(gf_);
 		}
 	} else {
 		obs_log(gf_->log_level, "enable_callback: disable");
 		gf_->active = false;
-		reset_caption_state(gf_);
 		shutdown_whisper_thread(gf_);
 	}
 }
diff --git a/src/transcription-filter-data.h b/src/transcription-filter-data.h
index 76d39e8..996faa5 100644
--- a/src/transcription-filter-data.h
+++ b/src/transcription-filter-data.h
@@ -25,7 +25,7 @@
 
 struct transcription_filter_data {
 	obs_source_t *context; // obs filter source (this filter)
-	size_t channels;       // number of channels
+	size_t channels;       // number of channels in the input
 	uint32_t sample_rate;  // input sample rate
 	// How many input frames (in input sample rate) are needed for the next whisper frame
 	size_t frames;

From aefd87d6c4d9eb59fd157bbde3be93fb4a609659 Mon Sep 17 00:00:00 2001
From: Roy Shilkrot <roy.shil@gmail.com>
Date: Thu, 10 Oct 2024 09:08:40 -0400
Subject: [PATCH 09/20] Refactor CMakeLists.txt and stenographer.cpp

This commit fixes the indentation of the existing `buildspec` and `arch` blocks in CMakeLists.txt; the build logic itself is unchanged. In stenographer.cpp, websocketpp/config/asio_no_tls.hpp is now included only when the platform is not Linux, and the unused parameter hdl in the message handler is explicitly marked as unused.
---
 CMakeLists.txt                    | 6 +++---
 src/stenographer/stenographer.cpp | 3 +++
 2 files changed, 6 insertions(+), 3 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 6501163..4e53fb6 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -102,7 +102,7 @@ target_link_libraries(${CMAKE_PROJECT_NAME} PRIVATE ICU)
 target_include_directories(${CMAKE_PROJECT_NAME} SYSTEM PUBLIC ${ICU_INCLUDE_DIR})
 
 if(NOT buildspec)
-file(READ "${CMAKE_CURRENT_SOURCE_DIR}/buildspec.json" buildspec)
+  file(READ "${CMAKE_CURRENT_SOURCE_DIR}/buildspec.json" buildspec)
 endif()
 string(
   JSON
@@ -113,9 +113,9 @@ string(
   prebuilt
   version)
 if(MSVC)
-set(arch ${CMAKE_GENERATOR_PLATFORM})
+  set(arch ${CMAKE_GENERATOR_PLATFORM})
 elseif(APPLE)
-set(arch universal)
+  set(arch universal)
 endif()
 set(deps_root "${CMAKE_CURRENT_SOURCE_DIR}/.deps/obs-deps-${version}-${arch}")
 message(STATUS "deps_root: ${deps_root}")
diff --git a/src/stenographer/stenographer.cpp b/src/stenographer/stenographer.cpp
index 269dee1..c11ac80 100644
--- a/src/stenographer/stenographer.cpp
+++ b/src/stenographer/stenographer.cpp
@@ -8,7 +8,9 @@
 #define ASIO_STANDALONE
 #define _WEBSOCKETPP_CPP11_TYPE_TRAITS_
 
+#ifndef __linux__
 #include <websocketpp/config/asio_no_tls.hpp>
+#endif
 #include <websocketpp/server.hpp>
 #include <nlohmann/json.hpp>
 #include <queue>
@@ -54,6 +56,7 @@ class TranscriptionHandler::Impl {
 
 		server.set_message_handler(
 			[this](websocketpp::connection_hdl hdl, server::message_ptr msg) {
+				UNUSED_PARAMETER(hdl);
 				handleIncomingMessage(msg->get_payload());
 			});
 

From 1de935758f2e369b863a58c69b67a818f3918796 Mon Sep 17 00:00:00 2001
From: Roy Shilkrot <roy.shil@gmail.com>
Date: Thu, 10 Oct 2024 10:48:19 -0400
Subject: [PATCH 10/20] Refactor CMakeLists.txt and stenographer.cpp

---
 CMakeLists.txt               | 41 ++++++++++++++++++++----------------
 cmake/FetchWebsocketpp.cmake |  9 ++++++++
 2 files changed, 32 insertions(+), 18 deletions(-)
 create mode 100644 cmake/FetchWebsocketpp.cmake

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 4e53fb6..3cd971d 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -101,25 +101,30 @@ include(cmake/BuildICU.cmake)
 target_link_libraries(${CMAKE_PROJECT_NAME} PRIVATE ICU)
 target_include_directories(${CMAKE_PROJECT_NAME} SYSTEM PUBLIC ${ICU_INCLUDE_DIR})
 
-if(NOT buildspec)
-  file(READ "${CMAKE_CURRENT_SOURCE_DIR}/buildspec.json" buildspec)
-endif()
-string(
-  JSON
-  version
-  GET
-  ${buildspec}
-  dependencies
-  prebuilt
-  version)
-if(MSVC)
-  set(arch ${CMAKE_GENERATOR_PLATFORM})
-elseif(APPLE)
-  set(arch universal)
+if(MSVC or APPLE)
+  if(NOT buildspec)
+    file(READ "${CMAKE_CURRENT_SOURCE_DIR}/buildspec.json" buildspec)
+  endif()
+  string(
+    JSON
+    version
+    GET
+    ${buildspec}
+    dependencies
+    prebuilt
+    version)
+  if(MSVC)
+    set(arch ${CMAKE_GENERATOR_PLATFORM})
+  elseif(APPLE)
+    set(arch universal)
+  endif()
+  set(deps_root "${CMAKE_CURRENT_SOURCE_DIR}/.deps/obs-deps-${version}-${arch}")
+  message(STATUS "deps_root: ${deps_root}")
+  target_include_directories(${CMAKE_PROJECT_NAME} PRIVATE "${deps_root}/include")
+else()
+  include(cmake/FetchWebsocketpp.cmake)
+  target_include_directories(${CMAKE_PROJECT_NAME} PRIVATE ${websocketpp_SOURCE_DIR}/)
 endif()
-set(deps_root "${CMAKE_CURRENT_SOURCE_DIR}/.deps/obs-deps-${version}-${arch}")
-message(STATUS "deps_root: ${deps_root}")
-target_include_directories(${CMAKE_PROJECT_NAME} PRIVATE "${deps_root}/include")
 
 target_sources(
   ${CMAKE_PROJECT_NAME}
diff --git a/cmake/FetchWebsocketpp.cmake b/cmake/FetchWebsocketpp.cmake
new file mode 100644
index 0000000..1c60239
--- /dev/null
+++ b/cmake/FetchWebsocketpp.cmake
@@ -0,0 +1,9 @@
+include(FetchContent)
+
+FetchContent_Declare(
+  websocketpp
+  URL https://github.com/zaphoyd/websocketpp/archive/refs/tags/0.8.2.tar.gz
+  URL_HASH SHA256=6ce889d85ecdc2d8fa07408d6787e7352510750daa66b5ad44aacb47bea76755
+)
+
+FetchContent_MakeAvailable(websocketpp)

From 0c1ee7050072782b92a595b7e6620a39da81f0cf Mon Sep 17 00:00:00 2001
From: Roy Shilkrot <roy.shil@gmail.com>
Date: Thu, 10 Oct 2024 11:05:54 -0400
Subject: [PATCH 11/20] Refactor CMakeLists.txt and stenographer.cpp

---
 CMakeLists.txt                    |  3 ++-
 cmake/FetchWebsocketpp.cmake      | 25 ++++++++++++++++---------
 cmake/linux/compilerconfig.cmake  |  8 ++++----
 src/stenographer/stenographer.cpp |  2 --
 4 files changed, 22 insertions(+), 16 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 3cd971d..5c4890c 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -101,7 +101,7 @@ include(cmake/BuildICU.cmake)
 target_link_libraries(${CMAKE_PROJECT_NAME} PRIVATE ICU)
 target_include_directories(${CMAKE_PROJECT_NAME} SYSTEM PUBLIC ${ICU_INCLUDE_DIR})
 
-if(MSVC or APPLE)
+if(WIN32 OR APPLE)
   if(NOT buildspec)
     file(READ "${CMAKE_CURRENT_SOURCE_DIR}/buildspec.json" buildspec)
   endif()
@@ -124,6 +124,7 @@ if(MSVC or APPLE)
 else()
   include(cmake/FetchWebsocketpp.cmake)
   target_include_directories(${CMAKE_PROJECT_NAME} PRIVATE ${websocketpp_SOURCE_DIR}/)
+  target_include_directories(${CMAKE_PROJECT_NAME} PRIVATE ${asio_SOURCE_DIR}/)
 endif()
 
 target_sources(
diff --git a/cmake/FetchWebsocketpp.cmake b/cmake/FetchWebsocketpp.cmake
index 1c60239..24d6451 100644
--- a/cmake/FetchWebsocketpp.cmake
+++ b/cmake/FetchWebsocketpp.cmake
@@ -1,9 +1,16 @@
-include(FetchContent)
-
-FetchContent_Declare(
-  websocketpp
-  URL https://github.com/zaphoyd/websocketpp/archive/refs/tags/0.8.2.tar.gz
-  URL_HASH SHA256=6ce889d85ecdc2d8fa07408d6787e7352510750daa66b5ad44aacb47bea76755
-)
-
-FetchContent_MakeAvailable(websocketpp)
+include(FetchContent)
+
+FetchContent_Declare(
+  websocketpp
+  URL https://github.com/zaphoyd/websocketpp/archive/refs/tags/0.8.2.tar.gz
+  URL_HASH SHA256=6ce889d85ecdc2d8fa07408d6787e7352510750daa66b5ad44aacb47bea76755)
+
+FetchContent_MakeAvailable(websocketpp)
+
+# Fetch ASIO
+FetchContent_Declare(
+  asio
+  URL https://github.com/chriskohlhoff/asio/archive/asio-1-28-0.tar.gz
+  URL_HASH SHA256=1ef87b17e5e32f1a1b4cd840acac6c2a8d0dcde365dde3f9dcd5d1eae0495290)
+
+FetchContent_MakeAvailable(websocketpp asio)
diff --git a/cmake/linux/compilerconfig.cmake b/cmake/linux/compilerconfig.cmake
index 647c4b3..8931ba3 100644
--- a/cmake/linux/compilerconfig.cmake
+++ b/cmake/linux/compilerconfig.cmake
@@ -21,6 +21,7 @@ set(_obs_gcc_c_options
     -Wformat-security
     -Wno-conversion
     -Wno-deprecated-declarations
+    -Wno-error=conversion
     -Wno-error=deprecated-declarations
     -Wno-float-conversion
     -Wno-implicit-fallthrough
@@ -42,14 +43,13 @@ set(_obs_gcc_c_options
     -Wvla)
 
 # gcc options for C++
-set(_obs_gcc_cxx_options
-    # cmake-format: sortable
-    ${_obs_gcc_c_options} -Wconversion -Wfloat-conversion -Winvalid-offsetof -Wno-overloaded-virtual)
+set(_obs_gcc_cxx_options # cmake-format: sortable
+                         ${_obs_gcc_c_options} -Winvalid-offsetof -Wno-overloaded-virtual)
 
 add_compile_options(
   -fopenmp-simd
   "$<$<COMPILE_LANG_AND_ID:C,GNU>:${_obs_gcc_c_options}>"
-  "$<$<COMPILE_LANG_AND_ID:C,GNU>:-Wint-conversion;-Wno-missing-prototypes;-Wno-strict-prototypes;-Wpointer-sign>"
+  "$<$<COMPILE_LANG_AND_ID:C,GNU>:-Wno-missing-prototypes;-Wno-strict-prototypes;-Wpointer-sign>"
   "$<$<COMPILE_LANG_AND_ID:CXX,GNU>:${_obs_gcc_cxx_options}>"
   "$<$<COMPILE_LANG_AND_ID:C,Clang>:${_obs_clang_c_options}>"
   "$<$<COMPILE_LANG_AND_ID:CXX,Clang>:${_obs_clang_cxx_options}>")
diff --git a/src/stenographer/stenographer.cpp b/src/stenographer/stenographer.cpp
index c11ac80..f5a57a9 100644
--- a/src/stenographer/stenographer.cpp
+++ b/src/stenographer/stenographer.cpp
@@ -8,9 +8,7 @@
 #define ASIO_STANDALONE
 #define _WEBSOCKETPP_CPP11_TYPE_TRAITS_
 
-#ifndef __linux__
 #include <websocketpp/config/asio_no_tls.hpp>
-#endif
 #include <websocketpp/server.hpp>
 #include <nlohmann/json.hpp>
 #include <queue>

From 6ccc44b9f95ccb8d029f4cde7d5500e0e4fab240 Mon Sep 17 00:00:00 2001
From: Roy Shilkrot <roy.shil@gmail.com>
Date: Thu, 10 Oct 2024 11:08:29 -0400
Subject: [PATCH 12/20] Update asio URL_HASH in FetchWebsocketpp.cmake

---
 cmake/FetchWebsocketpp.cmake | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cmake/FetchWebsocketpp.cmake b/cmake/FetchWebsocketpp.cmake
index 24d6451..1cb63d7 100644
--- a/cmake/FetchWebsocketpp.cmake
+++ b/cmake/FetchWebsocketpp.cmake
@@ -11,6 +11,6 @@ FetchContent_MakeAvailable(websocketpp)
 FetchContent_Declare(
   asio
   URL https://github.com/chriskohlhoff/asio/archive/asio-1-28-0.tar.gz
-  URL_HASH SHA256=1ef87b17e5e32f1a1b4cd840acac6c2a8d0dcde365dde3f9dcd5d1eae0495290)
+  URL_HASH SHA256=226438b0798099ad2a202563a83571ce06dd13b570d8fded4840dbc1f97fa328)
 
 FetchContent_MakeAvailable(websocketpp asio)

From 283bf341675dbff9f49f43d4a51e172fba749be1 Mon Sep 17 00:00:00 2001
From: Roy Shilkrot <roy.shil@gmail.com>
Date: Thu, 10 Oct 2024 11:10:12 -0400
Subject: [PATCH 13/20] Refactor CMakeLists.txt to include the correct path for
 asio library

---
 CMakeLists.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 5c4890c..f86be5b 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -124,7 +124,7 @@ if(WIN32 OR APPLE)
 else()
   include(cmake/FetchWebsocketpp.cmake)
   target_include_directories(${CMAKE_PROJECT_NAME} PRIVATE ${websocketpp_SOURCE_DIR}/)
-  target_include_directories(${CMAKE_PROJECT_NAME} PRIVATE ${asio_SOURCE_DIR}/)
+  target_include_directories(${CMAKE_PROJECT_NAME} PRIVATE ${asio_SOURCE_DIR}/asio/include/)
 endif()
 
 target_sources(

From f5dc4c89d705f93a99ef1cbbca521e060bad1692 Mon Sep 17 00:00:00 2001
From: Roy Shilkrot <roy.shil@gmail.com>
Date: Thu, 10 Oct 2024 11:12:40 -0400
Subject: [PATCH 14/20] Refactor WebSocket server initialization and
 communication in stenographer.cpp

---
 src/stenographer/stenographer.cpp | 24 ++++++++++++------------
 1 file changed, 12 insertions(+), 12 deletions(-)

diff --git a/src/stenographer/stenographer.cpp b/src/stenographer/stenographer.cpp
index f5a57a9..95550eb 100644
--- a/src/stenographer/stenographer.cpp
+++ b/src/stenographer/stenographer.cpp
@@ -45,14 +45,14 @@ class TranscriptionHandler::Impl {
 		  messageCallback(callback),
 		  running(false)
 	{
-		server.init_asio();
+		wsServer.init_asio();
 
-		server.set_open_handler([this](websocketpp::connection_hdl hdl) {
+		wsServer.set_open_handler([this](websocketpp::connection_hdl hdl) {
 			std::lock_guard<std::mutex> lock(mutex);
 			connection = hdl;
 		});
 
-		server.set_message_handler(
+		wsServer.set_message_handler(
 			[this](websocketpp::connection_hdl hdl, server::message_ptr msg) {
 				UNUSED_PARAMETER(hdl);
 				handleIncomingMessage(msg->get_payload());
@@ -69,9 +69,9 @@ class TranscriptionHandler::Impl {
 		if (!running) {
 			running = true;
 			serverThread = std::async(std::launch::async, [this]() {
-				server.listen(9002);
-				server.start_accept();
-				server.run();
+				wsServer.listen(9002);
+				wsServer.start_accept();
+				wsServer.run();
 			});
 
 			processingThread =
@@ -83,7 +83,7 @@ class TranscriptionHandler::Impl {
 	{
 		if (running) {
 			running = false;
-			server.stop();
+			wsServer.stop();
 			if (serverThread.valid())
 				serverThread.wait();
 			if (processingThread.valid())
@@ -93,7 +93,7 @@ class TranscriptionHandler::Impl {
 
 private:
 	transcription_filter_data *gf;
-	server server;
+	server wsServer;
 	websocketpp::connection_hdl connection;
 	MessageCallback messageCallback;
 	std::queue<std::vector<int16_t>> audioQueue;
@@ -149,8 +149,8 @@ class TranscriptionHandler::Impl {
 						       start_timestamp_offset_ns},
 						      {"end_timestamp", end_timestamp_offset_ns}};
 				if (connection.lock()) {
-					server.send(connection, timestampInfo.dump(),
-						    websocketpp::frame::opcode::text);
+					wsServer.send(connection, timestampInfo.dump(),
+						      websocketpp::frame::opcode::text);
 				}
 				sendAudioData(pcmData);
 			} else {
@@ -192,8 +192,8 @@ class TranscriptionHandler::Impl {
 				std::memcpy(wavData.data() + sizeof(WAVHeader), audioBuffer.data(),
 					    wavHeader.data_size);
 
-				server.send(connection, wavData.data(), wavData.size(),
-					    websocketpp::frame::opcode::binary);
+				wsServer.send(connection, wavData.data(), wavData.size(),
+					      websocketpp::frame::opcode::binary);
 
 				audioBuffer.clear();
 			}

From e979fca72f2c9ee4d3e4a52e60a853f8906ddb73 Mon Sep 17 00:00:00 2001
From: Roy Shilkrot <roy.shil@gmail.com>
Date: Thu, 10 Oct 2024 14:58:42 -0400
Subject: [PATCH 15/20] Refactor stenographer-util.h and stenographer-util.cpp

---
 CMakeLists.txt                               |  3 +-
 src/stenographer/stenographer-util.cpp       | 67 ++++++++++++++++++++
 src/stenographer/stenographer-util.h         | 10 +++
 src/stenographer/stenographer_interface.html | 24 ++++---
 src/transcription-filter-callbacks.cpp       |  2 -
 src/transcription-filter-properties.cpp      |  4 +-
 src/transcription-filter.cpp                 | 46 +-------------
 src/transcription-utils.h                    | 28 ++++++++
 8 files changed, 126 insertions(+), 58 deletions(-)
 create mode 100644 src/stenographer/stenographer-util.cpp
 create mode 100644 src/stenographer/stenographer-util.h

diff --git a/CMakeLists.txt b/CMakeLists.txt
index f86be5b..ab3efb3 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -153,7 +153,8 @@ target_sources(
           src/ui/filter-replace-utils.cpp
           src/translation/translation-language-utils.cpp
           src/ui/filter-replace-dialog.cpp
-          src/stenographer/stenographer.cpp)
+          src/stenographer/stenographer.cpp
+          src/stenographer/stenographer-util.cpp)
 
 set_target_properties_plugin(${CMAKE_PROJECT_NAME} PROPERTIES OUTPUT_NAME ${_name})
 
diff --git a/src/stenographer/stenographer-util.cpp b/src/stenographer/stenographer-util.cpp
new file mode 100644
index 0000000..59278cb
--- /dev/null
+++ b/src/stenographer/stenographer-util.cpp
@@ -0,0 +1,67 @@
+
+#include "stenographer-util.h"
+#include "transcription-filter-data.h"
+#include "transcription-utils.h"
+
+#include <obs.h>
+
+#include <cstring>
+#include <vector>
+
+/**
+ * @brief Applies a simple delay to the audio data for stenographer mode.
+ *
+ * This function stores the incoming audio data in a buffer and processes it after a specified delay.
+ * The delayed audio data is then emitted, replacing the original audio data in the buffer.
+ * If the buffer does not yet contain enough data to satisfy the delay, the audio buffer is filled with silence.
+ *
+ * @param gf Pointer to the transcription filter data structure containing the delay buffer and configuration.
+ * @param audio Pointer to the audio data structure containing the audio frames to be processed.
+ * @return Pointer to the processed audio data structure with the applied delay.
+ */
+struct obs_audio_data *stenographer_simple_delay(transcription_filter_data *gf,
+						 struct obs_audio_data *audio)
+{
+	// Stenographer mode - apply delay.
+	// Store the audio data in a buffer and process it after the delay.
+	// push the data to the back of gf->stenographer_delay_buffer
+	for (size_t c = 0; c < gf->channels; c++) {
+		// take a audio->frames * sizeof(float) bytes chunk from audio->data[c] and push it
+		// to the back of the buffer as a float
+		std::vector<float> audio_data_chunk((float *)audio->data[c],
+						    ((float *)audio->data[c]) + audio->frames);
+		gf->stenographer_delay_buffers[c].insert(gf->stenographer_delay_buffers[c].end(),
+							 audio_data_chunk.begin(),
+							 audio_data_chunk.end());
+	}
+
+	// If the buffer is larger than the delay, emit the oldest data
+	// Take from the buffer as much as requested by the incoming audio data
+	size_t delay_frames =
+		(size_t)((float)gf->sample_rate * (float)gf->stenographer_delay_ms / 1000.0f) +
+		audio->frames;
+
+	if (gf->stenographer_delay_buffers[0].size() >= delay_frames) {
+		// Replace data on the audio buffer with the delayed data
+		for (size_t c = 0; c < gf->channels; c++) {
+			// take exatcly audio->frames from the buffer
+			std::vector<float> audio_data(gf->stenographer_delay_buffers[c].begin(),
+						      gf->stenographer_delay_buffers[c].begin() +
+							      audio->frames);
+			// remove the oldest buffers from the delay buffer
+			gf->stenographer_delay_buffers[c].erase(
+				gf->stenographer_delay_buffers[c].begin(),
+				gf->stenographer_delay_buffers[c].begin() + audio->frames);
+
+			// replace the data on the audio buffer with the delayed data
+			memcpy(audio->data[c], audio_data.data(),
+			       audio_data.size() * sizeof(float));
+		}
+	} else {
+		// Fill the audio buffer with silence
+		for (size_t c = 0; c < gf->channels; c++) {
+			memset(audio->data[c], 0, audio->frames * sizeof(float));
+		}
+	}
+	return audio;
+}
diff --git a/src/stenographer/stenographer-util.h b/src/stenographer/stenographer-util.h
new file mode 100644
index 0000000..3d0fd27
--- /dev/null
+++ b/src/stenographer/stenographer-util.h
@@ -0,0 +1,10 @@
+#ifndef STENOGRAPHER_UTIL_H
+#define STENOGRAPHER_UTIL_H
+
+struct transcription_filter_data;
+struct obs_audio_data;
+
+struct obs_audio_data *stenographer_simple_delay(transcription_filter_data *gf,
+						 struct obs_audio_data *audio);
+
+#endif /* STENOGRAPHER_UTIL_H */
\ No newline at end of file
diff --git a/src/stenographer/stenographer_interface.html b/src/stenographer/stenographer_interface.html
index 0b5eea8..c6ca386 100644
--- a/src/stenographer/stenographer_interface.html
+++ b/src/stenographer/stenographer_interface.html
@@ -15,14 +15,18 @@
 </head>
 <body>
     <h1>Stenographer Interface</h1>
-    <button id="startButton">Start Audio</button>
+    <div>
+        <input type="text" id="wsUrl" value="ws://localhost:9002" placeholder="Enter WebSocket URL">
+        <button id="startButton">Start Audio</button>
+    </div>
     <canvas id="visualizer"></canvas>
-    <textarea id="captionInput" placeholder="Enter captions here..."></textarea>
+    <div>Timestamp (s): <span id="timestamp"></span></div>
+    <textarea id="captionInput" placeholder="Enter captions here... (Enter to commit)"></textarea>
     <div id="connectionStatus">Connection Status: <span id="statusText" class="status-disconnected">Disconnected</span></div>
     <div id="audioStatus">Audio Status: Not started</div>
 
     <script>
-        const WS_URL = 'ws://localhost:9002';
+        const WS_URL = document.getElementById('wsUrl').value;
         let ws;
         const captionInput = document.getElementById('captionInput');
         const audioStatus = document.getElementById('audioStatus');
@@ -41,7 +45,8 @@ <h1>Stenographer Interface</h1>
         visualizer.width = visualizer.offsetWidth;
         visualizer.height = visualizer.offsetHeight;
 
-        let currentStartTimestamp = 0;
+        let currentTimestampNSRelative = 0;
+        let startTimestampNSRelative = 0;
 
         function connectWebSocket() {
             ws = new WebSocket(WS_URL);
@@ -63,7 +68,9 @@ <h1>Stenographer Interface</h1>
                     // This is our timestamp information
                     try {
                         const timestampInfo = JSON.parse(event.data);
-                        currentStartTimestamp = timestampInfo.start_timestamp;
+                        currentTimestampNSRelative = timestampInfo.start_timestamp;
+                        const timestampInSeconds = (currentTimestampNSRelative / 1e9).toFixed(2);
+                        document.getElementById('timestamp').textContent = `${timestampInSeconds} seconds`;
                     } catch (error) {
                         console.error('Error parsing timestamp information:', error);
                     }
@@ -142,17 +149,16 @@ <h1>Stenographer Interface</h1>
 
         function sendCaptionUpdate(type) {
             if (ws.readyState === WebSocket.OPEN) {
-                const now = Date.now(); // Keep in milliseconds
                 const message = JSON.stringify({
                     type: type,
                     text: captionInput.value,
-                    start_timestamp: currentStartTimestamp,
-                    end_timestamp: type === 'sentence' ? now : 0 // 0 for partial sentences
+                    start_timestamp: startTimestampNSRelative,
+                    end_timestamp: currentTimestampNSRelative
                 });
                 ws.send(message);
                 
                 if (type === 'sentence') {
-                    currentStartTimestamp = now; // Reset start timestamp for next sentence
+                    startTimestampNSRelative = currentTimestampNSRelative; // Reset start timestamp for next sentence
                 }
             } else {
                 console.warn('Cannot send caption update: WebSocket is not connected');
diff --git a/src/transcription-filter-callbacks.cpp b/src/transcription-filter-callbacks.cpp
index 9d14376..c434e85 100644
--- a/src/transcription-filter-callbacks.cpp
+++ b/src/transcription-filter-callbacks.cpp
@@ -213,9 +213,7 @@ void set_text_callback(struct transcription_filter_data *gf,
 		str_copy = fix_utf8(str_copy);
 	} else {
 		// only remove leading and trailing non-alphanumeric characters if the output is English
-		obs_log(LOG_INFO, "before: %s", str_copy.c_str());
 		str_copy = remove_leading_trailing_nonalpha(str_copy);
-		obs_log(LOG_INFO, "after: %s", str_copy.c_str());
 	}
 
 	// if suppression is enabled, check if the text is in the suppression list
diff --git a/src/transcription-filter-properties.cpp b/src/transcription-filter-properties.cpp
index 332173b..a96b600 100644
--- a/src/transcription-filter-properties.cpp
+++ b/src/transcription-filter-properties.cpp
@@ -515,7 +515,7 @@ void add_stenographer_group_properties(obs_properties_t *ppts)
 
 	// add delay amount for partial transcription
 	obs_properties_add_int_slider(stenographer_group, "stenographer_delay",
-				      MT_("stenographer_delay"), 1000, 12000, 100);
+				      MT_("stenographer_delay"), 0, 12000, 100);
 }
 
 void add_partial_group_properties(obs_properties_t *ppts)
@@ -610,7 +610,7 @@ void transcription_filter_defaults(obs_data_t *s)
 	obs_data_set_default_bool(s, "partial_group", false);
 	obs_data_set_default_int(s, "partial_latency", 1100);
 	obs_data_set_default_bool(s, "stenographer_group", false);
-	obs_data_set_default_int(s, "stenographer_delay", 10000);
+	obs_data_set_default_int(s, "stenographer_delay", 3000);
 
 	// translation options
 	obs_data_set_default_double(s, "translation_sampling_temperature", 0.1);
diff --git a/src/transcription-filter.cpp b/src/transcription-filter.cpp
index 25cbba9..4a3628a 100644
--- a/src/transcription-filter.cpp
+++ b/src/transcription-filter.cpp
@@ -31,6 +31,7 @@
 #include "translation/translation-includes.h"
 #include "ui/filter-replace-dialog.h"
 #include "ui/filter-replace-utils.h"
+#include "stenographer/stenographer-util.h"
 
 void set_source_signals(transcription_filter_data *gf, obs_source_t *parent_source)
 {
@@ -116,50 +117,7 @@ struct obs_audio_data *transcription_filter_filter_audio(void *data, struct obs_
 
 	if (gf->stenographer_enabled) {
 		// Stenographer mode - apply delay.
-		// Store the audio data in a buffer and process it after the delay.
-		// push the data to the back of gf->stenographer_delay_buffer
-		for (size_t c = 0; c < gf->channels; c++) {
-			// take a audio->frames * sizeof(float) bytes chunk from audio->data[c] and push it
-			// to the back of the buffer as a float
-			std::vector<float> audio_data_chunk(
-				(float *)audio->data[c], ((float *)audio->data[c]) + audio->frames);
-			gf->stenographer_delay_buffers[c].insert(
-				gf->stenographer_delay_buffers[c].end(), audio_data_chunk.begin(),
-				audio_data_chunk.end());
-		}
-
-		// If the buffer is larger than the delay, emit the oldest data
-		// Take from the buffer as much as requested by the incoming audio data
-		size_t delay_frames = (size_t)((float)gf->sample_rate *
-					       (float)gf->stenographer_delay_ms / 1000.0f) +
-				      audio->frames;
-		if (gf->stenographer_delay_buffers[0].size() >= delay_frames) {
-			obs_log(LOG_INFO,
-				"Stenographer delay buffer filled %lu/%lu. Sending %lu frames",
-				gf->stenographer_delay_buffers[0].size(), delay_frames,
-				audio->frames);
-			// Replace data on the audio buffer with the delayed data
-			for (size_t c = 0; c < gf->channels; c++) {
-				// Take the oldest audio->frames from the buffer and put it in the audio buffer
-				// as bytes
-				std::vector<float> audio_data_chunk(
-					gf->stenographer_delay_buffers[c].begin(),
-					gf->stenographer_delay_buffers[c].begin() + audio->frames);
-				memcpy(audio->data[c], audio_data_chunk.data(),
-				       audio->frames * sizeof(float));
-				// Remove the oldest audio->frames from the buffer
-				gf->stenographer_delay_buffers[c].erase(
-					gf->stenographer_delay_buffers[c].begin(),
-					gf->stenographer_delay_buffers[c].begin() + audio->frames);
-			}
-		} else {
-			obs_log(LOG_INFO, "Stenographer delay buffer not filled yet %lu/%lu",
-				gf->stenographer_delay_buffers[0].size(), delay_frames);
-			// Fill the audio buffer with silence
-			for (size_t c = 0; c < gf->channels; c++) {
-				memset(audio->data[c], 0, audio->frames * sizeof(float));
-			}
-		}
+		return stenographer_simple_delay(gf, audio);
 	}
 
 	return audio;
diff --git a/src/transcription-utils.h b/src/transcription-utils.h
index 5fdd0cf..437cd6d 100644
--- a/src/transcription-utils.h
+++ b/src/transcription-utils.h
@@ -32,6 +32,34 @@ inline uint64_t now_ns()
 		.count();
 }
 
+/**
+ * @brief Calculates the elapsed time in nanoseconds since a given start time.
+ *
+ * This function takes a starting time in nanoseconds and returns the 
+ * difference between the current time and the starting time.
+ *
+ * @param start_ns The starting time in nanoseconds.
+ * @return The elapsed time in nanoseconds since the start time.
+ */
+inline uint64_t ns_since(uint64_t start_ns)
+{
+	return now_ns() - start_ns;
+}
+
+/**
+ * @brief Calculates the elapsed time in milliseconds since a given start time.
+ *
+ * This function takes a start time in milliseconds and returns the difference
+ * between the current time (in milliseconds) and the start time.
+ *
+ * @param start_ms The start time in milliseconds.
+ * @return The elapsed time in milliseconds since the start time.
+ */
+inline uint64_t ms_since(uint64_t start_ms)
+{
+	return now_ms() - start_ms;
+}
+
 // Split a string into words based on spaces
 std::vector<std::string> split_words(const std::string &str_copy);
 

From f173aa879d8f9e4ca1a7a8a11c0513c1c1d16aed Mon Sep 17 00:00:00 2001
From: Roy Shilkrot <roy.shil@gmail.com>
Date: Thu, 10 Oct 2024 16:01:28 -0400
Subject: [PATCH 16/20] Fix wspp cmake

---
 CMakeLists.txt               |  2 +-
 cmake/FetchWebsocketpp.cmake | 10 +++++++++-
 2 files changed, 10 insertions(+), 2 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index ab3efb3..6784e0f 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -123,7 +123,7 @@ if(WIN32 OR APPLE)
   target_include_directories(${CMAKE_PROJECT_NAME} PRIVATE "${deps_root}/include")
 else()
   include(cmake/FetchWebsocketpp.cmake)
-  target_include_directories(${CMAKE_PROJECT_NAME} PRIVATE ${websocketpp_SOURCE_DIR}/)
+  target_link_libraries(${CMAKE_PROJECT_NAME} PRIVATE websocketpp)
   target_include_directories(${CMAKE_PROJECT_NAME} PRIVATE ${asio_SOURCE_DIR}/asio/include/)
 endif()
 
diff --git a/cmake/FetchWebsocketpp.cmake b/cmake/FetchWebsocketpp.cmake
index 1cb63d7..4ee7d46 100644
--- a/cmake/FetchWebsocketpp.cmake
+++ b/cmake/FetchWebsocketpp.cmake
@@ -5,7 +5,15 @@ FetchContent_Declare(
   URL https://github.com/zaphoyd/websocketpp/archive/refs/tags/0.8.2.tar.gz
   URL_HASH SHA256=6ce889d85ecdc2d8fa07408d6787e7352510750daa66b5ad44aacb47bea76755)
 
-FetchContent_MakeAvailable(websocketpp)
+# Only download the content, don't configure or build it
+FetchContent_GetProperties(websocketpp)
+if(NOT websocketpp_POPULATED)
+  FetchContent_Populate(websocketpp)
+endif()
+
+# Add WebSocket++ as an interface library
+add_library(websocketpp INTERFACE)
+target_include_directories(websocketpp INTERFACE ${websocketpp_SOURCE_DIR})
 
 # Fetch ASIO
 FetchContent_Declare(

From fe8875b42b6b4dc17019dd9b5069ac6f95781509 Mon Sep 17 00:00:00 2001
From: Roy Shilkrot <roy.shil@gmail.com>
Date: Mon, 14 Oct 2024 10:26:12 -0400
Subject: [PATCH 17/20] Stop stripping leading punctuation from the
 detokenized translation result in translation.cpp

---
 src/translation/translation.cpp | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/translation/translation.cpp b/src/translation/translation.cpp
index 0701d95..2c7f0bb 100644
--- a/src/translation/translation.cpp
+++ b/src/translation/translation.cpp
@@ -203,7 +203,8 @@ int translate(struct translation_context &translation_ctx, const std::string &te
 
 		// detokenize
 		const std::string result_ = translation_ctx.detokenizer(translation_tokens);
-		result = remove_start_punctuation(result_);
+		// result = remove_start_punctuation(result_);
+		result = result_;
 	} catch (std::exception &e) {
 		obs_log(LOG_ERROR, "Error: %s", e.what());
 		return OBS_POLYGLOT_TRANSLATION_FAIL;

From 5d441fcf90e096fa5bb6aceda1e9e0909ac8b441 Mon Sep 17 00:00:00 2001
From: Roy Shilkrot <roy.shil@gmail.com>
Date: Mon, 14 Oct 2024 11:15:54 -0400
Subject: [PATCH 18/20] Refactor translation.cpp to improve detokenization
 logic and handle translation context

- Remove commented code and improve detokenization logic in translation.cpp
- Add support for translation context by adding last input tokens and last translation tokens
- Add option to remove punctuation from the start of translations
---
 src/transcription-filter-callbacks.cpp  |  4 +-
 src/transcription-filter-properties.cpp |  4 ++
 src/transcription-filter.cpp            |  5 +-
 src/translation/translation.cpp         | 69 ++++++++++++++++++-------
 src/translation/translation.h           |  2 +
 src/whisper-utils/resample-utils.cpp    |  7 ++-
 6 files changed, 65 insertions(+), 26 deletions(-)

diff --git a/src/transcription-filter-callbacks.cpp b/src/transcription-filter-callbacks.cpp
index ec6e791..0b9a8b2 100644
--- a/src/transcription-filter-callbacks.cpp
+++ b/src/transcription-filter-callbacks.cpp
@@ -239,8 +239,10 @@ void set_text_callback(struct transcription_filter_data *gf,
 		gf->translate_only_full_sentences ? result.result == DETECTION_RESULT_SPEECH : true;
 
 	// send the sentence to translation (if enabled)
+	std::string source_language = result.language.empty() ? gf->whisper_params.language
+							      : result.language;
 	std::string translated_sentence =
-		should_translate ? send_sentence_to_translation(str_copy, gf, result.language) : "";
+		should_translate ? send_sentence_to_translation(str_copy, gf, source_language) : "";
 
 	if (gf->translate) {
 		if (gf->translation_output == "none") {
diff --git a/src/transcription-filter-properties.cpp b/src/transcription-filter-properties.cpp
index a85a742..434c63c 100644
--- a/src/transcription-filter-properties.cpp
+++ b/src/transcription-filter-properties.cpp
@@ -259,6 +259,9 @@ void add_translation_group_properties(obs_properties_t *ppts)
 				      MT_("translation_max_input_length"), 1, 100, 5);
 	obs_properties_add_int_slider(translation_group, "translation_no_repeat_ngram_size",
 				      MT_("translation_no_repeat_ngram_size"), 1, 10, 1);
+	// add remove_punctuation_from_start boolean
+	obs_properties_add_bool(translation_group, "translation_remove_punctuation_from_start",
+				MT_("translation_remove_punctuation_from_start"));
 }
 
 void add_file_output_group_properties(obs_properties_t *ppts)
@@ -619,6 +622,7 @@ void transcription_filter_defaults(obs_data_t *s)
 	obs_data_set_default_int(s, "translation_max_decoding_length", 65);
 	obs_data_set_default_int(s, "translation_no_repeat_ngram_size", 1);
 	obs_data_set_default_int(s, "translation_max_input_length", 65);
+	obs_data_set_default_bool(s, "translation_remove_punctuation_from_start", false);
 
 	// Whisper parameters
 	obs_data_set_default_int(s, "whisper_sampling_method", WHISPER_SAMPLING_BEAM_SEARCH);
diff --git a/src/transcription-filter.cpp b/src/transcription-filter.cpp
index e23974a..93e59bd 100644
--- a/src/transcription-filter.cpp
+++ b/src/transcription-filter.cpp
@@ -104,7 +104,7 @@ struct obs_audio_data *transcription_filter_filter_audio(void *data, struct obs_
 			circlebuf_push_back(&gf->input_buffers[c], audio->data[c],
 					    audio->frames * sizeof(float));
 		}
-		obs_log(gf->log_level, "currently %lu bytes in the audio input buffer",
+		obs_log(LOG_DEBUG, "currently %zu bytes in the audio input buffer",
 			gf->input_buffers[0].size);
 		// push audio packet info (timestamp/frame count) to info circlebuf
 		struct transcription_filter_audio_info info = {0};
@@ -305,6 +305,9 @@ void transcription_filter_update(void *data, obs_data_t *s)
 	std::string new_translate_model_index = obs_data_get_string(s, "translate_model");
 	std::string new_translation_model_path_external =
 		obs_data_get_string(s, "translation_model_path_external");
+	gf->translation_ctx.remove_punctuation_from_start =
+		obs_data_get_bool(s, "translation_remove_punctuation_from_start");
+	gf->translation_ctx.log_level = gf->log_level;
 
 	if (new_translate) {
 		if (new_translate != gf->translate ||
diff --git a/src/translation/translation.cpp b/src/translation/translation.cpp
index 2c7f0bb..c495d0e 100644
--- a/src/translation/translation.cpp
+++ b/src/translation/translation.cpp
@@ -117,6 +117,9 @@ int translate(struct translation_context &translation_ctx, const std::string &te
 			std::vector<std::string> input_tokens = {source_lang, "<s>"};
 			if (translation_ctx.add_context > 0 &&
 			    translation_ctx.last_input_tokens.size() > 0) {
+				obs_log(translation_ctx.log_level,
+					"Adding last input tokens to input tokens, size: %d",
+					(int)translation_ctx.last_input_tokens.size());
 				// add the last input tokens sentences to the input tokens
 				for (const auto &tokens : translation_ctx.last_input_tokens) {
 					input_tokens.insert(input_tokens.end(), tokens.begin(),
@@ -133,13 +136,24 @@ int translate(struct translation_context &translation_ctx, const std::string &te
 			for (const auto &token : input_tokens) {
 				input_tokens_str += token + ", ";
 			}
-			obs_log(LOG_INFO, "Input tokens: %s", input_tokens_str.c_str());
-
-			translation_ctx.last_input_tokens.push_back(new_input_tokens);
-			// remove the oldest input tokens
-			while (translation_ctx.last_input_tokens.size() >
-			       (size_t)translation_ctx.add_context) {
-				translation_ctx.last_input_tokens.pop_front();
+			obs_log(translation_ctx.log_level, "Input tokens: %s",
+				input_tokens_str.c_str());
+
+			if (translation_ctx.add_context > 0) {
+				translation_ctx.last_input_tokens.push_back(new_input_tokens);
+				obs_log(translation_ctx.log_level,
+					"Adding last input context. Last input tokens deque size: %d",
+					(int)translation_ctx.last_input_tokens.size());
+				// remove the oldest input tokens
+				while (translation_ctx.last_input_tokens.size() >
+				       (size_t)translation_ctx.add_context) {
+					obs_log(translation_ctx.log_level,
+						"Removing oldest input tokens context, size: %d",
+						(int)translation_ctx.last_input_tokens.size());
+					translation_ctx.last_input_tokens.pop_front();
+				}
+			} else {
+				translation_ctx.last_input_tokens.clear();
 			}
 
 			const std::vector<std::vector<std::string>> batch = {input_tokens};
@@ -149,6 +163,9 @@ int translate(struct translation_context &translation_ctx, const std::string &te
 			// add the last translation tokens to the target prefix
 			if (translation_ctx.add_context > 0 &&
 			    translation_ctx.last_translation_tokens.size() > 0) {
+				obs_log(translation_ctx.log_level,
+					"Adding last translation tokens to target prefix, size: %d",
+					(int)translation_ctx.last_translation_tokens.size());
 				for (const auto &tokens : translation_ctx.last_translation_tokens) {
 					target_prefix.insert(target_prefix.end(), tokens.begin(),
 							     tokens.end());
@@ -160,7 +177,8 @@ int translate(struct translation_context &translation_ctx, const std::string &te
 			for (const auto &token : target_prefix) {
 				target_prefix_str += token + ",";
 			}
-			obs_log(LOG_INFO, "Target prefix: %s", target_prefix_str.c_str());
+			obs_log(translation_ctx.log_level, "Target prefix: %s",
+				target_prefix_str.c_str());
 
 			const std::vector<std::vector<std::string>> target_prefix_batch = {
 				target_prefix};
@@ -189,22 +207,33 @@ int translate(struct translation_context &translation_ctx, const std::string &te
 		for (const auto &token : translation_tokens) {
 			translation_tokens_str += token + ", ";
 		}
-		obs_log(LOG_INFO, "Translation tokens: %s", translation_tokens_str.c_str());
-
-		// save the translation tokens
-		translation_ctx.last_translation_tokens.push_back(translation_tokens);
-		// remove the oldest translation tokens
-		while (translation_ctx.last_translation_tokens.size() >
-		       (size_t)translation_ctx.add_context) {
-			translation_ctx.last_translation_tokens.pop_front();
+		obs_log(translation_ctx.log_level, "Translation tokens: %s",
+			translation_tokens_str.c_str());
+
+		if (translation_ctx.add_context > 0) {
+			// save the translation tokens
+			translation_ctx.last_translation_tokens.push_back(translation_tokens);
+			// remove the oldest translation tokens
+			while (translation_ctx.last_translation_tokens.size() >
+			       (size_t)translation_ctx.add_context) {
+				obs_log(translation_ctx.log_level,
+					"Removing oldest translation tokens context, size: %d",
+					(int)translation_ctx.last_translation_tokens.size());
+				translation_ctx.last_translation_tokens.pop_front();
+			}
+			obs_log(translation_ctx.log_level, "Last translation tokens deque size: %d",
+				(int)translation_ctx.last_translation_tokens.size());
+		} else {
+			translation_ctx.last_translation_tokens.clear();
 		}
-		obs_log(LOG_INFO, "Last translation tokens deque size: %d",
-			(int)translation_ctx.last_translation_tokens.size());
 
 		// detokenize
 		const std::string result_ = translation_ctx.detokenizer(translation_tokens);
-		// result = remove_start_punctuation(result_);
-		result = result_;
+		if (translation_ctx.remove_punctuation_from_start) {
+			result = remove_start_punctuation(result_);
+		} else {
+			result = result_;
+		}
 	} catch (std::exception &e) {
 		obs_log(LOG_ERROR, "Error: %s", e.what());
 		return OBS_POLYGLOT_TRANSLATION_FAIL;
diff --git a/src/translation/translation.h b/src/translation/translation.h
index c740726..adbf4b2 100644
--- a/src/translation/translation.h
+++ b/src/translation/translation.h
@@ -31,6 +31,8 @@ struct translation_context {
 	// How many sentences to use as context for the next translation
 	int add_context;
 	InputTokenizationStyle input_tokenization_style;
+	bool remove_punctuation_from_start = false;
+	int log_level = 400; // presumably libobs LOG_DEBUG (400) - TODO confirm against util/base.h
 };
 
 int build_translation_context(struct translation_context &translation_ctx);
diff --git a/src/whisper-utils/resample-utils.cpp b/src/whisper-utils/resample-utils.cpp
index 7533b61..2de5edc 100644
--- a/src/whisper-utils/resample-utils.cpp
+++ b/src/whisper-utils/resample-utils.cpp
@@ -16,8 +16,7 @@ int get_data_from_buf_and_resample(transcription_filter_data *gf,
 			return 1;
 		}
 
-		obs_log(gf->log_level,
-			"segmentation: currently %lu bytes in the audio input buffer",
+		obs_log(LOG_DEBUG, "segmentation: currently %zu bytes in the audio input buffer",
 			gf->input_buffers[0].size);
 
 		// max number of frames is 10 seconds worth of audio
@@ -68,7 +67,7 @@ int get_data_from_buf_and_resample(transcription_filter_data *gf,
 		}
 	}
 
-	obs_log(gf->log_level, "found %d frames from info buffer.", num_frames_from_infos);
+	obs_log(LOG_DEBUG, "found %d frames from info buffer.", num_frames_from_infos);
 	gf->last_num_frames = num_frames_from_infos;
 
 	{
@@ -87,7 +86,7 @@ int get_data_from_buf_and_resample(transcription_filter_data *gf,
 
 		circlebuf_push_back(&gf->resampled_buffer, resampled_16khz[0],
 				    resampled_16khz_frames * sizeof(float));
-		obs_log(gf->log_level,
+		obs_log(LOG_DEBUG,
 			"resampled: %d channels, %d frames, %f ms, current size: %lu bytes",
 			(int)gf->channels, (int)resampled_16khz_frames,
 			(float)resampled_16khz_frames / WHISPER_SAMPLE_RATE * 1000.0f,

From a6b670e27b462ac8c8be299df2201a5c5abb705e Mon Sep 17 00:00:00 2001
From: Roy Shilkrot <roy.shil@gmail.com>
Date: Mon, 14 Oct 2024 11:16:00 -0400
Subject: [PATCH 19/20] Refactor en-US.ini to add
 translation_remove_punctuation_from_start option

---
 data/locale/en-US.ini | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/data/locale/en-US.ini b/data/locale/en-US.ini
index 97d0835..338f880 100644
--- a/data/locale/en-US.ini
+++ b/data/locale/en-US.ini
@@ -89,4 +89,5 @@ translate_only_full_sentences="Translate only full sentences"
 duration_filter_threshold="Duration filter"
 segment_duration="Segment duration"
 stenographer_parameters="Stenographer Options"
-stenographer_delay="Audio Delay"
\ No newline at end of file
+stenographer_delay="Audio Delay"
+translation_remove_punctuation_from_start="Remove punctuation from sentence start"

From 704ca5d06e82292f130552de6fb5d4bb6ba56f99 Mon Sep 17 00:00:00 2001
From: Roy Shilkrot <roy.shil@gmail.com>
Date: Mon, 14 Oct 2024 12:46:23 -0400
Subject: [PATCH 20/20] Refactor stenographer_interface.html and
 transcription-filter.cpp

---
 src/stenographer/stenographer_interface.html | 7 +++++--
 src/transcription-filter.cpp                 | 4 ++--
 2 files changed, 7 insertions(+), 4 deletions(-)

diff --git a/src/stenographer/stenographer_interface.html b/src/stenographer/stenographer_interface.html
index c6ca386..c9f99b0 100644
--- a/src/stenographer/stenographer_interface.html
+++ b/src/stenographer/stenographer_interface.html
@@ -136,16 +136,19 @@ <h1>Stenographer Interface</h1>
         }
 
         captionInput.addEventListener('input', () => {
-            sendCaptionUpdate('partial');
+            if (captionInput.value.trim() !== '') {
+                sendCaptionUpdate('partial');
+            }
         });
 
         captionInput.addEventListener('keydown', (event) => {
-            if (event.key === 'Enter') {
+            // Run synchronously: a debounced handler calls preventDefault() too late and can drop Enter.
+            if (event.key === 'Enter' && captionInput.value.trim() !== '') {
                 event.preventDefault();
                 sendCaptionUpdate('sentence');
                 captionInput.value = '';
             }
         });
 
         function sendCaptionUpdate(type) {
             if (ws.readyState === WebSocket.OPEN) {
diff --git a/src/transcription-filter.cpp b/src/transcription-filter.cpp
index 93e59bd..88cbe28 100644
--- a/src/transcription-filter.cpp
+++ b/src/transcription-filter.cpp
@@ -443,7 +443,7 @@ void transcription_filter_update(void *data, obs_data_t *s)
 			}
 		}
 	} else {
-		obs_log(LOG_INFO, "Filter not enabled, not updating whisper model.");
+		obs_log(LOG_INFO, "Transcription not enabled, not updating whisper model.");
 	}
 
 	if (new_stenographer_enabled != gf->stenographer_enabled) {
@@ -467,7 +467,7 @@ void transcription_filter_update(void *data, obs_data_t *s)
 				});
 			gf->transcription_handler->start();
 		} else {
-			obs_log(gf->log_level, "Stenographer disabled");
+			obs_log(gf->log_level, "Stenographer disabled, restarting whisper");
 			if (gf->transcription_handler) {
 				gf->transcription_handler->stop();
 				delete gf->transcription_handler;