From a9120a6fceed13de7577b2dce76964bb696bb29c Mon Sep 17 00:00:00 2001
From: Ruwen Hahn <haruwenz@twitch.tv>
Date: Tue, 30 Jul 2024 18:16:48 +0200
Subject: [PATCH 1/9] look at the front of the whisper buffer instead of the
 back

This should mostly not make a difference, but it feels semantically
more correct.
---
 src/whisper-utils/whisper-processing.cpp | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/src/whisper-utils/whisper-processing.cpp b/src/whisper-utils/whisper-processing.cpp
index 7239b04..347d821 100644
--- a/src/whisper-utils/whisper-processing.cpp
+++ b/src/whisper-utils/whisper-processing.cpp
@@ -305,11 +305,11 @@ void run_inference_and_callbacks(transcription_filter_data *gf, uint64_t start_o
 	float *pcm32f_data = (float *)bzalloc(pcm32f_size_with_silence * sizeof(float));
 	if (vad_state == VAD_STATE_PARTIAL) {
 		// peek instead of pop, since this is a partial run that keeps the data in the buffer
-		circlebuf_peek_back(&gf->whisper_buffer, pcm32f_data + WHISPER_SAMPLE_RATE / 100,
-				    pcm32f_size * sizeof(float));
+		circlebuf_peek_front(&gf->whisper_buffer, pcm32f_data + WHISPER_SAMPLE_RATE / 100,
+				     pcm32f_size * sizeof(float));
 	} else {
-		circlebuf_pop_back(&gf->whisper_buffer, pcm32f_data + WHISPER_SAMPLE_RATE / 100,
-				   pcm32f_size * sizeof(float));
+		circlebuf_pop_front(&gf->whisper_buffer, pcm32f_data + WHISPER_SAMPLE_RATE / 100,
+				    pcm32f_size * sizeof(float));
 	}
 
 	struct DetectionResultWithText inference_result =

From 12db51e052570b1b22aa67f2c8af2c47def408b5 Mon Sep 17 00:00:00 2001
From: Ruwen Hahn <haruwenz@twitch.tv>
Date: Tue, 6 Aug 2024 15:16:01 +0200
Subject: [PATCH 2/9] Initialize `resampled_buffer` for offline tests

---
 src/tests/localvocal-offline-test.cpp | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/src/tests/localvocal-offline-test.cpp b/src/tests/localvocal-offline-test.cpp
index 444ddce..89b28ea 100644
--- a/src/tests/localvocal-offline-test.cpp
+++ b/src/tests/localvocal-offline-test.cpp
@@ -101,6 +101,7 @@ create_context(int sample_rate, int channels, const std::string &whisper_model_p
 	}
 	circlebuf_init(&gf->info_buffer);
 	circlebuf_init(&gf->whisper_buffer);
+	circlebuf_init(&gf->resampled_buffer);
 
 	// allocate copy buffers
 	gf->copy_buffers[0] =
@@ -307,6 +308,7 @@ void release_context(transcription_filter_data *gf)
 	}
 	circlebuf_free(&gf->info_buffer);
 	circlebuf_free(&gf->whisper_buffer);
+	circlebuf_free(&gf->resampled_buffer);
 
 	delete gf;
 }

From 9ad20fc07726fb170e4582e2778a85c966900eb6 Mon Sep 17 00:00:00 2001
From: Ruwen Hahn <haruwenz@twitch.tv>
Date: Tue, 6 Aug 2024 16:25:39 +0200
Subject: [PATCH 3/9] Read relevant audio bytes

There are two issues with using `linesize[0]` as the byte count:
1. `linesize` may contain padding (didn't happen in my tests)
2. from: https://git.ffmpeg.org/gitweb/ffmpeg.git/blob/2b5f000d3f6f9e737e918a5438e6c881f65e70e2:/libavutil/frame.h#l405
> For audio, only linesize[0] may be set. For planar audio, each
> channel plane must be the same size.
---
 src/tests/audio-file-utils.cpp | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/tests/audio-file-utils.cpp b/src/tests/audio-file-utils.cpp
index 93247be..57322e8 100644
--- a/src/tests/audio-file-utils.cpp
+++ b/src/tests/audio-file-utils.cpp
@@ -108,7 +108,8 @@ read_audio_file(const char *filename, std::function<void(int, int)> initializati
 					for (int j = 0; j < codecContext->channels; j++) {
 						buffer[j].insert(buffer[j].end(), frame->data[j],
 								 frame->data[j] +
-									 frame->linesize[0]);
+									 frame->nb_samples *
+										 sizeof(float));
 					}
 				}
 			}

From befdd6503939c6607431d716fbb38ae13f6011fb Mon Sep 17 00:00:00 2001
From: Ruwen Hahn <haruwenz@twitch.tv>
Date: Tue, 6 Aug 2024 16:27:22 +0200
Subject: [PATCH 4/9] log running time in addition to local time

---
 src/tests/localvocal-offline-test.cpp | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/src/tests/localvocal-offline-test.cpp b/src/tests/localvocal-offline-test.cpp
index 89b28ea..2f1fbdb 100644
--- a/src/tests/localvocal-offline-test.cpp
+++ b/src/tests/localvocal-offline-test.cpp
@@ -31,6 +31,7 @@
 
 void obs_log(int log_level, const char *format, ...)
 {
+	static auto start = std::chrono::system_clock::now();
 	if (log_level == LOG_DEBUG) {
 		return;
 	}
@@ -43,9 +44,14 @@ void obs_log(int log_level, const char *format, ...)
 	std::time_t now_time_t = std::chrono::system_clock::to_time_t(now);
 	std::tm now_tm = *std::localtime(&now_time_t);
 
+	auto diff = now - start;
+
 	// print timestamp
-	printf("[%02d:%02d:%02d.%03d] ", now_tm.tm_hour, now_tm.tm_min, now_tm.tm_sec,
-	       (int)(epoch.count() % 1000));
+	printf("[%02d:%02d:%02d.%03d] [%02d:%02lld.%03lld] ", now_tm.tm_hour, now_tm.tm_min,
+	       now_tm.tm_sec, (int)(epoch.count() % 1000),
+	       std::chrono::duration_cast<std::chrono::minutes>(diff).count(),
+	       std::chrono::duration_cast<std::chrono::seconds>(diff).count() % 60,
+	       std::chrono::duration_cast<std::chrono::milliseconds>(diff).count() % 1000);
 
 	// print log level
 	switch (log_level) {

From 407ac470d7cbdb1e44d5f5a0daaee43a56a55a9f Mon Sep 17 00:00:00 2001
From: Ruwen Hahn <haruwenz@twitch.tv>
Date: Tue, 6 Aug 2024 16:29:43 +0200
Subject: [PATCH 5/9] Run whisper test "as fast as possible"

This roughly mimics libobs: each chunk of audio is inspected
individually by VAD/whisper until processing by either takes longer
than the window length, in which case audio continues to stream in.
---
 src/tests/localvocal-offline-test.cpp | 76 ++++++++++++++++++++-------
 1 file changed, 56 insertions(+), 20 deletions(-)

diff --git a/src/tests/localvocal-offline-test.cpp b/src/tests/localvocal-offline-test.cpp
index 2f1fbdb..5887b72 100644
--- a/src/tests/localvocal-offline-test.cpp
+++ b/src/tests/localvocal-offline-test.cpp
@@ -428,19 +428,23 @@ int wmain(int argc, wchar_t *argv[])
 		std::remove("segments.json");
 	}
 
+	const auto window_size_in_ms = std::chrono::milliseconds(25);
+
 	// fill up the whisper buffer
 	{
 		gf->start_timestamp_ms = now_ms();
 
 		obs_log(LOG_INFO, "Sending samples to whisper buffer");
 		// 25 ms worth of frames
-		int frames = gf->sample_rate * 25 / 1000;
+		int frames = gf->sample_rate * window_size_in_ms.count() / 1000;
 		const int frame_size_bytes = sizeof(float);
 		int frames_size_bytes = frames * frame_size_bytes;
 		int frames_count = 0;
 		int64_t start_time = std::chrono::duration_cast<std::chrono::nanoseconds>(
 					     std::chrono::system_clock::now().time_since_epoch())
 					     .count();
+		auto start_time_time = std::chrono::system_clock::now();
+		uint64_t window_number = 0;
 		while (true) {
 			// check if there are enough frames left in the audio buffer
 			if ((frames_count + frames) > (audio[0].size() / frame_size_bytes)) {
@@ -449,31 +453,63 @@ int wmain(int argc, wchar_t *argv[])
 				frames_size_bytes = frames * frame_size_bytes;
 			}
 			{
-				std::lock_guard<std::mutex> lock(gf->whisper_buf_mutex);
-
-				// push back current audio data to input circlebuf
-				for (size_t c = 0; c < gf->channels; c++) {
-					circlebuf_push_back(&gf->input_buffers[c],
-							    audio[c].data() +
-								    frames_count * frame_size_bytes,
-							    frames_size_bytes);
+				bool wait = false;
+				auto max_wait =
+					start_time_time + (window_number * window_size_in_ms);
+				for (;;) {
+					{
+						std::lock_guard<std::mutex> lock(
+							gf->whisper_buf_mutex);
+						wait = gf->input_buffers->size != 0;
+					}
+					if (!wait)
+						break;
+
+					// sleep up to window size in case whisper is processing, so the buffer builds up similar to OBS
+					auto now = std::chrono::system_clock::now();
+					if (now > max_wait)
+						break;
+
+					auto wait_start = now + std::chrono::milliseconds(1);
+					auto wait_until = max_wait > wait_start ? wait_start
+										: max_wait;
+#if 0
+					obs_log(LOG_INFO, "sleeping %lld ms",
+						std::chrono::duration_cast<std::chrono::milliseconds>(
+							(wait_until -
+							 std::chrono::system_clock::now()))
+							.count());
+#endif
+					std::this_thread::sleep_until(wait_until);
+				}
+
+				{
+					std::lock_guard<std::mutex> lock(gf->whisper_buf_mutex);
+					// push back current audio data to input circlebuf
+					for (size_t c = 0; c < gf->channels; c++) {
+						circlebuf_push_back(
+							&gf->input_buffers[c],
+							audio[c].data() +
+								frames_count * frame_size_bytes,
+							frames_size_bytes);
+					}
+					// push audio packet info (timestamp/frame count) to info circlebuf
+					struct transcription_filter_audio_info info = {0};
+					info.frames = frames; // number of frames in this packet
+					// make a timestamp from the current position in the audio buffer
+					info.timestamp_offset_ns =
+						start_time + (int64_t)(((float)frames_count /
+									(float)gf->sample_rate) *
+								       1e9);
+					circlebuf_push_back(&gf->info_buffer, &info, sizeof(info));
 				}
-				// push audio packet info (timestamp/frame count) to info circlebuf
-				struct transcription_filter_audio_info info = {0};
-				info.frames = frames; // number of frames in this packet
-				// make a timestamp from the current position in the audio buffer
-				info.timestamp_offset_ns =
-					start_time +
-					(int64_t)(((float)frames_count / (float)gf->sample_rate) *
-						  1e9);
-				circlebuf_push_back(&gf->info_buffer, &info, sizeof(info));
+				gf->wshiper_thread_cv.notify_one();
 			}
 			frames_count += frames;
+			window_number += 1;
 			if (frames_count >= audio[0].size() / frame_size_bytes) {
 				break;
 			}
-			// sleep for 25 ms
-			std::this_thread::sleep_for(std::chrono::milliseconds(25));
 		}
 		// push a second of silence to the input circlebuf
 		frames = 2 * gf->sample_rate;

From 2c42001c57416c7599e2924f2243106d92b2ee45 Mon Sep 17 00:00:00 2001
From: Ruwen Hahn <haruwenz@twitch.tv>
Date: Tue, 6 Aug 2024 16:42:42 +0200
Subject: [PATCH 6/9] Only ever send a single chunk of audio

---
 src/tests/localvocal-offline-test.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/tests/localvocal-offline-test.cpp b/src/tests/localvocal-offline-test.cpp
index 5887b72..ad14ef5 100644
--- a/src/tests/localvocal-offline-test.cpp
+++ b/src/tests/localvocal-offline-test.cpp
@@ -467,7 +467,7 @@ int wmain(int argc, wchar_t *argv[])
 
 					// sleep up to window size in case whisper is processing, so the buffer builds up similar to OBS
 					auto now = std::chrono::system_clock::now();
-					if (now > max_wait)
+					if (false && now > max_wait)
 						break;
 
 					auto wait_start = now + std::chrono::milliseconds(1);

From e9581f30793cfd3790bf5e46594e7acee340750c Mon Sep 17 00:00:00 2001
From: Ruwen Hahn <haruwenz@twitch.tv>
Date: Tue, 6 Aug 2024 16:43:58 +0200
Subject: [PATCH 7/9] Add additional files to tests copy command

---
 src/tests/copy_dlls.ps1 | 28 +++++++++++++++-------------
 1 file changed, 15 insertions(+), 13 deletions(-)

diff --git a/src/tests/copy_dlls.ps1 b/src/tests/copy_dlls.ps1
index c797971..003472f 100644
--- a/src/tests/copy_dlls.ps1
+++ b/src/tests/copy_dlls.ps1
@@ -20,21 +20,23 @@ $obsDlls = @(
     ".\release\Release\obs-plugins\64bit\onnxruntime_providers_shared.dll",
     ".\release\Release\obs-plugins\64bit\onnxruntime.dll",
     ".\release\Release\obs-plugins\64bit\whisper.dll",
-    ".deps\obs-deps-2023-11-03-x64\bin\avcodec-60.dll",
-    ".deps\obs-deps-2023-11-03-x64\bin\avdevice-60.dll",
-    ".deps\obs-deps-2023-11-03-x64\bin\avfilter-9.dll",
-    ".deps\obs-deps-2023-11-03-x64\bin\avformat-60.dll",
-    ".deps\obs-deps-2023-11-03-x64\bin\avutil-58.dll",
-    ".deps\obs-deps-2023-11-03-x64\bin\libx264-164.dll",
-    ".deps\obs-deps-2023-11-03-x64\bin\swresample-4.dll",
-    ".deps\obs-deps-2023-11-03-x64\bin\swscale-7.dll",
-    ".deps\obs-deps-2023-11-03-x64\bin\zlib.dll"
-    ".deps\obs-studio-30.0.2\build_x64\rundir\Debug\bin\64bit\obs-frontend-api.dll",
-    ".deps\obs-studio-30.0.2\build_x64\rundir\Debug\bin\64bit\obs.dll",
-    ".deps\obs-studio-30.0.2\build_x64\rundir\Debug\bin\64bit\w32-pthreads.dll"
+    ".\release\Release\obs-plugins\64bit\ggml.dll",
+    ".deps\obs-deps-2024-03-19-x64\bin\avcodec-60.dll",
+    ".deps\obs-deps-2024-03-19-x64\bin\avdevice-60.dll",
+    ".deps\obs-deps-2024-03-19-x64\bin\avfilter-9.dll",
+    ".deps\obs-deps-2024-03-19-x64\bin\avformat-60.dll",
+    ".deps\obs-deps-2024-03-19-x64\bin\avutil-58.dll",
+    ".deps\obs-deps-2024-03-19-x64\bin\libx264-164.dll",
+    ".deps\obs-deps-2024-03-19-x64\bin\swresample-4.dll",
+    ".deps\obs-deps-2024-03-19-x64\bin\swscale-7.dll",
+    ".deps\obs-deps-2024-03-19-x64\bin\zlib.dll"
+    ".deps\obs-deps-2024-03-19-x64\bin\librist.dll"
+    ".deps\obs-deps-2024-03-19-x64\bin\srt.dll"
+    ".deps\obs-studio-30.1.2\build_x64\rundir\Debug\bin\64bit\obs-frontend-api.dll",
+    ".deps\obs-studio-30.1.2\build_x64\rundir\Debug\bin\64bit\obs.dll",
+    ".deps\obs-studio-30.1.2\build_x64\rundir\Debug\bin\64bit\w32-pthreads.dll"
 )
 
 $obsDlls | ForEach-Object {
     Copy-Item -Force -Path $_ -Destination $testToolPath
 }
-

From b5f994fc2b8d7ee9da4fe2a75673fb6071d94254 Mon Sep 17 00:00:00 2001
From: Ruwen Hahn <haruwenz@twitch.tv>
Date: Thu, 8 Aug 2024 13:59:57 +0200
Subject: [PATCH 8/9] Use condition variable to signal input thread if
 available

---
 src/tests/localvocal-offline-test.cpp    | 48 +++++++++---------------
 src/transcription-filter-data.h          |  1 +
 src/whisper-utils/whisper-processing.cpp |  3 ++
 3 files changed, 21 insertions(+), 31 deletions(-)

diff --git a/src/tests/localvocal-offline-test.cpp b/src/tests/localvocal-offline-test.cpp
index ad14ef5..b56a0cd 100644
--- a/src/tests/localvocal-offline-test.cpp
+++ b/src/tests/localvocal-offline-test.cpp
@@ -101,6 +101,7 @@ create_context(int sample_rate, int channels, const std::string &whisper_model_p
 	gf->process_while_muted = false;
 	gf->buffered_output = false;
 	gf->fix_utf8 = true;
+	gf->input_cv.emplace();
 
 	for (size_t i = 0; i < gf->channels; i++) {
 		circlebuf_init(&gf->input_buffers[i]);
@@ -453,38 +454,23 @@ int wmain(int argc, wchar_t *argv[])
 				frames_size_bytes = frames * frame_size_bytes;
 			}
 			{
-				bool wait = false;
-				auto max_wait =
-					start_time_time + (window_number * window_size_in_ms);
-				for (;;) {
-					{
-						std::lock_guard<std::mutex> lock(
-							gf->whisper_buf_mutex);
-						wait = gf->input_buffers->size != 0;
-					}
-					if (!wait)
-						break;
-
-					// sleep up to window size in case whisper is processing, so the buffer builds up similar to OBS
-					auto now = std::chrono::system_clock::now();
-					if (false && now > max_wait)
-						break;
-
-					auto wait_start = now + std::chrono::milliseconds(1);
-					auto wait_until = max_wait > wait_start ? wait_start
-										: max_wait;
-#if 0
-					obs_log(LOG_INFO, "sleeping %lld ms",
-						std::chrono::duration_cast<std::chrono::milliseconds>(
-							(wait_until -
-							 std::chrono::system_clock::now()))
-							.count());
-#endif
-					std::this_thread::sleep_until(wait_until);
-				}
-
 				{
-					std::lock_guard<std::mutex> lock(gf->whisper_buf_mutex);
+					auto max_wait = start_time_time +
+							(window_number * window_size_in_ms);
+					std::unique_lock<std::mutex> lock(gf->whisper_buf_mutex);
+					for (;;) {
+						// sleep up to window size in case whisper is processing, so the buffer builds up similar to OBS
+						auto now = std::chrono::system_clock::now();
+						if (false && now > max_wait)
+							break;
+
+						gf->input_cv->wait_for(
+							lock, std::chrono::milliseconds(10), [&] {
+								return gf->input_buffers->size == 0;
+							});
+						if (gf->input_buffers->size == 0)
+							break;
+					}
 					// push back current audio data to input circlebuf
 					for (size_t c = 0; c < gf->channels; c++) {
 						circlebuf_push_back(
diff --git a/src/transcription-filter-data.h b/src/transcription-filter-data.h
index 8a640f3..4b16d13 100644
--- a/src/transcription-filter-data.h
+++ b/src/transcription-filter-data.h
@@ -104,6 +104,7 @@ struct transcription_filter_data {
 	std::mutex whisper_buf_mutex;
 	std::mutex whisper_ctx_mutex;
 	std::condition_variable wshiper_thread_cv;
+	std::optional<std::condition_variable> input_cv;
 
 	// translation context
 	struct translation_context translation_ctx;
diff --git a/src/whisper-utils/whisper-processing.cpp b/src/whisper-utils/whisper-processing.cpp
index 347d821..e0f190c 100644
--- a/src/whisper-utils/whisper-processing.cpp
+++ b/src/whisper-utils/whisper-processing.cpp
@@ -599,6 +599,9 @@ void whisper_loop(void *data)
 			}
 		}
 
+		if (gf->input_cv.has_value())
+			gf->input_cv->notify_one();
+
 		// Sleep using the condition variable wshiper_thread_cv
 		// This will wake up the thread if there is new data in the input buffer
 		// or if the whisper context is null

From bb73bcf97e1ea19255c4f0e6b4d656ce21b99215 Mon Sep 17 00:00:00 2001
From: Ruwen Hahn <haruwenz@twitch.tv>
Date: Thu, 8 Aug 2024 14:00:36 +0200
Subject: [PATCH 9/9] Only wait in whisper thread if input buffers are empty

---
 src/whisper-utils/whisper-processing.cpp | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/src/whisper-utils/whisper-processing.cpp b/src/whisper-utils/whisper-processing.cpp
index e0f190c..6d2d76e 100644
--- a/src/whisper-utils/whisper-processing.cpp
+++ b/src/whisper-utils/whisper-processing.cpp
@@ -606,7 +606,9 @@ void whisper_loop(void *data)
 		// This will wake up the thread if there is new data in the input buffer
 		// or if the whisper context is null
 		std::unique_lock<std::mutex> lock(gf->whisper_ctx_mutex);
-		gf->wshiper_thread_cv.wait_for(lock, std::chrono::milliseconds(50));
+		if (gf->input_buffers->size == 0) {
+			gf->wshiper_thread_cv.wait_for(lock, std::chrono::milliseconds(50));
+		}
 	}
 
 	obs_log(gf->log_level, "Exiting whisper thread");