From a9120a6fceed13de7577b2dce76964bb696bb29c Mon Sep 17 00:00:00 2001 From: Ruwen Hahn Date: Tue, 30 Jul 2024 18:16:48 +0200 Subject: [PATCH 1/9] look at the front of the whisper buffer instead of the back this should mostly not make a difference, but feels semantically more correct --- src/whisper-utils/whisper-processing.cpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/whisper-utils/whisper-processing.cpp b/src/whisper-utils/whisper-processing.cpp index 7239b04..347d821 100644 --- a/src/whisper-utils/whisper-processing.cpp +++ b/src/whisper-utils/whisper-processing.cpp @@ -305,11 +305,11 @@ void run_inference_and_callbacks(transcription_filter_data *gf, uint64_t start_o float *pcm32f_data = (float *)bzalloc(pcm32f_size_with_silence * sizeof(float)); if (vad_state == VAD_STATE_PARTIAL) { // peek instead of pop, since this is a partial run that keeps the data in the buffer - circlebuf_peek_back(&gf->whisper_buffer, pcm32f_data + WHISPER_SAMPLE_RATE / 100, - pcm32f_size * sizeof(float)); + circlebuf_peek_front(&gf->whisper_buffer, pcm32f_data + WHISPER_SAMPLE_RATE / 100, + pcm32f_size * sizeof(float)); } else { - circlebuf_pop_back(&gf->whisper_buffer, pcm32f_data + WHISPER_SAMPLE_RATE / 100, - pcm32f_size * sizeof(float)); + circlebuf_pop_front(&gf->whisper_buffer, pcm32f_data + WHISPER_SAMPLE_RATE / 100, + pcm32f_size * sizeof(float)); } struct DetectionResultWithText inference_result = From 12db51e052570b1b22aa67f2c8af2c47def408b5 Mon Sep 17 00:00:00 2001 From: Ruwen Hahn Date: Tue, 6 Aug 2024 15:16:01 +0200 Subject: [PATCH 2/9] Initialize `resampled_buffer` for offline tests --- src/tests/localvocal-offline-test.cpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/tests/localvocal-offline-test.cpp b/src/tests/localvocal-offline-test.cpp index 444ddce..89b28ea 100644 --- a/src/tests/localvocal-offline-test.cpp +++ b/src/tests/localvocal-offline-test.cpp @@ -101,6 +101,7 @@ create_context(int sample_rate, int channels, const std::string &whisper_model_p } circlebuf_init(&gf->info_buffer); circlebuf_init(&gf->whisper_buffer); + circlebuf_init(&gf->resampled_buffer); // allocate copy buffers gf->copy_buffers[0] = @@ -307,6 +308,7 @@ void release_context(transcription_filter_data *gf) } circlebuf_free(&gf->info_buffer); circlebuf_free(&gf->whisper_buffer); + circlebuf_free(&gf->resampled_buffer); delete gf; } From 9ad20fc07726fb170e4582e2778a85c966900eb6 Mon Sep 17 00:00:00 2001 From: Ruwen Hahn Date: Tue, 6 Aug 2024 16:25:39 +0200 Subject: [PATCH 3/9] Read relevant audio bytes There are two issues here: 1. `line_size` may contain padding (didn't happen in my tests) 2. from: https://git.ffmpeg.org/gitweb/ffmpeg.git/blob/2b5f000d3f6f9e737e918a5438e6c881f65e70e2:/libavutil/frame.h#l405 > For audio, only linesize[0] may be set. For planar audio, each > channel plane must be the same size. --- src/tests/audio-file-utils.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/tests/audio-file-utils.cpp b/src/tests/audio-file-utils.cpp index 93247be..57322e8 100644 --- a/src/tests/audio-file-utils.cpp +++ b/src/tests/audio-file-utils.cpp @@ -108,7 +108,8 @@ read_audio_file(const char *filename, std::function initializati for (int j = 0; j < codecContext->channels; j++) { buffer[j].insert(buffer[j].end(), frame->data[j], frame->data[j] + - frame->linesize[0]); + frame->nb_samples * + sizeof(float)); } } } From befdd6503939c6607431d716fbb38ae13f6011fb Mon Sep 17 00:00:00 2001 From: Ruwen Hahn Date: Tue, 6 Aug 2024 16:27:22 +0200 Subject: [PATCH 4/9] log running time in addition to local time --- src/tests/localvocal-offline-test.cpp | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/src/tests/localvocal-offline-test.cpp b/src/tests/localvocal-offline-test.cpp index 89b28ea..2f1fbdb 100644 --- a/src/tests/localvocal-offline-test.cpp +++ b/src/tests/localvocal-offline-test.cpp @@ -31,6 +31,7 @@ void obs_log(int log_level, const char *format, ...) { + static auto start = std::chrono::system_clock::now(); if (log_level == LOG_DEBUG) { return; } @@ -43,9 +44,14 @@ void obs_log(int log_level, const char *format, ...) std::time_t now_time_t = std::chrono::system_clock::to_time_t(now); std::tm now_tm = *std::localtime(&now_time_t); + auto diff = now - start; + // print timestamp - printf("[%02d:%02d:%02d.%03d] ", now_tm.tm_hour, now_tm.tm_min, now_tm.tm_sec, - (int)(epoch.count() % 1000)); + printf("[%02d:%02d:%02d.%03d] [%02d:%02lld.%03lld] ", now_tm.tm_hour, now_tm.tm_min, + now_tm.tm_sec, (int)(epoch.count() % 1000), + std::chrono::duration_cast(diff).count(), + std::chrono::duration_cast(diff).count() % 60, + std::chrono::duration_cast(diff).count() % 1000); // print log level switch (log_level) { From 407ac470d7cbdb1e44d5f5a0daaee43a56a55a9f Mon Sep 17 00:00:00 2001 From: Ruwen Hahn Date: Tue, 6 Aug 2024 16:29:43 +0200 Subject: [PATCH 5/9] Run whisper test "as fast as possible" This kind of behaves like libobs, where each chunk of audio is inspected individually by VAD/whisper, until processing of either takes longer than the window length, in which case audio continues to stream in --- src/tests/localvocal-offline-test.cpp | 76 ++++++++++++++++++++------- 1 file changed, 56 insertions(+), 20 deletions(-) diff --git a/src/tests/localvocal-offline-test.cpp b/src/tests/localvocal-offline-test.cpp index 2f1fbdb..5887b72 100644 --- a/src/tests/localvocal-offline-test.cpp +++ b/src/tests/localvocal-offline-test.cpp @@ -428,19 +428,23 @@ int wmain(int argc, wchar_t *argv[]) std::remove("segments.json"); } + const auto window_size_in_ms = std::chrono::milliseconds(25); + // fill up the whisper buffer { gf->start_timestamp_ms = now_ms(); obs_log(LOG_INFO, "Sending samples to whisper buffer"); // 25 ms worth of frames - int frames = gf->sample_rate * 25 / 1000; + int frames = gf->sample_rate * window_size_in_ms.count() / 1000; const int frame_size_bytes = sizeof(float); int frames_size_bytes = frames * frame_size_bytes; int frames_count = 0; int64_t start_time = std::chrono::duration_cast( std::chrono::system_clock::now().time_since_epoch()) .count(); + auto start_time_time = std::chrono::system_clock::now(); + uint64_t window_number = 0; while (true) { // check if there are enough frames left in the audio buffer if ((frames_count + frames) > (audio[0].size() / frame_size_bytes)) { @@ -449,31 +453,63 @@ int wmain(int argc, wchar_t *argv[]) frames_size_bytes = frames * frame_size_bytes; } { - std::lock_guard lock(gf->whisper_buf_mutex); - - // push back current audio data to input circlebuf - for (size_t c = 0; c < gf->channels; c++) { - circlebuf_push_back(&gf->input_buffers[c], - audio[c].data() + - frames_count * frame_size_bytes, - frames_size_bytes); + bool wait = false; + auto max_wait = + start_time_time + (window_number * window_size_in_ms); + for (;;) { + { + std::lock_guard lock( + gf->whisper_buf_mutex); + wait = gf->input_buffers->size != 0; + } + if (!wait) + break; + + // sleep up to window size in case whisper is processing, so the buffer builds up similar to OBS + auto now = std::chrono::system_clock::now(); + if (now > max_wait) + break; + + auto wait_start = now + std::chrono::milliseconds(1); + auto wait_until = max_wait > wait_start ? wait_start + : max_wait; +#if 0 + obs_log(LOG_INFO, "sleeping %lld ms", + std::chrono::duration_cast( + (wait_until - + std::chrono::system_clock::now())) + .count()); +#endif + std::this_thread::sleep_until(wait_until); + } + + { + std::lock_guard lock(gf->whisper_buf_mutex); + // push back current audio data to input circlebuf + for (size_t c = 0; c < gf->channels; c++) { + circlebuf_push_back( + &gf->input_buffers[c], + audio[c].data() + + frames_count * frame_size_bytes, + frames_size_bytes); + } + // push audio packet info (timestamp/frame count) to info circlebuf + struct transcription_filter_audio_info info = {0}; + info.frames = frames; // number of frames in this packet + // make a timestamp from the current position in the audio buffer + info.timestamp_offset_ns = + start_time + (int64_t)(((float)frames_count / + (float)gf->sample_rate) * + 1e9); + circlebuf_push_back(&gf->info_buffer, &info, sizeof(info)); } - // push audio packet info (timestamp/frame count) to info circlebuf - struct transcription_filter_audio_info info = {0}; - info.frames = frames; // number of frames in this packet - // make a timestamp from the current position in the audio buffer - info.timestamp_offset_ns = - start_time + - (int64_t)(((float)frames_count / (float)gf->sample_rate) * - 1e9); - circlebuf_push_back(&gf->info_buffer, &info, sizeof(info)); + gf->wshiper_thread_cv.notify_one(); } frames_count += frames; + window_number += 1; if (frames_count >= audio[0].size() / frame_size_bytes) { break; } - // sleep for 25 ms - std::this_thread::sleep_for(std::chrono::milliseconds(25)); } // push a second of silence to the input circlebuf frames = 2 * gf->sample_rate; From 2c42001c57416c7599e2924f2243106d92b2ee45 Mon Sep 17 00:00:00 2001 From: Ruwen Hahn Date: Tue, 6 Aug 2024 16:42:42 +0200 Subject: [PATCH 6/9] Only ever send a single chunk of audio --- src/tests/localvocal-offline-test.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/tests/localvocal-offline-test.cpp b/src/tests/localvocal-offline-test.cpp index 5887b72..ad14ef5 100644 --- a/src/tests/localvocal-offline-test.cpp +++ b/src/tests/localvocal-offline-test.cpp @@ -467,7 +467,7 @@ int wmain(int argc, wchar_t *argv[]) // sleep up to window size in case whisper is processing, so the buffer builds up similar to OBS auto now = std::chrono::system_clock::now(); - if (now > max_wait) + if (false && now > max_wait) break; auto wait_start = now + std::chrono::milliseconds(1); From e9581f30793cfd3790bf5e46594e7acee340750c Mon Sep 17 00:00:00 2001 From: Ruwen Hahn Date: Tue, 6 Aug 2024 16:43:58 +0200 Subject: [PATCH 7/9] Add additional files to tests copy command --- src/tests/copy_dlls.ps1 | 28 +++++++++++++++------------- 1 file changed, 15 insertions(+), 13 deletions(-) diff --git a/src/tests/copy_dlls.ps1 b/src/tests/copy_dlls.ps1 index c797971..003472f 100644 --- a/src/tests/copy_dlls.ps1 +++ b/src/tests/copy_dlls.ps1 @@ -20,21 +20,23 @@ $obsDlls = @( ".\release\Release\obs-plugins\64bit\onnxruntime_providers_shared.dll", ".\release\Release\obs-plugins\64bit\onnxruntime.dll", ".\release\Release\obs-plugins\64bit\whisper.dll", - ".deps\obs-deps-2023-11-03-x64\bin\avcodec-60.dll", - ".deps\obs-deps-2023-11-03-x64\bin\avdevice-60.dll", - ".deps\obs-deps-2023-11-03-x64\bin\avfilter-9.dll", - ".deps\obs-deps-2023-11-03-x64\bin\avformat-60.dll", - ".deps\obs-deps-2023-11-03-x64\bin\avutil-58.dll", - ".deps\obs-deps-2023-11-03-x64\bin\libx264-164.dll", - ".deps\obs-deps-2023-11-03-x64\bin\swresample-4.dll", - ".deps\obs-deps-2023-11-03-x64\bin\swscale-7.dll", - ".deps\obs-deps-2023-11-03-x64\bin\zlib.dll" - ".deps\obs-studio-30.0.2\build_x64\rundir\Debug\bin\64bit\obs-frontend-api.dll", - ".deps\obs-studio-30.0.2\build_x64\rundir\Debug\bin\64bit\obs.dll", - ".deps\obs-studio-30.0.2\build_x64\rundir\Debug\bin\64bit\w32-pthreads.dll" + ".\release\Release\obs-plugins\64bit\ggml.dll", + ".deps\obs-deps-2024-03-19-x64\bin\avcodec-60.dll", + ".deps\obs-deps-2024-03-19-x64\bin\avdevice-60.dll", + ".deps\obs-deps-2024-03-19-x64\bin\avfilter-9.dll", + ".deps\obs-deps-2024-03-19-x64\bin\avformat-60.dll", + ".deps\obs-deps-2024-03-19-x64\bin\avutil-58.dll", + ".deps\obs-deps-2024-03-19-x64\bin\libx264-164.dll", + ".deps\obs-deps-2024-03-19-x64\bin\swresample-4.dll", + ".deps\obs-deps-2024-03-19-x64\bin\swscale-7.dll", + ".deps\obs-deps-2024-03-19-x64\bin\zlib.dll" + ".deps\obs-deps-2024-03-19-x64\bin\librist.dll" + ".deps\obs-deps-2024-03-19-x64\bin\srt.dll" + ".deps\obs-studio-30.1.2\build_x64\rundir\Debug\bin\64bit\obs-frontend-api.dll", + ".deps\obs-studio-30.1.2\build_x64\rundir\Debug\bin\64bit\obs.dll", + ".deps\obs-studio-30.1.2\build_x64\rundir\Debug\bin\64bit\w32-pthreads.dll" ) $obsDlls | ForEach-Object { Copy-Item -Force -Path $_ -Destination $testToolPath } - From b5f994fc2b8d7ee9da4fe2a75673fb6071d94254 Mon Sep 17 00:00:00 2001 From: Ruwen Hahn Date: Thu, 8 Aug 2024 13:59:57 +0200 Subject: [PATCH 8/9] Use condition variable to signal input thread if available --- src/tests/localvocal-offline-test.cpp | 48 +++++++++--------------- src/transcription-filter-data.h | 1 + src/whisper-utils/whisper-processing.cpp | 3 ++ 3 files changed, 21 insertions(+), 31 deletions(-) diff --git a/src/tests/localvocal-offline-test.cpp b/src/tests/localvocal-offline-test.cpp index ad14ef5..b56a0cd 100644 --- a/src/tests/localvocal-offline-test.cpp +++ b/src/tests/localvocal-offline-test.cpp @@ -101,6 +101,7 @@ create_context(int sample_rate, int channels, const std::string &whisper_model_p gf->process_while_muted = false; gf->buffered_output = false; gf->fix_utf8 = true; + gf->input_cv.emplace(); for (size_t i = 0; i < gf->channels; i++) { circlebuf_init(&gf->input_buffers[i]); @@ -453,38 +454,23 @@ int wmain(int argc, wchar_t *argv[]) frames_size_bytes = frames * frame_size_bytes; } { - bool wait = false; - auto max_wait = - start_time_time + (window_number * window_size_in_ms); - for (;;) { - { - std::lock_guard lock( - gf->whisper_buf_mutex); - wait = gf->input_buffers->size != 0; - } - if (!wait) - break; - - // sleep up to window size in case whisper is processing, so the buffer builds up similar to OBS - auto now = std::chrono::system_clock::now(); - if (false && now > max_wait) - break; - - auto wait_start = now + std::chrono::milliseconds(1); - auto wait_until = max_wait > wait_start ? wait_start - : max_wait; -#if 0 - obs_log(LOG_INFO, "sleeping %lld ms", - std::chrono::duration_cast( - (wait_until - - std::chrono::system_clock::now())) - .count()); -#endif - std::this_thread::sleep_until(wait_until); - } - { - std::lock_guard lock(gf->whisper_buf_mutex); + auto max_wait = start_time_time + + (window_number * window_size_in_ms); + std::unique_lock lock(gf->whisper_buf_mutex); + for (;;) { + // sleep up to window size in case whisper is processing, so the buffer builds up similar to OBS + auto now = std::chrono::system_clock::now(); + if (false && now > max_wait) + break; + + gf->input_cv->wait_for( + lock, std::chrono::milliseconds(10), [&] { + return gf->input_buffers->size == 0; + }); + if (gf->input_buffers->size == 0) + break; + } // push back current audio data to input circlebuf for (size_t c = 0; c < gf->channels; c++) { circlebuf_push_back( diff --git a/src/transcription-filter-data.h b/src/transcription-filter-data.h index 8a640f3..4b16d13 100644 --- a/src/transcription-filter-data.h +++ b/src/transcription-filter-data.h @@ -104,6 +104,7 @@ struct transcription_filter_data { std::mutex whisper_buf_mutex; std::mutex whisper_ctx_mutex; std::condition_variable wshiper_thread_cv; + std::optional input_cv; // translation context struct translation_context translation_ctx; diff --git a/src/whisper-utils/whisper-processing.cpp b/src/whisper-utils/whisper-processing.cpp index 347d821..e0f190c 100644 --- a/src/whisper-utils/whisper-processing.cpp +++ b/src/whisper-utils/whisper-processing.cpp @@ -599,6 +599,9 @@ void whisper_loop(void *data) } } + if (gf->input_cv.has_value()) + gf->input_cv->notify_one(); + // Sleep using the condition variable wshiper_thread_cv // This will wake up the thread if there is new data in the input buffer // or if the whisper context is null From bb73bcf97e1ea19255c4f0e6b4d656ce21b99215 Mon Sep 17 00:00:00 2001 From: Ruwen Hahn Date: Thu, 8 Aug 2024 14:00:36 +0200 Subject: [PATCH 9/9] Only wait in whisper thread if input buffers are empty --- src/whisper-utils/whisper-processing.cpp | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/whisper-utils/whisper-processing.cpp b/src/whisper-utils/whisper-processing.cpp index e0f190c..6d2d76e 100644 --- a/src/whisper-utils/whisper-processing.cpp +++ b/src/whisper-utils/whisper-processing.cpp @@ -606,7 +606,9 @@ void whisper_loop(void *data) // This will wake up the thread if there is new data in the input buffer // or if the whisper context is null std::unique_lock lock(gf->whisper_ctx_mutex); - gf->wshiper_thread_cv.wait_for(lock, std::chrono::milliseconds(50)); + if (gf->input_buffers->size == 0) { + gf->wshiper_thread_cv.wait_for(lock, std::chrono::milliseconds(50)); + } } obs_log(gf->log_level, "Exiting whisper thread");