From 10107d482e3dd41057c5c9c930f6710aabf674c5 Mon Sep 17 00:00:00 2001 From: Roy Shilkrot Date: Fri, 13 Sep 2024 21:06:49 -0400 Subject: [PATCH 01/20] chore: Update ONNX Runtime version to 1.19.2 and adjust corresponding hashes --- cmake/FetchOnnxruntime.cmake | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/cmake/FetchOnnxruntime.cmake b/cmake/FetchOnnxruntime.cmake index 0ed2975..940bada 100644 --- a/cmake/FetchOnnxruntime.cmake +++ b/cmake/FetchOnnxruntime.cmake @@ -8,7 +8,7 @@ set(CUSTOM_ONNXRUNTIME_HASH "" CACHE STRING "Hash of a downloaded ONNX Runtime tarball") -set(Onnxruntime_VERSION "1.17.1") +set(Onnxruntime_VERSION "1.19.2") if(CUSTOM_ONNXRUNTIME_URL STREQUAL "") set(USE_PREDEFINED_ONNXRUNTIME ON) @@ -25,17 +25,17 @@ if(USE_PREDEFINED_ONNXRUNTIME) if(APPLE) set(Onnxruntime_URL "${Onnxruntime_BASEURL}/onnxruntime-osx-universal2-${Onnxruntime_VERSION}.tgz") - set(Onnxruntime_HASH SHA256=9FA57FA6F202A373599377EF75064AE568FDA8DA838632B26A86024C7378D306) + set(Onnxruntime_HASH SHA256=b0289ddbc32f76e5d385abc7b74cc7c2c51cdf2285b7d118bf9d71206e5aee3a) elseif(MSVC) set(Onnxruntime_URL "${Onnxruntime_BASEURL}/onnxruntime-win-x64-${Onnxruntime_VERSION}.zip") - set(OOnnxruntime_HASH SHA256=4802AF9598DB02153D7DA39432A48823FF69B2FB4B59155461937F20782AA91C) + set(OOnnxruntime_HASH SHA256=dc4f841e511977c0a4f02e5066c3d9a58427644010ab4f89b918614a1cd4c2b0) else() if(CMAKE_SYSTEM_PROCESSOR STREQUAL "aarch64") set(Onnxruntime_URL "${Onnxruntime_BASEURL}/onnxruntime-linux-aarch64-${Onnxruntime_VERSION}.tgz") - set(Onnxruntime_HASH SHA256=70B6F536BB7AB5961D128E9DBD192368AC1513BFFB74FE92F97AAC342FBD0AC1) + set(Onnxruntime_HASH SHA256=dc4f841e511977c0a4f02e5066c3d9a58427644010ab4f89b918614a1cd4c2b0) else() set(Onnxruntime_URL "${Onnxruntime_BASEURL}/onnxruntime-linux-x64-gpu-${Onnxruntime_VERSION}.tgz") - set(Onnxruntime_HASH SHA256=613C53745EA4960ED368F6B3AB673558BB8561C84A8FA781B4EA7FB4A4340BE4) + set(Onnxruntime_HASH 
SHA256=4d1c10f0b410b67261302c6e18bb1b05ba924ca9081e3a26959e0d12ab69f534) endif() endif() else() From 6a5e1e7b787cc49dc7a84b9a018275c2bd9354b0 Mon Sep 17 00:00:00 2001 From: Roy Shilkrot Date: Tue, 1 Oct 2024 02:24:45 -0400 Subject: [PATCH 02/20] refactor: Add stenographer options and resample utils This commit adds the following changes: - Added a new section for stenographer options in the filter properties - Implemented resample utilities for handling audio data These changes enable the use of stenographer functionality and provide support for resampling audio data. --- CMakeLists.txt | 24 +- data/locale/en-US.ini | 3 +- src/stenographer/stenographer.cpp | 220 ++++++++++++++++++ src/stenographer/stenographer.h | 33 +++ src/stenographer/stenographer_interface.html | 223 +++++++++++++++++++ src/transcription-filter-callbacks.cpp | 6 +- src/transcription-filter-data.h | 4 + src/transcription-filter-properties.cpp | 9 + src/transcription-filter.cpp | 45 +++- src/transcription-utils.cpp | 46 ++-- src/whisper-utils/resample-utils.cpp | 98 ++++++++ src/whisper-utils/resample-utils.h | 10 + src/whisper-utils/vad-processing.cpp | 96 +------- 13 files changed, 702 insertions(+), 115 deletions(-) create mode 100644 src/stenographer/stenographer.cpp create mode 100644 src/stenographer/stenographer.h create mode 100644 src/stenographer/stenographer_interface.html create mode 100644 src/whisper-utils/resample-utils.cpp create mode 100644 src/whisper-utils/resample-utils.h diff --git a/CMakeLists.txt b/CMakeLists.txt index 9233158..f29aee0 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -101,6 +101,26 @@ include(cmake/BuildICU.cmake) target_link_libraries(${CMAKE_PROJECT_NAME} PRIVATE ICU) target_include_directories(${CMAKE_PROJECT_NAME} SYSTEM PUBLIC ${ICU_INCLUDE_DIR}) +if(NOT buildspec) +file(READ "${CMAKE_CURRENT_SOURCE_DIR}/buildspec.json" buildspec) +endif() +string( + JSON + version + GET + ${buildspec} + dependencies + prebuilt + version) +if(MSVC) +set(arch 
${CMAKE_GENERATOR_PLATFORM}) +elseif(APPLE) +set(arch universal) +endif() +set(deps_root "${CMAKE_CURRENT_SOURCE_DIR}/.deps/obs-deps-${version}-${arch}") +message(STATUS "deps_root: ${deps_root}") +target_include_directories(${CMAKE_PROJECT_NAME} PRIVATE "${deps_root}/include") + target_sources( ${CMAKE_PROJECT_NAME} PRIVATE src/plugin-main.c @@ -120,12 +140,14 @@ target_sources( src/whisper-utils/silero-vad-onnx.cpp src/whisper-utils/token-buffer-thread.cpp src/whisper-utils/vad-processing.cpp + src/whisper-utils/resample-utils.cpp src/translation/language_codes.cpp src/translation/translation.cpp src/translation/translation-utils.cpp src/ui/filter-replace-utils.cpp src/translation/translation-language-utils.cpp - src/ui/filter-replace-dialog.cpp) + src/ui/filter-replace-dialog.cpp + src/stenographer/stenographer.cpp) set_target_properties_plugin(${CMAKE_PROJECT_NAME} PROPERTIES OUTPUT_NAME ${_name}) diff --git a/data/locale/en-US.ini b/data/locale/en-US.ini index 9ef4d18..94eb613 100644 --- a/data/locale/en-US.ini +++ b/data/locale/en-US.ini @@ -87,4 +87,5 @@ Active_VAD="Active VAD" Hybrid_VAD="Hybrid VAD" translate_only_full_sentences="Translate only full sentences" duration_filter_threshold="Duration filter" -segment_duration="Segment duration" \ No newline at end of file +segment_duration="Segment duration" +stenographer_parameters="Stenographer Options" diff --git a/src/stenographer/stenographer.cpp b/src/stenographer/stenographer.cpp new file mode 100644 index 0000000..b8af2c6 --- /dev/null +++ b/src/stenographer/stenographer.cpp @@ -0,0 +1,220 @@ +#include + +#include "stenographer.h" +#include "plugin-support.h" +#include "whisper-utils/resample-utils.h" +#include "transcription-utils.h" + +#define ASIO_STANDALONE +#define _WEBSOCKETPP_CPP11_TYPE_TRAITS_ + +#include +#include +#include +#include +#include +#include +#include + +using json = nlohmann::json; +typedef websocketpp::server server; + +// WAV header structure +struct WAVHeader { + char riff[4] = 
{'R', 'I', 'F', 'F'}; + uint32_t overall_size; + char wave[4] = {'W', 'A', 'V', 'E'}; + char fmt_chunk_marker[4] = {'f', 'm', 't', ' '}; + uint32_t length_of_fmt = 16; + uint16_t format_type = 1; + uint16_t channels = 1; + uint32_t sample_rate = 16000; + uint32_t byterate; + uint16_t block_align; + uint16_t bits_per_sample = 16; + char data_chunk_header[4] = {'d', 'a', 't', 'a'}; + uint32_t data_size; +}; + +class TranscriptionHandler::Impl { +public: + using MessageCallback = TranscriptionHandler::MessageCallback; + + explicit Impl(transcription_filter_data *gf_, MessageCallback callback) + : gf(gf_), + messageCallback(callback), + running(false) + { + server.init_asio(); + + server.set_open_handler([this](websocketpp::connection_hdl hdl) { + std::lock_guard lock(mutex); + connection = hdl; + }); + + server.set_message_handler( + [this](websocketpp::connection_hdl hdl, server::message_ptr msg) { + handleIncomingMessage(msg->get_payload()); + }); + + // Initialize WAV header + wavHeader.byterate = + wavHeader.sample_rate * wavHeader.channels * wavHeader.bits_per_sample / 8; + wavHeader.block_align = wavHeader.channels * wavHeader.bits_per_sample / 8; + } + + void start() + { + if (!running) { + running = true; + serverThread = std::async(std::launch::async, [this]() { + server.listen(9002); + server.start_accept(); + server.run(); + }); + + processingThread = + std::async(std::launch::async, [this]() { processAudioQueue(); }); + } + } + + void stop() + { + if (running) { + running = false; + server.stop(); + if (serverThread.valid()) + serverThread.wait(); + if (processingThread.valid()) + processingThread.wait(); + } + } + +private: + transcription_filter_data *gf; + server server; + websocketpp::connection_hdl connection; + MessageCallback messageCallback; + std::queue> audioQueue; + std::mutex mutex; + std::atomic running; + std::future serverThread; + std::future processingThread; + + void handleIncomingMessage(const std::string &message) + { + try { + json j = 
json::parse(message); + std::string type = j["type"].get(); + std::string text = j["text"].get(); + + uint64_t start_timestamp = j["start_timestamp"].get(); + uint64_t end_timestamp = j["end_timestamp"].get(); + + messageCallback(type, text, start_timestamp, end_timestamp); + } catch (json::parse_error &e) { + obs_log(LOG_ERROR, "Failed to parse JSON message: %s", e.what()); + } catch (json::type_error &e) { + obs_log(LOG_ERROR, "Failed to parse JSON message: %s", e.what()); + } + } + + void processAudioQueue() + { + while (running) { + // get data from buffer and resample + uint64_t start_timestamp_offset_ns = 0; + uint64_t end_timestamp_offset_ns = 0; + + const int ret = get_data_from_buf_and_resample( + gf, start_timestamp_offset_ns, end_timestamp_offset_ns); + if (ret != 0) { + std::this_thread::sleep_for(std::chrono::milliseconds(10)); + continue; + } + + std::vector audio_input; + audio_input.resize(gf->resampled_buffer.size / sizeof(float)); + circlebuf_pop_front(&gf->resampled_buffer, audio_input.data(), + audio_input.size() * sizeof(float)); + + std::vector pcmData(audio_input.size()); + for (size_t i = 0; i < audio_input.size(); ++i) { + pcmData[i] = static_cast(audio_input[i] * 32767.0f); + } + + if (!pcmData.empty()) { + json timestampInfo = {{"start_timestamp", + start_timestamp_offset_ns}, + {"end_timestamp", end_timestamp_offset_ns}}; + if (connection.lock()) { + server.send(connection, timestampInfo.dump(), + websocketpp::frame::opcode::text); + } + sendAudioData(pcmData); + } else { + std::this_thread::sleep_for(std::chrono::milliseconds(10)); + } + + if (!gf->cleared_last_sub) { + // check if we should clear the current sub depending on the minimum subtitle duration + uint64_t now = now_ms(); + if ((now - gf->last_sub_render_time) > gf->max_sub_duration) { + // clear the current sub, call the callback with an empty string + obs_log(gf->log_level, + "Clearing current subtitle. 
now: %lu ms, last: %lu ms", + now, gf->last_sub_render_time); + clear_current_caption(gf); + } + } + } + } + + WAVHeader wavHeader; + std::vector audioBuffer; + + void sendAudioData(const std::vector &audioData) + { + std::lock_guard lock(mutex); + if (connection.lock()) { + audioBuffer.insert(audioBuffer.end(), audioData.begin(), audioData.end()); + + // If we have accumulated enough data, send it as a WAV file + if (audioBuffer.size() >= 8000) { // 0.5 seconds of audio at 16kHz + wavHeader.data_size = audioBuffer.size() * sizeof(int16_t); + wavHeader.overall_size = + wavHeader.data_size + sizeof(WAVHeader) - 8; + + std::vector wavData(sizeof(WAVHeader) + + wavHeader.data_size); + std::memcpy(wavData.data(), &wavHeader, sizeof(WAVHeader)); + std::memcpy(wavData.data() + sizeof(WAVHeader), audioBuffer.data(), + wavHeader.data_size); + + server.send(connection, wavData.data(), wavData.size(), + websocketpp::frame::opcode::binary); + + audioBuffer.clear(); + } + } + } +}; + +TranscriptionHandler::TranscriptionHandler(transcription_filter_data *gf_, MessageCallback callback) + : pimpl(std::make_unique(std::move(gf_), std::move(callback))) +{ +} + +TranscriptionHandler::~TranscriptionHandler() = default; + +TranscriptionHandler::TranscriptionHandler(TranscriptionHandler &&) noexcept = default; +TranscriptionHandler &TranscriptionHandler::operator=(TranscriptionHandler &&) noexcept = default; + +void TranscriptionHandler::start() +{ + pimpl->start(); +} +void TranscriptionHandler::stop() +{ + pimpl->stop(); +} \ No newline at end of file diff --git a/src/stenographer/stenographer.h b/src/stenographer/stenographer.h new file mode 100644 index 0000000..cb7a26e --- /dev/null +++ b/src/stenographer/stenographer.h @@ -0,0 +1,33 @@ +#pragma once + +// Forward declaration +struct transcription_filter_data; + +#include +#include +#include + +class TranscriptionHandler { +public: + using MessageCallback = + std::function; + + explicit 
TranscriptionHandler(transcription_filter_data *gf_, MessageCallback callback); + ~TranscriptionHandler(); + + // Disable copy + TranscriptionHandler(const TranscriptionHandler &) = delete; + TranscriptionHandler &operator=(const TranscriptionHandler &) = delete; + + // Enable move + TranscriptionHandler(TranscriptionHandler &&) noexcept; + TranscriptionHandler &operator=(TranscriptionHandler &&) noexcept; + + void start(); + void stop(); + +private: + class Impl; + std::unique_ptr pimpl; +}; \ No newline at end of file diff --git a/src/stenographer/stenographer_interface.html b/src/stenographer/stenographer_interface.html new file mode 100644 index 0000000..0b5eea8 --- /dev/null +++ b/src/stenographer/stenographer_interface.html @@ -0,0 +1,223 @@ + + + + + + Stenographer Interface + + + +

Stenographer Interface

+ + + +
Connection Status: Disconnected
+
Audio Status: Not started
+ + + + \ No newline at end of file diff --git a/src/transcription-filter-callbacks.cpp b/src/transcription-filter-callbacks.cpp index ff204b4..049c0b3 100644 --- a/src/transcription-filter-callbacks.cpp +++ b/src/transcription-filter-callbacks.cpp @@ -208,7 +208,9 @@ void set_text_callback(struct transcription_filter_data *gf, str_copy = fix_utf8(str_copy); } else { // only remove leading and trailing non-alphanumeric characters if the output is English + obs_log(LOG_INFO, "before: %s", str_copy.c_str()); str_copy = remove_leading_trailing_nonalpha(str_copy); + obs_log(LOG_INFO, "after: %s", str_copy.c_str()); } // if suppression is enabled, check if the text is in the suppression list @@ -411,7 +413,9 @@ void enable_callback(void *data_, calldata_t *cd) obs_log(gf_->log_level, "enable_callback: enable"); gf_->active = true; reset_caption_state(gf_); - update_whisper_model(gf_); + if (!gf_->stenographer_enabled) { + update_whisper_model(gf_); + } } else { obs_log(gf_->log_level, "enable_callback: disable"); gf_->active = false; diff --git a/src/transcription-filter-data.h b/src/transcription-filter-data.h index e8990be..5e7125e 100644 --- a/src/transcription-filter-data.h +++ b/src/transcription-filter-data.h @@ -19,6 +19,7 @@ #include "whisper-utils/silero-vad-onnx.h" #include "whisper-utils/whisper-processing.h" #include "whisper-utils/token-buffer-thread.h" +#include "stenographer/stenographer.h" #define MAX_PREPROC_CHANNELS 10 @@ -128,6 +129,9 @@ struct transcription_filter_data { TokenBufferSegmentation buffered_output_output_type = TokenBufferSegmentation::SEGMENTATION_TOKEN; + bool stenographer_enabled = false; + TranscriptionHandler *transcription_handler = nullptr; + // ctor transcription_filter_data() : whisper_buf_mutex(), whisper_ctx_mutex(), wshiper_thread_cv() { diff --git a/src/transcription-filter-properties.cpp b/src/transcription-filter-properties.cpp index a2c9da1..08ed7b1 100644 --- a/src/transcription-filter-properties.cpp +++ 
b/src/transcription-filter-properties.cpp @@ -504,6 +504,14 @@ void add_general_group_properties(obs_properties_t *ppts) } } +void add_stenographer_group_properties(obs_properties_t *ppts) +{ + // add group for stenographer options + obs_properties_t *stenographer_group = obs_properties_create(); + obs_properties_add_group(ppts, "stenographer_group", MT_("stenographer_parameters"), + OBS_GROUP_CHECKABLE, stenographer_group); +} + void add_partial_group_properties(obs_properties_t *ppts) { // add a group for partial transcription @@ -544,6 +552,7 @@ obs_properties_t *transcription_filter_properties(void *data) add_advanced_group_properties(ppts, gf); add_logging_group_properties(ppts); add_partial_group_properties(ppts); + add_stenographer_group_properties(ppts); add_whisper_params_group_properties(ppts); // Add a informative text about the plugin diff --git a/src/transcription-filter.cpp b/src/transcription-filter.cpp index 65ae072..df1675b 100644 --- a/src/transcription-filter.cpp +++ b/src/transcription-filter.cpp @@ -80,7 +80,7 @@ struct obs_audio_data *transcription_filter_filter_audio(void *data, struct obs_ return audio; } - if (gf->whisper_context == nullptr) { + if (gf->whisper_context == nullptr && !gf->stenographer_enabled) { // Whisper not initialized, just pass through return audio; } @@ -103,6 +103,8 @@ struct obs_audio_data *transcription_filter_filter_audio(void *data, struct obs_ circlebuf_push_back(&gf->input_buffers[c], audio->data[c], audio->frames * sizeof(float)); } + obs_log(gf->log_level, "currently %lu bytes in the audio input buffer", + gf->input_buffers[0].size); // push audio packet info (timestamp/frame count) to info circlebuf struct transcription_filter_audio_info info = {0}; info.frames = audio->frames; // number of frames in this packet @@ -164,6 +166,10 @@ void transcription_filter_destroy(void *data) if (gf->translation_monitor.isEnabled()) { gf->translation_monitor.stopThread(); } + if (gf->transcription_handler != nullptr) { + 
gf->transcription_handler->stop(); + delete gf->transcription_handler; + } bfree(gf); } @@ -404,7 +410,11 @@ void transcription_filter_update(void *data, obs_data_t *s) } } - if (gf->context != nullptr && obs_source_enabled(gf->context)) { + // check if stenographer is enabled + bool new_stenographer_enabled = obs_data_get_bool(s, "stenographer_group"); + + if (!new_stenographer_enabled && gf->context != nullptr && + obs_source_enabled(gf->context)) { if (gf->initial_creation) { obs_log(LOG_INFO, "Initial filter creation and source enabled"); @@ -424,6 +434,37 @@ void transcription_filter_update(void *data, obs_data_t *s) } } } + + if (new_stenographer_enabled != gf->stenographer_enabled) { + gf->stenographer_enabled = new_stenographer_enabled; + if (gf->stenographer_enabled) { + obs_log(gf->log_level, "Stenographer enabled"); + shutdown_whisper_thread(gf); // stop whisper + gf->transcription_handler = new TranscriptionHandler( + gf, [gf](const std::string &type, const std::string &text, + uint64_t start_timestamp, uint64_t end_timestamp) { + // send_caption_to_source(gf->text_source_name, text, gf); + DetectionResultWithText result; + result.text = text; + result.result = + (type == "partial") + ? 
DetectionResult::DETECTION_RESULT_PARTIAL + : DetectionResult::DETECTION_RESULT_SPEECH; + result.start_timestamp_ms = start_timestamp; + result.end_timestamp_ms = end_timestamp; + set_text_callback(gf, result); + }); + gf->transcription_handler->start(); + } else { + obs_log(gf->log_level, "Stenographer disabled"); + if (gf->transcription_handler) { + gf->transcription_handler->stop(); + delete gf->transcription_handler; + gf->transcription_handler = nullptr; + } + update_whisper_model(gf); // restart whisper + } + } } void *transcription_filter_create(obs_data_t *settings, obs_source_t *filter) diff --git a/src/transcription-utils.cpp b/src/transcription-utils.cpp index 727d3df..48c284a 100644 --- a/src/transcription-utils.cpp +++ b/src/transcription-utils.cpp @@ -82,6 +82,36 @@ std::string fix_utf8(const std::string &str) #endif } +/** + * @brief Trims leading and trailing whitespace and punctuation characters from the given string. + * + * This function removes any whitespace characters (spaces, tabs, newlines, etc.) + * and punctuation characters (as classified by std::isspace and std::ispunct) from + * the beginning and end of the input string, returning a new string with them removed. + * + * @param str The input string to be trimmed. + * @return A new string with leading and trailing whitespace and punctuation removed. + */ +std::string trim(const std::string& str) { + std::string str_copy = str; + + // remove trailing spaces, newlines, tabs or punctuation + auto last_non_space = std::find_if(str_copy.rbegin(), str_copy.rend(), + [](unsigned char ch) { + return !std::isspace(ch) && !std::ispunct(ch); + }).base(); + str_copy.erase(last_non_space, str_copy.end()); + + // remove leading spaces, newlines, tabs or punctuation + auto first_non_space = std::find_if(str_copy.begin(), str_copy.end(), + [](unsigned char ch) { + return !std::isspace(ch) && !std::ispunct(ch); + }); + str_copy.erase(str_copy.begin(), first_non_space); + + return str_copy; +} + /* * Remove leading and trailing non-alphabetic characters from a string. 
* This function is used to remove leading and trailing spaces, newlines, tabs or punctuation. @@ -111,21 +141,7 @@ std::string remove_leading_trailing_nonalpha(const std::string &str) return ""; } } - std::string str_copy = str; - // remove trailing spaces, newlines, tabs or punctuation - auto last_non_space = - std::find_if(str_copy.rbegin(), str_copy.rend(), [](unsigned char ch) { - return !std::isspace(ch) || !std::ispunct(ch); - }).base(); - str_copy.erase(last_non_space, str_copy.end()); - // remove leading spaces, newlines, tabs or punctuation - auto first_non_space = std::find_if(str_copy.begin(), str_copy.end(), - [](unsigned char ch) { - return !std::isspace(ch) || !std::ispunct(ch); - }) + - 1; - str_copy.erase(str_copy.begin(), first_non_space); - return str_copy; + return trim(str); } std::vector split(const std::string &string, char delimiter) diff --git a/src/whisper-utils/resample-utils.cpp b/src/whisper-utils/resample-utils.cpp new file mode 100644 index 0000000..7533b61 --- /dev/null +++ b/src/whisper-utils/resample-utils.cpp @@ -0,0 +1,98 @@ +#include + +#include "resample-utils.h" + +int get_data_from_buf_and_resample(transcription_filter_data *gf, + uint64_t &start_timestamp_offset_ns, + uint64_t &end_timestamp_offset_ns) +{ + uint32_t num_frames_from_infos = 0; + + { + // scoped lock the buffer mutex + std::lock_guard lock(gf->whisper_buf_mutex); + + if (gf->input_buffers[0].size == 0) { + return 1; + } + + obs_log(gf->log_level, + "segmentation: currently %lu bytes in the audio input buffer", + gf->input_buffers[0].size); + + // max number of frames is 10 seconds worth of audio + const size_t max_num_frames = gf->sample_rate * 10; + + // pop all infos from the info buffer and mark the beginning timestamp from the first + // info as the beginning timestamp of the segment + struct transcription_filter_audio_info info_from_buf = {0}; + const size_t size_of_audio_info = sizeof(transcription_filter_audio_info); + while (gf->info_buffer.size >= 
size_of_audio_info) { + circlebuf_pop_front(&gf->info_buffer, &info_from_buf, size_of_audio_info); + num_frames_from_infos += info_from_buf.frames; + if (start_timestamp_offset_ns == 0) { + start_timestamp_offset_ns = info_from_buf.timestamp_offset_ns; + } + // Check if we're within the needed segment length + if (num_frames_from_infos > max_num_frames) { + // too big, push the last info into the buffer's front where it was + num_frames_from_infos -= info_from_buf.frames; + circlebuf_push_front(&gf->info_buffer, &info_from_buf, + size_of_audio_info); + break; + } + } + // calculate the end timestamp from the info plus the number of frames in the packet + end_timestamp_offset_ns = info_from_buf.timestamp_offset_ns + + info_from_buf.frames * 1000000000 / gf->sample_rate; + + if (start_timestamp_offset_ns > end_timestamp_offset_ns) { + // this may happen when the incoming media has a timestamp reset + // in this case, we should figure out the start timestamp from the end timestamp + // and the number of frames + start_timestamp_offset_ns = + end_timestamp_offset_ns - + num_frames_from_infos * 1000000000 / gf->sample_rate; + } + + for (size_t c = 0; c < gf->channels; c++) { + // zero the rest of copy_buffers + memset(gf->copy_buffers[c], 0, gf->frames * sizeof(float)); + } + + /* Pop from input circlebuf */ + for (size_t c = 0; c < gf->channels; c++) { + // Push the new data to copy_buffers[c] + circlebuf_pop_front(&gf->input_buffers[c], gf->copy_buffers[c], + num_frames_from_infos * sizeof(float)); + } + } + + obs_log(gf->log_level, "found %d frames from info buffer.", num_frames_from_infos); + gf->last_num_frames = num_frames_from_infos; + + { + // resample to 16kHz + float *resampled_16khz[MAX_PREPROC_CHANNELS]; + uint32_t resampled_16khz_frames; + uint64_t ts_offset; + { + ProfileScope("resample"); + audio_resampler_resample(gf->resampler_to_whisper, + (uint8_t **)resampled_16khz, + &resampled_16khz_frames, &ts_offset, + (const uint8_t **)gf->copy_buffers, + 
(uint32_t)num_frames_from_infos); + } + + circlebuf_push_back(&gf->resampled_buffer, resampled_16khz[0], + resampled_16khz_frames * sizeof(float)); + obs_log(gf->log_level, + "resampled: %d channels, %d frames, %f ms, current size: %lu bytes", + (int)gf->channels, (int)resampled_16khz_frames, + (float)resampled_16khz_frames / WHISPER_SAMPLE_RATE * 1000.0f, + gf->resampled_buffer.size); + } + + return 0; +} diff --git a/src/whisper-utils/resample-utils.h b/src/whisper-utils/resample-utils.h new file mode 100644 index 0000000..c2d2872 --- /dev/null +++ b/src/whisper-utils/resample-utils.h @@ -0,0 +1,10 @@ +#ifndef RESAMPLE_UTILS_H +#define RESAMPLE_UTILS_H + +#include "transcription-filter-data.h" + +int get_data_from_buf_and_resample(transcription_filter_data *gf, + uint64_t &start_timestamp_offset_ns, + uint64_t &end_timestamp_offset_ns); + +#endif diff --git a/src/whisper-utils/vad-processing.cpp b/src/whisper-utils/vad-processing.cpp index 0e9c744..d0a9266 100644 --- a/src/whisper-utils/vad-processing.cpp +++ b/src/whisper-utils/vad-processing.cpp @@ -4,107 +4,13 @@ #include "transcription-filter-data.h" #include "vad-processing.h" +#include "resample-utils.h" #ifdef _WIN32 #define NOMINMAX #include #endif -int get_data_from_buf_and_resample(transcription_filter_data *gf, - uint64_t &start_timestamp_offset_ns, - uint64_t &end_timestamp_offset_ns) -{ - uint32_t num_frames_from_infos = 0; - - { - // scoped lock the buffer mutex - std::lock_guard lock(gf->whisper_buf_mutex); - - if (gf->input_buffers[0].size == 0) { - return 1; - } - - obs_log(gf->log_level, - "segmentation: currently %lu bytes in the audio input buffer", - gf->input_buffers[0].size); - - // max number of frames is 10 seconds worth of audio - const size_t max_num_frames = gf->sample_rate * 10; - - // pop all infos from the info buffer and mark the beginning timestamp from the first - // info as the beginning timestamp of the segment - struct transcription_filter_audio_info info_from_buf = {0}; - 
const size_t size_of_audio_info = sizeof(transcription_filter_audio_info); - while (gf->info_buffer.size >= size_of_audio_info) { - circlebuf_pop_front(&gf->info_buffer, &info_from_buf, size_of_audio_info); - num_frames_from_infos += info_from_buf.frames; - if (start_timestamp_offset_ns == 0) { - start_timestamp_offset_ns = info_from_buf.timestamp_offset_ns; - } - // Check if we're within the needed segment length - if (num_frames_from_infos > max_num_frames) { - // too big, push the last info into the buffer's front where it was - num_frames_from_infos -= info_from_buf.frames; - circlebuf_push_front(&gf->info_buffer, &info_from_buf, - size_of_audio_info); - break; - } - } - // calculate the end timestamp from the info plus the number of frames in the packet - end_timestamp_offset_ns = info_from_buf.timestamp_offset_ns + - info_from_buf.frames * 1000000000 / gf->sample_rate; - - if (start_timestamp_offset_ns > end_timestamp_offset_ns) { - // this may happen when the incoming media has a timestamp reset - // in this case, we should figure out the start timestamp from the end timestamp - // and the number of frames - start_timestamp_offset_ns = - end_timestamp_offset_ns - - num_frames_from_infos * 1000000000 / gf->sample_rate; - } - - for (size_t c = 0; c < gf->channels; c++) { - // zero the rest of copy_buffers - memset(gf->copy_buffers[c], 0, gf->frames * sizeof(float)); - } - - /* Pop from input circlebuf */ - for (size_t c = 0; c < gf->channels; c++) { - // Push the new data to copy_buffers[c] - circlebuf_pop_front(&gf->input_buffers[c], gf->copy_buffers[c], - num_frames_from_infos * sizeof(float)); - } - } - - obs_log(gf->log_level, "found %d frames from info buffer.", num_frames_from_infos); - gf->last_num_frames = num_frames_from_infos; - - { - // resample to 16kHz - float *resampled_16khz[MAX_PREPROC_CHANNELS]; - uint32_t resampled_16khz_frames; - uint64_t ts_offset; - { - ProfileScope("resample"); - audio_resampler_resample(gf->resampler_to_whisper, - 
(uint8_t **)resampled_16khz, - &resampled_16khz_frames, &ts_offset, - (const uint8_t **)gf->copy_buffers, - (uint32_t)num_frames_from_infos); - } - - circlebuf_push_back(&gf->resampled_buffer, resampled_16khz[0], - resampled_16khz_frames * sizeof(float)); - obs_log(gf->log_level, - "resampled: %d channels, %d frames, %f ms, current size: %lu bytes", - (int)gf->channels, (int)resampled_16khz_frames, - (float)resampled_16khz_frames / WHISPER_SAMPLE_RATE * 1000.0f, - gf->resampled_buffer.size); - } - - return 0; -} - vad_state vad_based_segmentation(transcription_filter_data *gf, vad_state last_vad_state) { // get data from buffer and resample From b3a0316ccd8fda5a9411118a2c04869e11c5eb5b Mon Sep 17 00:00:00 2001 From: Roy Shilkrot Date: Sun, 6 Oct 2024 14:27:00 -0400 Subject: [PATCH 03/20] refactor: Add stenographer delay option Added a new option for stenographer delay in the stenographer group properties. This allows users to specify the amount of delay for partial transcription. The default delay is set to 10,000 milliseconds. 
Fixes # --- data/locale/en-US.ini | 1 + src/transcription-filter-data.h | 1 + src/transcription-filter-properties.cpp | 6 ++++++ src/transcription-filter.cpp | 2 +- 4 files changed, 9 insertions(+), 1 deletion(-) diff --git a/data/locale/en-US.ini b/data/locale/en-US.ini index 94eb613..97d0835 100644 --- a/data/locale/en-US.ini +++ b/data/locale/en-US.ini @@ -89,3 +89,4 @@ translate_only_full_sentences="Translate only full sentences" duration_filter_threshold="Duration filter" segment_duration="Segment duration" stenographer_parameters="Stenographer Options" +stenographer_delay="Audio Delay" \ No newline at end of file diff --git a/src/transcription-filter-data.h b/src/transcription-filter-data.h index 5e7125e..031af38 100644 --- a/src/transcription-filter-data.h +++ b/src/transcription-filter-data.h @@ -131,6 +131,7 @@ struct transcription_filter_data { bool stenographer_enabled = false; TranscriptionHandler *transcription_handler = nullptr; + int stenographer_delay = 0; // ctor transcription_filter_data() : whisper_buf_mutex(), whisper_ctx_mutex(), wshiper_thread_cv() diff --git a/src/transcription-filter-properties.cpp b/src/transcription-filter-properties.cpp index 08ed7b1..c639b03 100644 --- a/src/transcription-filter-properties.cpp +++ b/src/transcription-filter-properties.cpp @@ -510,6 +510,10 @@ void add_stenographer_group_properties(obs_properties_t *ppts) obs_properties_t *stenographer_group = obs_properties_create(); obs_properties_add_group(ppts, "stenographer_group", MT_("stenographer_parameters"), OBS_GROUP_CHECKABLE, stenographer_group); + + // add delay amount for partial transcription + obs_properties_add_int_slider(stenographer_group, "stenographer_delay", MT_("stenographer_delay"), + 1000, 12000, 100); } void add_partial_group_properties(obs_properties_t *ppts) @@ -603,6 +607,8 @@ void transcription_filter_defaults(obs_data_t *s) obs_data_set_default_double(s, "sentence_psum_accept_thresh", 0.4); obs_data_set_default_bool(s, "partial_group", 
false); obs_data_set_default_int(s, "partial_latency", 1100); + obs_data_set_default_bool(s, "stenographer_group", false); + obs_data_set_default_int(s, "stenographer_delay", 10000); // translation options obs_data_set_default_double(s, "translation_sampling_temperature", 0.1); diff --git a/src/transcription-filter.cpp b/src/transcription-filter.cpp index df1675b..fc0a1fe 100644 --- a/src/transcription-filter.cpp +++ b/src/transcription-filter.cpp @@ -440,10 +440,10 @@ void transcription_filter_update(void *data, obs_data_t *s) if (gf->stenographer_enabled) { obs_log(gf->log_level, "Stenographer enabled"); shutdown_whisper_thread(gf); // stop whisper + gf->stenographer_delay = (int)obs_data_get_int(s, "stenographer_delay"); gf->transcription_handler = new TranscriptionHandler( gf, [gf](const std::string &type, const std::string &text, uint64_t start_timestamp, uint64_t end_timestamp) { - // send_caption_to_source(gf->text_source_name, text, gf); DetectionResultWithText result; result.text = text; result.result = From db155bb466500cc6b783efd17167a245ed781220 Mon Sep 17 00:00:00 2001 From: Roy Shilkrot Date: Mon, 7 Oct 2024 12:22:56 -0400 Subject: [PATCH 04/20] refactor: Add resample-utils.cpp and update stenographer delay This commit adds the file resample-utils.cpp to the whisper-utils directory. It also updates the stenographer delay functionality in stenographer.cpp and transcription-filter.cpp. The delay is now set to 1000 milliseconds. These changes improve the audio processing and transcription capabilities of the application. 
--- CMakeLists.txt | 1 + src/stenographer/stenographer.cpp | 4 ++-- src/tests/localvocal-offline-test.cpp | 15 +++++++++++++++ src/transcription-filter-data.h | 3 ++- src/transcription-filter-properties.cpp | 4 ++-- src/transcription-filter.cpp | 25 +++++++++++++++++++++++++ 6 files changed, 47 insertions(+), 5 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index f29aee0..fb11342 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -168,6 +168,7 @@ if(ENABLE_TESTS) src/whisper-utils/silero-vad-onnx.cpp src/whisper-utils/token-buffer-thread.cpp src/whisper-utils/vad-processing.cpp + src/whisper-utils/resample-utils.cpp src/translation/language_codes.cpp src/translation/translation.cpp src/ui/filter-replace-utils.cpp diff --git a/src/stenographer/stenographer.cpp b/src/stenographer/stenographer.cpp index b8af2c6..269dee1 100644 --- a/src/stenographer/stenographer.cpp +++ b/src/stenographer/stenographer.cpp @@ -122,7 +122,7 @@ class TranscriptionHandler::Impl { void processAudioQueue() { while (running) { - // get data from buffer and resample + // get data from buffer and resample to 16kHz uint64_t start_timestamp_offset_ns = 0; uint64_t end_timestamp_offset_ns = 0; @@ -217,4 +217,4 @@ void TranscriptionHandler::start() void TranscriptionHandler::stop() { pimpl->stop(); -} \ No newline at end of file +} diff --git a/src/tests/localvocal-offline-test.cpp b/src/tests/localvocal-offline-test.cpp index 3c0f4a4..5635045 100644 --- a/src/tests/localvocal-offline-test.cpp +++ b/src/tests/localvocal-offline-test.cpp @@ -328,6 +328,21 @@ void set_text_callback(struct transcription_filter_data *gf, } }; +void clear_current_caption(transcription_filter_data *gf_) +{ + if (gf_->captions_monitor.isEnabled()) { + gf_->captions_monitor.clear(); + gf_->translation_monitor.clear(); + } + // reset translation context + gf_->last_text_for_translation = ""; + gf_->last_text_translation = ""; + gf_->translation_ctx.last_input_tokens.clear(); + 
gf_->translation_ctx.last_translation_tokens.clear(); + gf_->last_transcription_sentence.clear(); + gf_->cleared_last_sub = true; +} + void release_context(transcription_filter_data *gf) { obs_log(LOG_INFO, "destroy"); diff --git a/src/transcription-filter-data.h b/src/transcription-filter-data.h index 031af38..a721b8c 100644 --- a/src/transcription-filter-data.h +++ b/src/transcription-filter-data.h @@ -131,7 +131,8 @@ struct transcription_filter_data { bool stenographer_enabled = false; TranscriptionHandler *transcription_handler = nullptr; - int stenographer_delay = 0; + int stenographer_delay = 1000; + std::deque stenographer_delay_buffers[MAX_PREPROC_CHANNELS]; // ctor transcription_filter_data() : whisper_buf_mutex(), whisper_ctx_mutex(), wshiper_thread_cv() diff --git a/src/transcription-filter-properties.cpp b/src/transcription-filter-properties.cpp index c639b03..cc90365 100644 --- a/src/transcription-filter-properties.cpp +++ b/src/transcription-filter-properties.cpp @@ -512,8 +512,8 @@ void add_stenographer_group_properties(obs_properties_t *ppts) OBS_GROUP_CHECKABLE, stenographer_group); // add delay amount for partial transcription - obs_properties_add_int_slider(stenographer_group, "stenographer_delay", MT_("stenographer_delay"), - 1000, 12000, 100); + obs_properties_add_int_slider(stenographer_group, "stenographer_delay", + MT_("stenographer_delay"), 1000, 12000, 100); } void add_partial_group_properties(obs_properties_t *ppts) diff --git a/src/transcription-filter.cpp b/src/transcription-filter.cpp index fc0a1fe..93e3b9b 100644 --- a/src/transcription-filter.cpp +++ b/src/transcription-filter.cpp @@ -114,6 +114,31 @@ struct obs_audio_data *transcription_filter_filter_audio(void *data, struct obs_ gf->wshiper_thread_cv.notify_one(); } + if (gf->stenographer_enabled) { + // Stenographer mode - apply delay. + // Store the audio data in a buffer and process it after the delay. 
+ // push the data to the back of gf->stenographer_delay_buffer + for (size_t c = 0; c < gf->channels; c++) { + for (size_t i = 0; i < audio->frames; i++) { + gf->stenographer_delay_buffers[c].push_back(audio->data[c][i]); + } + } + + // If the buffer is larger than the delay, emit the oldest data + // Take from the buffer as much as requested by the incoming audio data + size_t delay_frames = gf->sample_rate * gf->stenographer_delay / 1000; + if (gf->stenographer_delay_buffers[0].size() >= delay_frames) { + // Replace data on the audio buffer with the delayed data + for (size_t c = 0; c < gf->channels; c++) { + for (size_t i = 0; i < audio->frames; i++) { + audio->data[c][i] = + gf->stenographer_delay_buffers[c].front(); + gf->stenographer_delay_buffers[c].pop_front(); + } + } + } + } + return audio; } From fcb79efa1ccc08487c35f4348a40dc53d3abe7f6 Mon Sep 17 00:00:00 2001 From: Roy Shilkrot Date: Mon, 7 Oct 2024 15:40:33 -0400 Subject: [PATCH 05/20] refactor: Update stenographer delay variable name Update the variable name from "stenographer_delay" to "stenographer_delay_ms" in the transcription filter code. This change reflects the unit of the delay value in milliseconds. The code has been modified in the "transcription-filter-data.h" and "transcription-filter.cpp" files. 
--- src/transcription-filter-data.h | 2 +- src/transcription-filter.cpp | 43 +++++++++++++++++++++++++-------- 2 files changed, 34 insertions(+), 11 deletions(-) diff --git a/src/transcription-filter-data.h b/src/transcription-filter-data.h index a721b8c..76d39e8 100644 --- a/src/transcription-filter-data.h +++ b/src/transcription-filter-data.h @@ -131,7 +131,7 @@ struct transcription_filter_data { bool stenographer_enabled = false; TranscriptionHandler *transcription_handler = nullptr; - int stenographer_delay = 1000; + int stenographer_delay_ms = 1000; std::deque stenographer_delay_buffers[MAX_PREPROC_CHANNELS]; // ctor diff --git a/src/transcription-filter.cpp b/src/transcription-filter.cpp index 93e3b9b..25cbba9 100644 --- a/src/transcription-filter.cpp +++ b/src/transcription-filter.cpp @@ -119,22 +119,45 @@ struct obs_audio_data *transcription_filter_filter_audio(void *data, struct obs_ // Store the audio data in a buffer and process it after the delay. // push the data to the back of gf->stenographer_delay_buffer for (size_t c = 0; c < gf->channels; c++) { - for (size_t i = 0; i < audio->frames; i++) { - gf->stenographer_delay_buffers[c].push_back(audio->data[c][i]); - } + // take a audio->frames * sizeof(float) bytes chunk from audio->data[c] and push it + // to the back of the buffer as a float + std::vector audio_data_chunk( + (float *)audio->data[c], ((float *)audio->data[c]) + audio->frames); + gf->stenographer_delay_buffers[c].insert( + gf->stenographer_delay_buffers[c].end(), audio_data_chunk.begin(), + audio_data_chunk.end()); } // If the buffer is larger than the delay, emit the oldest data // Take from the buffer as much as requested by the incoming audio data - size_t delay_frames = gf->sample_rate * gf->stenographer_delay / 1000; + size_t delay_frames = (size_t)((float)gf->sample_rate * + (float)gf->stenographer_delay_ms / 1000.0f) + + audio->frames; if (gf->stenographer_delay_buffers[0].size() >= delay_frames) { + obs_log(LOG_INFO, + 
"Stenographer delay buffer filled %lu/%lu. Sending %lu frames", + gf->stenographer_delay_buffers[0].size(), delay_frames, + audio->frames); // Replace data on the audio buffer with the delayed data for (size_t c = 0; c < gf->channels; c++) { - for (size_t i = 0; i < audio->frames; i++) { - audio->data[c][i] = - gf->stenographer_delay_buffers[c].front(); - gf->stenographer_delay_buffers[c].pop_front(); - } + // Take the oldest audio->frames from the buffer and put it in the audio buffer + // as bytes + std::vector audio_data_chunk( + gf->stenographer_delay_buffers[c].begin(), + gf->stenographer_delay_buffers[c].begin() + audio->frames); + memcpy(audio->data[c], audio_data_chunk.data(), + audio->frames * sizeof(float)); + // Remove the oldest audio->frames from the buffer + gf->stenographer_delay_buffers[c].erase( + gf->stenographer_delay_buffers[c].begin(), + gf->stenographer_delay_buffers[c].begin() + audio->frames); + } + } else { + obs_log(LOG_INFO, "Stenographer delay buffer not filled yet %lu/%lu", + gf->stenographer_delay_buffers[0].size(), delay_frames); + // Fill the audio buffer with silence + for (size_t c = 0; c < gf->channels; c++) { + memset(audio->data[c], 0, audio->frames * sizeof(float)); } } } @@ -465,7 +488,7 @@ void transcription_filter_update(void *data, obs_data_t *s) if (gf->stenographer_enabled) { obs_log(gf->log_level, "Stenographer enabled"); shutdown_whisper_thread(gf); // stop whisper - gf->stenographer_delay = (int)obs_data_get_int(s, "stenographer_delay"); + gf->stenographer_delay_ms = (int)obs_data_get_int(s, "stenographer_delay"); gf->transcription_handler = new TranscriptionHandler( gf, [gf](const std::string &type, const std::string &text, uint64_t start_timestamp, uint64_t end_timestamp) { From 49538e7f8882da5332523121e6e6443e9a294daf Mon Sep 17 00:00:00 2001 From: Roy Shilkrot Date: Mon, 7 Oct 2024 15:47:10 -0400 Subject: [PATCH 06/20] refactor: Update stenographer interface buttons and add pause/resume functionality --- 
src/stenographer/stenographer_interface.html | 76 ++++++++++++++++++-- 1 file changed, 71 insertions(+), 5 deletions(-) diff --git a/src/stenographer/stenographer_interface.html b/src/stenographer/stenographer_interface.html index 0b5eea8..d50b2dc 100644 --- a/src/stenographer/stenographer_interface.html +++ b/src/stenographer/stenographer_interface.html @@ -15,11 +15,23 @@

Stenographer Interface

- + + + +
+ Connection Status: Disconnected +
+ +
+ Audio Status: Not started +
+ +
+

Hotkey: Press Alt+P to pause/resume audio

+
+ -
Connection Status: Disconnected
-
Audio Status: Not started
- \ No newline at end of file + From 37f84399041f5c43024db5188c7ab2e81c16e4f8 Mon Sep 17 00:00:00 2001 From: Roy Shilkrot Date: Mon, 7 Oct 2024 22:58:13 -0400 Subject: [PATCH 07/20] Revert "refactor: Update stenographer interface buttons and add pause/resume functionality" This reverts commit 49538e7f8882da5332523121e6e6443e9a294daf. --- src/stenographer/stenographer_interface.html | 76 ++------------------ 1 file changed, 5 insertions(+), 71 deletions(-) diff --git a/src/stenographer/stenographer_interface.html b/src/stenographer/stenographer_interface.html index d50b2dc..0b5eea8 100644 --- a/src/stenographer/stenographer_interface.html +++ b/src/stenographer/stenographer_interface.html @@ -15,23 +15,11 @@

Stenographer Interface

- - - -
- Connection Status: Disconnected -
- -
- Audio Status: Not started -
- -
-

Hotkey: Press Alt+P to pause/resume audio

-
- + +
Connection Status: Disconnected
+
Audio Status: Not started
- + \ No newline at end of file From bca37a7b16e62b0b37d85be29344762806e38bf8 Mon Sep 17 00:00:00 2001 From: Roy Shilkrot Date: Mon, 7 Oct 2024 23:37:16 -0400 Subject: [PATCH 08/20] refactor: Clear stenographer delay buffers when resetting caption state This commit modifies the `reset_caption_state` function in `transcription-filter-callbacks.cpp` to clear the `stenographer_delay_buffers` when resetting the caption state. This ensures that the buffers are empty and ready for new data. Additionally, the `channels` variable in `transcription-filter-data.h` is updated to represent the number of channels in the input. These changes improve the functionality and maintainability of the code. --- src/transcription-filter-callbacks.cpp | 4 ++-- src/transcription-filter-data.h | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/transcription-filter-callbacks.cpp b/src/transcription-filter-callbacks.cpp index 049c0b3..6a8169a 100644 --- a/src/transcription-filter-callbacks.cpp +++ b/src/transcription-filter-callbacks.cpp @@ -352,6 +352,7 @@ void reset_caption_state(transcription_filter_data *gf_) if (gf_->input_buffers[c].data != nullptr) { circlebuf_free(&gf_->input_buffers[c]); } + gf_->stenographer_delay_buffers[c].clear(); } if (gf_->info_buffer.data != nullptr) { circlebuf_free(&gf_->info_buffer); @@ -409,17 +410,16 @@ void enable_callback(void *data_, calldata_t *cd) { transcription_filter_data *gf_ = static_cast(data_); bool enable = calldata_bool(cd, "enabled"); + reset_caption_state(gf_); if (enable) { obs_log(gf_->log_level, "enable_callback: enable"); gf_->active = true; - reset_caption_state(gf_); if (!gf_->stenographer_enabled) { update_whisper_model(gf_); } } else { obs_log(gf_->log_level, "enable_callback: disable"); gf_->active = false; - reset_caption_state(gf_); shutdown_whisper_thread(gf_); } } diff --git a/src/transcription-filter-data.h b/src/transcription-filter-data.h index 76d39e8..996faa5 100644 --- 
a/src/transcription-filter-data.h +++ b/src/transcription-filter-data.h @@ -25,7 +25,7 @@ struct transcription_filter_data { obs_source_t *context; // obs filter source (this filter) - size_t channels; // number of channels + size_t channels; // number of channels in the input uint32_t sample_rate; // input sample rate // How many input frames (in input sample rate) are needed for the next whisper frame size_t frames; From aefd87d6c4d9eb59fd157bbde3be93fb4a609659 Mon Sep 17 00:00:00 2001 From: Roy Shilkrot Date: Thu, 10 Oct 2024 09:08:40 -0400 Subject: [PATCH 09/20] Refactor CMakeLists.txt and stenographer.cpp This commit refactors the CMakeLists.txt file by adding a check for the buildspec variable and reading the buildspec.json file if it is not set. It also sets the arch variable based on the platform. In stenographer.cpp, the code now includes websocketpp/config/asio_no_tls.hpp only if the platform is not Linux. Additionally, the unused parameter hdl is now marked as unused in the message handler. 
--- CMakeLists.txt | 6 +++--- src/stenographer/stenographer.cpp | 3 +++ 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 6501163..4e53fb6 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -102,7 +102,7 @@ target_link_libraries(${CMAKE_PROJECT_NAME} PRIVATE ICU) target_include_directories(${CMAKE_PROJECT_NAME} SYSTEM PUBLIC ${ICU_INCLUDE_DIR}) if(NOT buildspec) -file(READ "${CMAKE_CURRENT_SOURCE_DIR}/buildspec.json" buildspec) + file(READ "${CMAKE_CURRENT_SOURCE_DIR}/buildspec.json" buildspec) endif() string( JSON @@ -113,9 +113,9 @@ string( prebuilt version) if(MSVC) -set(arch ${CMAKE_GENERATOR_PLATFORM}) + set(arch ${CMAKE_GENERATOR_PLATFORM}) elseif(APPLE) -set(arch universal) + set(arch universal) endif() set(deps_root "${CMAKE_CURRENT_SOURCE_DIR}/.deps/obs-deps-${version}-${arch}") message(STATUS "deps_root: ${deps_root}") diff --git a/src/stenographer/stenographer.cpp b/src/stenographer/stenographer.cpp index 269dee1..c11ac80 100644 --- a/src/stenographer/stenographer.cpp +++ b/src/stenographer/stenographer.cpp @@ -8,7 +8,9 @@ #define ASIO_STANDALONE #define _WEBSOCKETPP_CPP11_TYPE_TRAITS_ +#ifndef __linux__ #include +#endif #include #include #include @@ -54,6 +56,7 @@ class TranscriptionHandler::Impl { server.set_message_handler( [this](websocketpp::connection_hdl hdl, server::message_ptr msg) { + UNUSED_PARAMETER(hdl); handleIncomingMessage(msg->get_payload()); }); From 1de935758f2e369b863a58c69b67a818f3918796 Mon Sep 17 00:00:00 2001 From: Roy Shilkrot Date: Thu, 10 Oct 2024 10:48:19 -0400 Subject: [PATCH 10/20] Refactor CMakeLists.txt and stenographer.cpp --- CMakeLists.txt | 41 ++++++++++++++++++++---------------- cmake/FetchWebsocketpp.cmake | 9 ++++++++ 2 files changed, 32 insertions(+), 18 deletions(-) create mode 100644 cmake/FetchWebsocketpp.cmake diff --git a/CMakeLists.txt b/CMakeLists.txt index 4e53fb6..3cd971d 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -101,25 +101,30 @@ 
include(cmake/BuildICU.cmake) target_link_libraries(${CMAKE_PROJECT_NAME} PRIVATE ICU) target_include_directories(${CMAKE_PROJECT_NAME} SYSTEM PUBLIC ${ICU_INCLUDE_DIR}) -if(NOT buildspec) - file(READ "${CMAKE_CURRENT_SOURCE_DIR}/buildspec.json" buildspec) -endif() -string( - JSON - version - GET - ${buildspec} - dependencies - prebuilt - version) -if(MSVC) - set(arch ${CMAKE_GENERATOR_PLATFORM}) -elseif(APPLE) - set(arch universal) +if(MSVC or APPLE) + if(NOT buildspec) + file(READ "${CMAKE_CURRENT_SOURCE_DIR}/buildspec.json" buildspec) + endif() + string( + JSON + version + GET + ${buildspec} + dependencies + prebuilt + version) + if(MSVC) + set(arch ${CMAKE_GENERATOR_PLATFORM}) + elseif(APPLE) + set(arch universal) + endif() + set(deps_root "${CMAKE_CURRENT_SOURCE_DIR}/.deps/obs-deps-${version}-${arch}") + message(STATUS "deps_root: ${deps_root}") + target_include_directories(${CMAKE_PROJECT_NAME} PRIVATE "${deps_root}/include") +else() + include(cmake/FetchWebsocketpp.cmake) + target_include_directories(${CMAKE_PROJECT_NAME} PRIVATE ${websocketpp_SOURCE_DIR}/) endif() -set(deps_root "${CMAKE_CURRENT_SOURCE_DIR}/.deps/obs-deps-${version}-${arch}") -message(STATUS "deps_root: ${deps_root}") -target_include_directories(${CMAKE_PROJECT_NAME} PRIVATE "${deps_root}/include") target_sources( ${CMAKE_PROJECT_NAME} diff --git a/cmake/FetchWebsocketpp.cmake b/cmake/FetchWebsocketpp.cmake new file mode 100644 index 0000000..1c60239 --- /dev/null +++ b/cmake/FetchWebsocketpp.cmake @@ -0,0 +1,9 @@ +include(FetchContent) + +FetchContent_Declare( + websocketpp + URL https://github.com/zaphoyd/websocketpp/archive/refs/tags/0.8.2.tar.gz + URL_HASH SHA256=6ce889d85ecdc2d8fa07408d6787e7352510750daa66b5ad44aacb47bea76755 +) + +FetchContent_MakeAvailable(websocketpp) From 0c1ee7050072782b92a595b7e6620a39da81f0cf Mon Sep 17 00:00:00 2001 From: Roy Shilkrot Date: Thu, 10 Oct 2024 11:05:54 -0400 Subject: [PATCH 11/20] Refactor CMakeLists.txt and stenographer.cpp --- CMakeLists.txt | 3 
++- cmake/FetchWebsocketpp.cmake | 25 ++++++++++++++++--------- cmake/linux/compilerconfig.cmake | 8 ++++---- src/stenographer/stenographer.cpp | 2 -- 4 files changed, 22 insertions(+), 16 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 3cd971d..5c4890c 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -101,7 +101,7 @@ include(cmake/BuildICU.cmake) target_link_libraries(${CMAKE_PROJECT_NAME} PRIVATE ICU) target_include_directories(${CMAKE_PROJECT_NAME} SYSTEM PUBLIC ${ICU_INCLUDE_DIR}) -if(MSVC or APPLE) +if(WIN32 OR APPLE) if(NOT buildspec) file(READ "${CMAKE_CURRENT_SOURCE_DIR}/buildspec.json" buildspec) endif() @@ -124,6 +124,7 @@ if(MSVC or APPLE) else() include(cmake/FetchWebsocketpp.cmake) target_include_directories(${CMAKE_PROJECT_NAME} PRIVATE ${websocketpp_SOURCE_DIR}/) + target_include_directories(${CMAKE_PROJECT_NAME} PRIVATE ${asio_SOURCE_DIR}/) endif() target_sources( diff --git a/cmake/FetchWebsocketpp.cmake b/cmake/FetchWebsocketpp.cmake index 1c60239..24d6451 100644 --- a/cmake/FetchWebsocketpp.cmake +++ b/cmake/FetchWebsocketpp.cmake @@ -1,9 +1,16 @@ -include(FetchContent) - -FetchContent_Declare( - websocketpp - URL https://github.com/zaphoyd/websocketpp/archive/refs/tags/0.8.2.tar.gz - URL_HASH SHA256=6ce889d85ecdc2d8fa07408d6787e7352510750daa66b5ad44aacb47bea76755 -) - -FetchContent_MakeAvailable(websocketpp) +include(FetchContent) + +FetchContent_Declare( + websocketpp + URL https://github.com/zaphoyd/websocketpp/archive/refs/tags/0.8.2.tar.gz + URL_HASH SHA256=6ce889d85ecdc2d8fa07408d6787e7352510750daa66b5ad44aacb47bea76755) + +FetchContent_MakeAvailable(websocketpp) + +# Fetch ASIO +FetchContent_Declare( + asio + URL https://github.com/chriskohlhoff/asio/archive/asio-1-28-0.tar.gz + URL_HASH SHA256=1ef87b17e5e32f1a1b4cd840acac6c2a8d0dcde365dde3f9dcd5d1eae0495290) + +FetchContent_MakeAvailable(websocketpp asio) diff --git a/cmake/linux/compilerconfig.cmake b/cmake/linux/compilerconfig.cmake index 647c4b3..8931ba3 100644 --- 
a/cmake/linux/compilerconfig.cmake +++ b/cmake/linux/compilerconfig.cmake @@ -21,6 +21,7 @@ set(_obs_gcc_c_options -Wformat-security -Wno-conversion -Wno-deprecated-declarations + -Wno-error=conversion -Wno-error=deprecated-declarations -Wno-float-conversion -Wno-implicit-fallthrough @@ -42,14 +43,13 @@ set(_obs_gcc_c_options -Wvla) # gcc options for C++ -set(_obs_gcc_cxx_options - # cmake-format: sortable - ${_obs_gcc_c_options} -Wconversion -Wfloat-conversion -Winvalid-offsetof -Wno-overloaded-virtual) +set(_obs_gcc_cxx_options # cmake-format: sortable + ${_obs_gcc_c_options} -Winvalid-offsetof -Wno-overloaded-virtual) add_compile_options( -fopenmp-simd "$<$:${_obs_gcc_c_options}>" - "$<$:-Wint-conversion;-Wno-missing-prototypes;-Wno-strict-prototypes;-Wpointer-sign>" + "$<$:-Wno-missing-prototypes;-Wno-strict-prototypes;-Wpointer-sign>" "$<$:${_obs_gcc_cxx_options}>" "$<$:${_obs_clang_c_options}>" "$<$:${_obs_clang_cxx_options}>") diff --git a/src/stenographer/stenographer.cpp b/src/stenographer/stenographer.cpp index c11ac80..f5a57a9 100644 --- a/src/stenographer/stenographer.cpp +++ b/src/stenographer/stenographer.cpp @@ -8,9 +8,7 @@ #define ASIO_STANDALONE #define _WEBSOCKETPP_CPP11_TYPE_TRAITS_ -#ifndef __linux__ #include -#endif #include #include #include From 6ccc44b9f95ccb8d029f4cde7d5500e0e4fab240 Mon Sep 17 00:00:00 2001 From: Roy Shilkrot Date: Thu, 10 Oct 2024 11:08:29 -0400 Subject: [PATCH 12/20] Update asio URL_HASH in FetchWebsocketpp.cmake --- cmake/FetchWebsocketpp.cmake | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cmake/FetchWebsocketpp.cmake b/cmake/FetchWebsocketpp.cmake index 24d6451..1cb63d7 100644 --- a/cmake/FetchWebsocketpp.cmake +++ b/cmake/FetchWebsocketpp.cmake @@ -11,6 +11,6 @@ FetchContent_MakeAvailable(websocketpp) FetchContent_Declare( asio URL https://github.com/chriskohlhoff/asio/archive/asio-1-28-0.tar.gz - URL_HASH SHA256=1ef87b17e5e32f1a1b4cd840acac6c2a8d0dcde365dde3f9dcd5d1eae0495290) + URL_HASH 
SHA256=226438b0798099ad2a202563a83571ce06dd13b570d8fded4840dbc1f97fa328) FetchContent_MakeAvailable(websocketpp asio) From 283bf341675dbff9f49f43d4a51e172fba749be1 Mon Sep 17 00:00:00 2001 From: Roy Shilkrot Date: Thu, 10 Oct 2024 11:10:12 -0400 Subject: [PATCH 13/20] Refactor CMakeLists.txt to include the correct path for asio library --- CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 5c4890c..f86be5b 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -124,7 +124,7 @@ if(WIN32 OR APPLE) else() include(cmake/FetchWebsocketpp.cmake) target_include_directories(${CMAKE_PROJECT_NAME} PRIVATE ${websocketpp_SOURCE_DIR}/) - target_include_directories(${CMAKE_PROJECT_NAME} PRIVATE ${asio_SOURCE_DIR}/) + target_include_directories(${CMAKE_PROJECT_NAME} PRIVATE ${asio_SOURCE_DIR}/asio/include/) endif() target_sources( From f5dc4c89d705f93a99ef1cbbca521e060bad1692 Mon Sep 17 00:00:00 2001 From: Roy Shilkrot Date: Thu, 10 Oct 2024 11:12:40 -0400 Subject: [PATCH 14/20] Refactor WebSocket server initialization and communication in stenographer.cpp --- src/stenographer/stenographer.cpp | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/src/stenographer/stenographer.cpp b/src/stenographer/stenographer.cpp index f5a57a9..95550eb 100644 --- a/src/stenographer/stenographer.cpp +++ b/src/stenographer/stenographer.cpp @@ -45,14 +45,14 @@ class TranscriptionHandler::Impl { messageCallback(callback), running(false) { - server.init_asio(); + wsServer.init_asio(); - server.set_open_handler([this](websocketpp::connection_hdl hdl) { + wsServer.set_open_handler([this](websocketpp::connection_hdl hdl) { std::lock_guard lock(mutex); connection = hdl; }); - server.set_message_handler( + wsServer.set_message_handler( [this](websocketpp::connection_hdl hdl, server::message_ptr msg) { UNUSED_PARAMETER(hdl); handleIncomingMessage(msg->get_payload()); @@ -69,9 +69,9 @@ class 
TranscriptionHandler::Impl { if (!running) { running = true; serverThread = std::async(std::launch::async, [this]() { - server.listen(9002); - server.start_accept(); - server.run(); + wsServer.listen(9002); + wsServer.start_accept(); + wsServer.run(); }); processingThread = @@ -83,7 +83,7 @@ class TranscriptionHandler::Impl { { if (running) { running = false; - server.stop(); + wsServer.stop(); if (serverThread.valid()) serverThread.wait(); if (processingThread.valid()) @@ -93,7 +93,7 @@ class TranscriptionHandler::Impl { private: transcription_filter_data *gf; - server server; + server wsServer; websocketpp::connection_hdl connection; MessageCallback messageCallback; std::queue> audioQueue; @@ -149,8 +149,8 @@ class TranscriptionHandler::Impl { start_timestamp_offset_ns}, {"end_timestamp", end_timestamp_offset_ns}}; if (connection.lock()) { - server.send(connection, timestampInfo.dump(), - websocketpp::frame::opcode::text); + wsServer.send(connection, timestampInfo.dump(), + websocketpp::frame::opcode::text); } sendAudioData(pcmData); } else { @@ -192,8 +192,8 @@ class TranscriptionHandler::Impl { std::memcpy(wavData.data() + sizeof(WAVHeader), audioBuffer.data(), wavHeader.data_size); - server.send(connection, wavData.data(), wavData.size(), - websocketpp::frame::opcode::binary); + wsServer.send(connection, wavData.data(), wavData.size(), + websocketpp::frame::opcode::binary); audioBuffer.clear(); } From e979fca72f2c9ee4d3e4a52e60a853f8906ddb73 Mon Sep 17 00:00:00 2001 From: Roy Shilkrot Date: Thu, 10 Oct 2024 14:58:42 -0400 Subject: [PATCH 15/20] Refactor stenographer-util.h and stenographer-util.cpp --- CMakeLists.txt | 3 +- src/stenographer/stenographer-util.cpp | 67 ++++++++++++++++++++ src/stenographer/stenographer-util.h | 10 +++ src/stenographer/stenographer_interface.html | 24 ++++--- src/transcription-filter-callbacks.cpp | 2 - src/transcription-filter-properties.cpp | 4 +- src/transcription-filter.cpp | 46 +------------- src/transcription-utils.h | 28 
++++++++ 8 files changed, 126 insertions(+), 58 deletions(-) create mode 100644 src/stenographer/stenographer-util.cpp create mode 100644 src/stenographer/stenographer-util.h diff --git a/CMakeLists.txt b/CMakeLists.txt index f86be5b..ab3efb3 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -153,7 +153,8 @@ target_sources( src/ui/filter-replace-utils.cpp src/translation/translation-language-utils.cpp src/ui/filter-replace-dialog.cpp - src/stenographer/stenographer.cpp) + src/stenographer/stenographer.cpp + src/stenographer/stenographer-util.cpp) set_target_properties_plugin(${CMAKE_PROJECT_NAME} PROPERTIES OUTPUT_NAME ${_name}) diff --git a/src/stenographer/stenographer-util.cpp b/src/stenographer/stenographer-util.cpp new file mode 100644 index 0000000..59278cb --- /dev/null +++ b/src/stenographer/stenographer-util.cpp @@ -0,0 +1,67 @@ + +#include "stenographer-util.h" +#include "transcription-filter-data.h" +#include "transcription-utils.h" + +#include + +#include +#include + +/** + * @brief Applies a simple delay to the audio data for stenographer mode. + * + * This function stores the incoming audio data in a buffer and processes it after a specified delay. + * The delayed audio data is then emitted, replacing the original audio data in the buffer. + * If the buffer does not yet contain enough data to satisfy the delay, the audio buffer is filled with silence. + * + * @param gf Pointer to the transcription filter data structure containing the delay buffer and configuration. + * @param audio Pointer to the audio data structure containing the audio frames to be processed. + * @return Pointer to the processed audio data structure with the applied delay. + */ +struct obs_audio_data *stenographer_simple_delay(transcription_filter_data *gf, + struct obs_audio_data *audio) +{ + // Stenographer mode - apply delay. + // Store the audio data in a buffer and process it after the delay. 
+ // push the data to the back of gf->stenographer_delay_buffers + for (size_t c = 0; c < gf->channels; c++) { + // take an audio->frames * sizeof(float) bytes chunk from audio->data[c] and push it + // to the back of the buffer as a float + std::vector audio_data_chunk((float *)audio->data[c], + ((float *)audio->data[c]) + audio->frames); + gf->stenographer_delay_buffers[c].insert(gf->stenographer_delay_buffers[c].end(), + audio_data_chunk.begin(), + audio_data_chunk.end()); + } + + // If the buffer is larger than the delay, emit the oldest data + // Take from the buffer as much as requested by the incoming audio data + size_t delay_frames = + (size_t)((float)gf->sample_rate * (float)gf->stenographer_delay_ms / 1000.0f) + + audio->frames; + + if (gf->stenographer_delay_buffers[0].size() >= delay_frames) { + // Replace data on the audio buffer with the delayed data + for (size_t c = 0; c < gf->channels; c++) { + // take exactly audio->frames from the buffer + std::vector audio_data(gf->stenographer_delay_buffers[c].begin(), + gf->stenographer_delay_buffers[c].begin() + + audio->frames); + // remove the oldest buffers from the delay buffer + gf->stenographer_delay_buffers[c].erase( + gf->stenographer_delay_buffers[c].begin(), + gf->stenographer_delay_buffers[c].begin() + audio->frames); + + // replace the data on the audio buffer with the delayed data + memcpy(audio->data[c], audio_data.data(), + audio_data.size() * sizeof(float)); + } + } else { + // Fill the audio buffer with silence + for (size_t c = 0; c < gf->channels; c++) { + memset(audio->data[c], 0, audio->frames * sizeof(float)); + } + } + return audio; +} diff --git a/src/stenographer/stenographer-util.h b/src/stenographer/stenographer-util.h new file mode 100644 index 0000000..3d0fd27 --- /dev/null +++ b/src/stenographer/stenographer-util.h @@ -0,0 +1,10 @@ +#ifndef STENOGRAPHER_UTIL_H +#define STENOGRAPHER_UTIL_H + +struct transcription_filter_data; +struct obs_audio_data; + +struct obs_audio_data
*stenographer_simple_delay(transcription_filter_data *gf, + struct obs_audio_data *audio); + +#endif /* STENOGRAPHER_UTIL_H */ \ No newline at end of file diff --git a/src/stenographer/stenographer_interface.html b/src/stenographer/stenographer_interface.html index 0b5eea8..c6ca386 100644 --- a/src/stenographer/stenographer_interface.html +++ b/src/stenographer/stenographer_interface.html @@ -15,14 +15,18 @@

Stenographer Interface

- +
+ + +
- +
Timestamp (s):
+
Connection Status: Disconnected
Audio Status: Not started