diff --git a/data/locale/ar-SA.ini b/data/locale/ar-SA.ini index 5c08e38..770d659 100644 --- a/data/locale/ar-SA.ini +++ b/data/locale/ar-SA.ini @@ -51,3 +51,4 @@ translate_add_context="الترجمة مع السياق" whisper_translate="ترجمة إلى الإنجليزية (Whisper)" buffer_size_msec="حجم الذاكرة المؤقتة (ملي ثانية)" overlap_size_msec="حجم التداخل (ملي ثانية)" +buffer_output_type="نوع مخرجات الذاكرة المؤقتة" diff --git a/data/locale/de-DE.ini b/data/locale/de-DE.ini index df2e450..d171104 100644 --- a/data/locale/de-DE.ini +++ b/data/locale/de-DE.ini @@ -51,3 +51,4 @@ translate_add_context="Mit Kontext übersetzen" whisper_translate="Ins Englische übersetzen (Flüstern)" buffer_size_msec="Puffergröße (ms)" overlap_size_msec="Überlappungsgröße (ms)" +buffer_output_type="Pufferausgabetyp" diff --git a/data/locale/en-US.ini b/data/locale/en-US.ini index 03efa24..9db3e12 100644 --- a/data/locale/en-US.ini +++ b/data/locale/en-US.ini @@ -70,4 +70,5 @@ translation_no_repeat_ngram_size="No-repeat ngram size" translation_max_input_length="Max input length" buffered_output_parameters="Buffered output parameters" buffer_num_lines="Number of lines" -buffer_num_chars_per_line="Characters per line" +buffer_num_chars_per_line="Amount per line" +buffer_output_type="Output type" diff --git a/data/locale/es-ES.ini b/data/locale/es-ES.ini index 59dc350..120677d 100644 --- a/data/locale/es-ES.ini +++ b/data/locale/es-ES.ini @@ -51,3 +51,4 @@ translate_add_context="Traducir con contexto" whisper_translate="Traducir al inglés (Whisper)" buffer_size_msec="Tamaño del búfer (ms)" overlap_size_msec="Tamaño de superposición (ms)" +buffer_output_type="Tipo de salida de búfer" diff --git a/data/locale/fr-FR.ini b/data/locale/fr-FR.ini index 0b6cd0c..3dbbd88 100644 --- a/data/locale/fr-FR.ini +++ b/data/locale/fr-FR.ini @@ -51,3 +51,4 @@ translate_add_context="Traduire avec contexte" whisper_translate="Traduire en anglais (Whisper)" buffer_size_msec="Taille du tampon (ms)" overlap_size_msec="Taille de 
chevauchement (ms)" +buffer_output_type="Type de sortie du tampon" diff --git a/data/locale/hi-IN.ini b/data/locale/hi-IN.ini index a7b2d26..d265128 100644 --- a/data/locale/hi-IN.ini +++ b/data/locale/hi-IN.ini @@ -51,3 +51,4 @@ translate_add_context="संदर्भ के साथ अनुवाद whisper_translate="अंग्रेजी में अनुवाद करें (व्हिस्पर)" buffer_size_msec="बफ़र आकार (ms)" overlap_size_msec="ओवरलैप आकार (ms)" +buffer_output_type="बफ़र आउटपुट प्रकार" diff --git a/data/locale/ja-JP.ini b/data/locale/ja-JP.ini index 18445bc..7887bfc 100644 --- a/data/locale/ja-JP.ini +++ b/data/locale/ja-JP.ini @@ -51,3 +51,4 @@ translate_add_context="コンテキスト付きで翻訳" whisper_translate="英語に翻訳(ウィスパー)" buffer_size_msec="バッファサイズ(ms)" overlap_size_msec="オーバーラップサイズ(ms)" +buffer_output_type="バッファ出力タイプ" diff --git a/data/locale/ko-KR.ini b/data/locale/ko-KR.ini index 8dbe564..60cd002 100644 --- a/data/locale/ko-KR.ini +++ b/data/locale/ko-KR.ini @@ -51,3 +51,4 @@ translate_add_context="컨텍스트와 함께 번역" whisper_translate="영어로 번역 (속삭임)" buffer_size_msec="버퍼 크기 (ms)" overlap_size_msec="오버랩 크기 (ms)" +buffer_output_type="버퍼 출력 유형" diff --git a/data/locale/pl-PL.ini b/data/locale/pl-PL.ini index 64ee55b..752b787 100644 --- a/data/locale/pl-PL.ini +++ b/data/locale/pl-PL.ini @@ -51,3 +51,4 @@ translate_add_context="Tłumacz z kontekstem" whisper_translate="Tłumacz na angielski (Whisper)" buffer_size_msec="Rozmiar bufora (ms)" overlap_size_msec="Rozmiar nakładki (ms)" +buffer_output_type="Typ wyjścia bufora" diff --git a/data/locale/pt-BR.ini b/data/locale/pt-BR.ini index 2f0a27e..cd9360c 100644 --- a/data/locale/pt-BR.ini +++ b/data/locale/pt-BR.ini @@ -51,3 +51,4 @@ translate_add_context="Traduzir com contexto" whisper_translate="Traduzir para inglês (Whisper)" buffer_size_msec="Tamanho do buffer (ms)" overlap_size_msec="Tamanho da sobreposição (ms)" +buffer_output_type="Tipo de saída do buffer" diff --git a/data/locale/ru-RU.ini b/data/locale/ru-RU.ini index 23090b6..543200d 100644 --- a/data/locale/ru-RU.ini +++ 
b/data/locale/ru-RU.ini @@ -50,3 +50,4 @@ translate_add_context="Перевести с контекстом" whisper_translate="Перевести на английский (Whisper)" buffer_size_msec="Размер буфера (мс)" overlap_size_msec="Размер перекрытия (мс)" +buffer_output_type="Тип выходных данных буфера" diff --git a/data/locale/zh-CN.ini b/data/locale/zh-CN.ini index a561f35..48e8dfa 100644 --- a/data/locale/zh-CN.ini +++ b/data/locale/zh-CN.ini @@ -51,3 +51,4 @@ translate_add_context="带上下文翻译" whisper_translate="翻译为英语(Whisper)" buffer_size_msec="缓冲区大小(毫秒)" overlap_size_msec="重叠大小(毫秒)" +buffer_output_type="缓冲区输出类型" diff --git a/src/transcription-filter-data.h b/src/transcription-filter-data.h index 9cb26c0..c16f9cd 100644 --- a/src/transcription-filter-data.h +++ b/src/transcription-filter-data.h @@ -108,6 +108,8 @@ struct transcription_filter_data { TokenBufferThread captions_monitor; int buffered_output_num_lines = 2; int buffered_output_num_chars = 30; + TokenBufferSegmentation buffered_output_output_type = + TokenBufferSegmentation::SEGMENTATION_TOKEN; // ctor transcription_filter_data() : whisper_buf_mutex(), whisper_ctx_mutex(), wshiper_thread_cv() diff --git a/src/transcription-filter.cpp b/src/transcription-filter.cpp index e03e819..e62e5b9 100644 --- a/src/transcription-filter.cpp +++ b/src/transcription-filter.cpp @@ -167,9 +167,9 @@ void transcription_filter_destroy(void *data) void transcription_filter_update(void *data, obs_data_t *s) { - obs_log(LOG_INFO, "LocalVocal filter update"); struct transcription_filter_data *gf = static_cast(data); + obs_log(gf->log_level, "LocalVocal filter update"); gf->log_level = (int)obs_data_get_int(s, "log_level"); gf->vad_enabled = obs_data_get_bool(s, "vad_enabled"); @@ -188,11 +188,13 @@ void transcription_filter_update(void *data, obs_data_t *s) bool new_buffered_output = obs_data_get_bool(s, "buffered_output"); int new_buffer_num_lines = (int)obs_data_get_int(s, "buffer_num_lines"); int new_buffer_num_chars_per_line = (int)obs_data_get_int(s, 
"buffer_num_chars_per_line"); + TokenBufferSegmentation new_buffer_output_type = + (TokenBufferSegmentation)obs_data_get_int(s, "buffer_output_type"); if (new_buffered_output) { - obs_log(LOG_INFO, "buffered_output enable"); + obs_log(gf->log_level, "buffered_output enable"); if (!gf->buffered_output || !gf->captions_monitor.isEnabled()) { - obs_log(LOG_INFO, "buffered_output currently disabled, enabling"); + obs_log(gf->log_level, "buffered_output currently disabled, enabling"); gf->buffered_output = true; gf->captions_monitor.initialize( gf, @@ -203,18 +205,23 @@ void transcription_filter_update(void *data, obs_data_t *s) } }, new_buffer_num_lines, new_buffer_num_chars_per_line, - std::chrono::seconds(10)); + std::chrono::seconds(3), new_buffer_output_type); } else { if (new_buffer_num_lines != gf->buffered_output_num_lines || - new_buffer_num_chars_per_line != gf->buffered_output_num_chars) { - obs_log(LOG_INFO, "buffered_output parameters changed, updating"); + new_buffer_num_chars_per_line != gf->buffered_output_num_chars || + new_buffer_output_type != gf->buffered_output_output_type) { + obs_log(gf->log_level, + "buffered_output parameters changed, updating"); + gf->captions_monitor.clear(); gf->captions_monitor.setNumSentences(new_buffer_num_lines); gf->captions_monitor.setNumPerSentence( new_buffer_num_chars_per_line); - gf->buffered_output_num_lines = new_buffer_num_lines; - gf->buffered_output_num_chars = new_buffer_num_chars_per_line; + gf->captions_monitor.setSegmentation(new_buffer_output_type); } } + gf->buffered_output_num_lines = new_buffer_num_lines; + gf->buffered_output_num_chars = new_buffer_num_chars_per_line; + gf->buffered_output_output_type = new_buffer_output_type; } else { obs_log(gf->log_level, "buffered_output disable"); if (gf->buffered_output) { @@ -349,13 +356,23 @@ void transcription_filter_update(void *data, obs_data_t *s) } } - if (gf->initial_creation && gf->context != nullptr && obs_source_enabled(gf->context)) { - 
obs_log(LOG_INFO, "Initial filter creation and source enabled"); + if (gf->context != nullptr && obs_source_enabled(gf->context)) { + if (gf->initial_creation) { + obs_log(LOG_INFO, "Initial filter creation and source enabled"); - // source was enabled on creation - update_whisper_model(gf); - gf->active = true; - gf->initial_creation = false; + // source was enabled on creation + update_whisper_model(gf); + gf->active = true; + gf->initial_creation = false; + } else { + // check if the whisper model selection has changed + const std::string new_model_path = + obs_data_get_string(s, "whisper_model_path"); + if (gf->whisper_model_path != new_model_path) { + obs_log(LOG_INFO, "New model selected: %s", new_model_path.c_str()); + update_whisper_model(gf); + } + } } } @@ -506,9 +523,11 @@ void transcription_filter_defaults(obs_data_t *s) obs_data_set_default_bool(s, "buffered_output", false); obs_data_set_default_int(s, "buffer_num_lines", 2); obs_data_set_default_int(s, "buffer_num_chars_per_line", 30); + obs_data_set_default_int(s, "buffer_output_type", + (int)TokenBufferSegmentation::SEGMENTATION_TOKEN); obs_data_set_default_bool(s, "vad_enabled", true); - obs_data_set_default_double(s, "vad_threshold", 0.5); + obs_data_set_default_double(s, "vad_threshold", 0.65); obs_data_set_default_int(s, "log_level", LOG_DEBUG); obs_data_set_default_bool(s, "log_words", false); obs_data_set_default_bool(s, "caption_to_stream", false); @@ -669,6 +688,16 @@ obs_properties_t *transcription_filter_properties(void *data) return true; }); + // Add language selector + obs_property_t *whisper_language_select_list = + obs_properties_add_list(ppts, "whisper_language_select", MT_("language"), + OBS_COMBO_TYPE_LIST, OBS_COMBO_FORMAT_STRING); + // iterate over all available languages and add them to the list + for (auto const &pair : whisper_available_lang_reverse) { + obs_property_list_add_string(whisper_language_select_list, pair.first.c_str(), + pair.second.c_str()); + } + // add 
translation option group obs_properties_t *translation_group = obs_properties_create(); obs_property_t *translation_group_prop = obs_properties_add_group( @@ -806,7 +835,8 @@ obs_properties_t *transcription_filter_properties(void *data) {"whisper_params_group", "log_words", "caption_to_stream", "buffer_size_msec", "overlap_size_msec", "step_by_step_processing", "min_sub_duration", "process_while_muted", "buffered_output", "vad_enabled", "log_level", - "suppress_sentences", "sentence_psum_accept_thresh", "vad_threshold"}) { + "suppress_sentences", "sentence_psum_accept_thresh", "vad_threshold", + "buffered_output_group"}) { obs_property_set_visible(obs_properties_get(props, prop_name.c_str()), show_hide); } @@ -820,6 +850,12 @@ obs_properties_t *transcription_filter_properties(void *data) obs_properties_t *buffered_output_group = obs_properties_create(); obs_properties_add_group(ppts, "buffered_output_group", MT_("buffered_output_parameters"), OBS_GROUP_NORMAL, buffered_output_group); + // add buffer "type" character or word + obs_property_t *buffer_type_list = obs_properties_add_list( + buffered_output_group, "buffer_output_type", MT_("buffer_output_type"), + OBS_COMBO_TYPE_LIST, OBS_COMBO_FORMAT_INT); + obs_property_list_add_int(buffer_type_list, "Character", SEGMENTATION_TOKEN); + obs_property_list_add_int(buffer_type_list, "Word", SEGMENTATION_WORD); // add buffer lines parameter obs_properties_add_int_slider(buffered_output_group, "buffer_num_lines", MT_("buffer_num_lines"), 1, 5, 1); @@ -868,16 +904,6 @@ obs_properties_t *transcription_filter_properties(void *data) obs_properties_add_group(ppts, "whisper_params_group", MT_("whisper_parameters"), OBS_GROUP_NORMAL, whisper_params_group); - // Add language selector - obs_property_t *whisper_language_select_list = obs_properties_add_list( - whisper_params_group, "whisper_language_select", MT_("language"), - OBS_COMBO_TYPE_LIST, OBS_COMBO_FORMAT_STRING); - // iterate over all available languages and add them to the 
list - for (auto const &pair : whisper_available_lang_reverse) { - obs_property_list_add_string(whisper_language_select_list, pair.first.c_str(), - pair.second.c_str()); - } - obs_property_t *whisper_sampling_method_list = obs_properties_add_list( whisper_params_group, "whisper_sampling_method", MT_("whisper_sampling_method"), OBS_COMBO_TYPE_LIST, OBS_COMBO_FORMAT_INT); diff --git a/src/transcription-utils.h b/src/transcription-utils.h index 4e7f39c..e81b909 100644 --- a/src/transcription-utils.h +++ b/src/transcription-utils.h @@ -4,6 +4,8 @@ #include #include #include +#include +#include // Fix UTF8 string for Windows std::string fix_utf8(const std::string &str); @@ -25,4 +27,18 @@ inline uint64_t now_ms() // Split a string into words based on spaces std::vector split_words(const std::string &str_copy); +// trim (strip) string from leading and trailing whitespaces +template StringLike trim(const StringLike &str) +{ + StringLike str_copy = str; + str_copy.erase(str_copy.begin(), + std::find_if(str_copy.begin(), str_copy.end(), + [](unsigned char ch) { return !std::isspace(ch); })); + str_copy.erase(std::find_if(str_copy.rbegin(), str_copy.rend(), + [](unsigned char ch) { return !std::isspace(ch); }) + .base(), + str_copy.end()); + return str_copy; +} + #endif // TRANSCRIPTION_UTILS_H diff --git a/src/whisper-utils/token-buffer-thread.cpp b/src/whisper-utils/token-buffer-thread.cpp index e88d76c..b2a4093 100644 --- a/src/whisper-utils/token-buffer-thread.cpp +++ b/src/whisper-utils/token-buffer-thread.cpp @@ -4,6 +4,7 @@ #include "token-buffer-thread.h" #include "whisper-utils.h" +#include "transcription-utils.h" #include @@ -18,12 +19,13 @@ TokenBufferThread::TokenBufferThread() noexcept : gf(nullptr), - numSentences(1), - numPerSentence(1), + numSentences(2), + numPerSentence(30), maxTime(0), stop(true), presentationQueueMutex(), - inputQueueMutex() + inputQueueMutex(), + segmentation(SEGMENTATION_TOKEN) { } @@ -110,6 +112,8 @@ void TokenBufferThread::clear() 
std::lock_guard lock(presentationQueueMutex); presentationQueue.clear(); } + this->lastCaption = ""; + this->lastCaptionTime = std::chrono::steady_clock::now(); this->callback(""); } @@ -134,6 +138,13 @@ void TokenBufferThread::monitor() for (size_t i = 0; i < this->numPerSentence; i++) { presentationQueue.pop_front(); } + if (this->segmentation == SEGMENTATION_TOKEN) { + // pop tokens until a space is found + while (!presentationQueue.empty() && + presentationQueue.front() != SPACE) { + presentationQueue.pop_front(); + } + } } { @@ -146,10 +157,23 @@ void TokenBufferThread::monitor() for (const auto &token : inputQueue) { presentationQueue.push_back(token); } - } else { + } else if (this->segmentation == SEGMENTATION_TOKEN) { // add one token to the presentation queue presentationQueue.push_back(inputQueue.front()); inputQueue.pop_front(); + } else { + // skip leading spaces; stop if that drains the input queue + while (!inputQueue.empty() && inputQueue.front() == SPACE) { + inputQueue.pop_front(); + } + // collect one word: tokens up to the next space or the end of the queue + TokenBufferString word; + while (!inputQueue.empty() && inputQueue.front() != SPACE) { + word += inputQueue.front(); + inputQueue.pop_front(); + } + if (!word.empty()) + presentationQueue.push_back(word); } } } @@ -158,49 +182,64 @@ void TokenBufferThread::monitor() // build a caption from the presentation queue in sentences // with a maximum of numPerSentence tokens/words per sentence // and a newline between sentences - TokenBufferString caption; + std::vector<TokenBufferString> sentences(1); + if (this->segmentation == SEGMENTATION_WORD) { - // iterate through the presentation queue tokens and make words (based on spaces) - // then build a caption with a maximum of numPerSentence words per sentence + // add words from the presentation queue to the sentences + // if a sentence is full - start a new one size_t wordsInSentence = 0; - TokenBufferString word; - for (const auto &token : presentationQueue) { - // keep adding tokens to the word until a space is found - word += token; - if 
(word.find(SPACE) != TokenBufferString::npos) { - // cut the word at the space and add it to the caption - caption += word.substr(0, word.find(SPACE)); - wordsInSentence++; - // keep the rest of the word for the next iteration - word = word.substr(word.find(SPACE) + 1); - - if (wordsInSentence == - this->numPerSentence) { - caption += word; - caption += SPACE; - wordsInSentence = 0; - word.clear(); - } + for (size_t i = 0; i < presentationQueue.size(); i++) { + const auto &word = presentationQueue[i]; + sentences.back() += word + SPACE; + if (++wordsInSentence == this->numPerSentence) { + wordsInSentence = 0; // restart the count for the new sentence + sentences.push_back(TokenBufferString()); } } } else { // iterate through the presentation queue tokens and build a caption - size_t tokensInSentence = 0; - for (const auto &token : presentationQueue) { + for (size_t i = 0; i < presentationQueue.size(); i++) { + const auto &token = presentationQueue[i]; // skip spaces in the beginning of a sentence (tokensInSentence == 0) - if (token == SPACE && tokensInSentence == 0) { + if (token == SPACE && + sentences.back().length() == 0) { continue; } - caption += token; - tokensInSentence++; - if (tokensInSentence == this->numPerSentence) { - caption += NEWLINE; - tokensInSentence = 0; + sentences.back() += token; + if (sentences.back().length() == + this->numPerSentence) { + // the line is full: roll back to the last space and carry the + // broken word over; with no space at all, hard-break the line + const size_t lastSpace = + sentences.back().find_last_of(SPACE); + TokenBufferString carry; + if (lastSpace != TokenBufferString::npos) + carry = sentences.back().substr(lastSpace + 1); + sentences.back() = + sentences.back().substr(0, lastSpace); + sentences.push_back(carry); } } } + TokenBufferString caption; + // if there are more sentences than numSentences - remove the oldest ones + while (sentences.size() > this->numSentences) { + sentences.erase(sentences.begin()); + } + // if there are fewer sentences than numSentences - add empty sentences + while (sentences.size() < 
this->numSentences) { + sentences.push_back(TokenBufferString()); + } + // build the caption from the sentences + for (const auto &sentence : sentences) { + if (!sentence.empty()) { + caption += trim(sentence); + } + caption += NEWLINE; + } + #ifdef _WIN32 // convert caption to multibyte for obs int count = WideCharToMultiByte(CP_UTF8, 0, caption.c_str(), @@ -222,19 +261,59 @@ void TokenBufferThread::monitor() if (caption_out.empty()) { // if no caption was built, sleep for a while + this->lastCaption = ""; + this->lastCaptionTime = std::chrono::steady_clock::now(); std::this_thread::sleep_for(std::chrono::milliseconds(100)); continue; } - // emit the caption - this->callback(caption_out); + if (caption_out == lastCaption) { + // if it has been max_time since the last caption - clear the presentation queue + if (this->maxTime.count() > 0) { + auto now = std::chrono::steady_clock::now(); + auto duration = std::chrono::duration_cast( + now - this->lastCaptionTime); + if (duration > this->maxTime) { + this->clear(); + } + } + } else { + // emit the caption + this->callback(caption_out); + this->lastCaption = caption_out; + this->lastCaptionTime = std::chrono::steady_clock::now(); + } // check the input queue size (iqs), if it's big - sleep less - std::this_thread::sleep_for(std::chrono::milliseconds(inputQueue.size() > 30 ? 33 - : inputQueue.size() > 15 - ? 66 - : 100)); + std::this_thread::sleep_for(std::chrono::milliseconds( + inputQueue.size() > 30 ? getWaitTime(SPEED_FAST) + : inputQueue.size() > 15 ? 
getWaitTime(SPEED_NORMAL) + : getWaitTime(SPEED_SLOW))); } obs_log(LOG_INFO, "TokenBufferThread::monitor: done"); } + +int TokenBufferThread::getWaitTime(TokenBufferSpeed speed) const +{ + if (this->segmentation == SEGMENTATION_WORD) { + switch (speed) { + case SPEED_SLOW: + return 200; + case SPEED_NORMAL: + return 150; + case SPEED_FAST: + return 100; + } + } else if (this->segmentation == SEGMENTATION_TOKEN) { + switch (speed) { + case SPEED_SLOW: + return 100; + case SPEED_NORMAL: + return 66; + case SPEED_FAST: + return 33; + } + } + return 1000; +} diff --git a/src/whisper-utils/token-buffer-thread.h b/src/whisper-utils/token-buffer-thread.h index 0dbe14e..a27b244 100644 --- a/src/whisper-utils/token-buffer-thread.h +++ b/src/whisper-utils/token-buffer-thread.h @@ -23,6 +23,7 @@ typedef std::string TokenBufferString; struct transcription_filter_data; enum TokenBufferSegmentation { SEGMENTATION_WORD = 0, SEGMENTATION_TOKEN, SEGMENTATION_SENTENCE }; +enum TokenBufferSpeed { SPEED_SLOW = 0, SPEED_NORMAL, SPEED_FAST }; class TokenBufferThread { public: @@ -43,10 +44,16 @@ class TokenBufferThread { void setNumSentences(size_t numSentences_) { numSentences = numSentences_; } void setNumPerSentence(size_t numPerSentence_) { numPerSentence = numPerSentence_; } + void setMaxTime(std::chrono::seconds maxTime_) { maxTime = maxTime_; } + void setSegmentation(TokenBufferSegmentation segmentation_) + { + segmentation = segmentation_; + } private: void monitor(); void log_token_vector(const std::vector &tokens); + int getWaitTime(TokenBufferSpeed speed) const; struct transcription_filter_data *gf; std::deque inputQueue; std::deque presentationQueue; @@ -61,6 +68,9 @@ class TokenBufferThread { size_t numSentences; size_t numPerSentence; TokenBufferSegmentation segmentation; + // timestamp of the last caption + std::chrono::time_point lastCaptionTime; + std::string lastCaption; }; #endif