Skip to content

Commit

Permalink
English language selection by model (#108)
Browse files Browse the repository at this point in the history
* refactor: Improve remove_leading_trailing_nonalpha function in transcription-utils.cpp

* refactor: Set whisper language to English in transcription filter properties
  • Loading branch information
royshil authored Jun 11, 2024
1 parent ecb3dfc commit 845c1a8
Show file tree
Hide file tree
Showing 6 changed files with 449 additions and 27 deletions.
14 changes: 14 additions & 0 deletions src/transcription-filter-utils.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -53,3 +53,17 @@ void create_obs_text_source()
}
obs_source_release(scene_as_source);
}

bool add_sources_to_list(void *list_property, obs_source_t *source)
{
auto source_id = obs_source_get_id(source);
if (strcmp(source_id, "text_ft2_source_v2") != 0 &&
strcmp(source_id, "text_gdiplus_v2") != 0) {
return true;
}

obs_property_t *sources = (obs_property_t *)list_property;
const char *name = obs_source_get_name(source);
obs_property_list_add_string(sources, name, name);
return true;
}
3 changes: 3 additions & 0 deletions src/transcription-filter-utils.h
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
#define TRANSCRIPTION_FILTER_UTILS_H

#include <media-io/audio-io.h>
#include <obs.h>

// Convert channels number to a speaker layout
inline enum speaker_layout convert_speaker_layout(uint8_t channels)
Expand Down Expand Up @@ -30,4 +31,6 @@ inline enum speaker_layout convert_speaker_layout(uint8_t channels)

void create_obs_text_source();

// If `source` is a text source (id "text_ft2_source_v2" or "text_gdiplus_v2"),
// adds its name to the list property passed as a void pointer in
// `list_property` (an obs_property_t*). Always returns true.
bool add_sources_to_list(void *list_property, obs_source_t *source);

#endif // TRANSCRIPTION_FILTER_UTILS_H
49 changes: 23 additions & 26 deletions src/transcription-filter.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -30,20 +30,6 @@
#include "translation/translation.h"
#include "translation/translation-includes.h"

bool add_sources_to_list(void *list_property, obs_source_t *source)
{
auto source_id = obs_source_get_id(source);
if (strcmp(source_id, "text_ft2_source_v2") != 0 &&
strcmp(source_id, "text_gdiplus_v2") != 0) {
return true;
}

obs_property_t *sources = (obs_property_t *)list_property;
const char *name = obs_source_get_name(source);
obs_property_list_add_string(sources, name, name);
return true;
}

void set_source_signals(transcription_filter_data *gf, obs_source_t *parent_source)
{
signal_handler_t *sh = obs_source_get_signal_handler(parent_source);
Expand Down Expand Up @@ -618,6 +604,26 @@ obs_properties_t *transcription_filter_properties(void *data)
} else {
obs_property_set_visible(
obs_properties_get(props, "whisper_model_path_external"), false);
const std::string model_name = new_model_path;
// if the model is english-only -> hide all the languages but english
const bool is_english_only =
(model_name.find("English") != std::string::npos);
// clear the language selection list ("whisper_language_select")
obs_property_t *prop_lang =
obs_properties_get(props, "whisper_language_select");
obs_property_list_clear(prop_lang);
if (is_english_only) {
// add only the english language
obs_property_list_add_string(prop_lang, "English", "en");
// set the language to english
obs_data_set_string(settings, "whisper_language_select", "en");
} else {
// add all the languages
for (const auto &lang : whisper_available_lang) {
obs_property_list_add_string(prop_lang, lang.second.c_str(),
lang.first.c_str());
}
}
}
return true;
});
Expand Down Expand Up @@ -759,7 +765,7 @@ obs_properties_t *transcription_filter_properties(void *data)
{"whisper_params_group", "log_words", "caption_to_stream", "buffer_size_msec",
"overlap_size_msec", "step_by_step_processing", "min_sub_duration",
"process_while_muted", "buffered_output", "vad_enabled", "log_level",
"suppress_sentences", "sentence_psum_accept_thresh"}) {
"suppress_sentences", "sentence_psum_accept_thresh", "vad_threshold"}) {
obs_property_set_visible(obs_properties_get(props, prop_name.c_str()),
show_hide);
}
Expand Down Expand Up @@ -825,18 +831,9 @@ obs_properties_t *transcription_filter_properties(void *data)
obs_property_t *whisper_language_select_list = obs_properties_add_list(
whisper_params_group, "whisper_language_select", MT_("language"),
OBS_COMBO_TYPE_LIST, OBS_COMBO_FORMAT_STRING);
// sort the languages by flipping the map
std::map<std::string, std::string> whisper_available_lang_flip;
for (auto const &pair : whisper_available_lang) {
whisper_available_lang_flip[pair.second] = pair.first;
}
// iterate over all available languages and add them to the list
for (auto const &pair : whisper_available_lang_flip) {
// Capitalize the language name
std::string language_name = pair.first;
language_name[0] = (char)toupper(language_name[0]);

obs_property_list_add_string(whisper_language_select_list, language_name.c_str(),
for (auto const &pair : whisper_available_lang_reverse) {
obs_property_list_add_string(whisper_language_select_list, pair.first.c_str(),
pair.second.c_str());
}

Expand Down
2 changes: 2 additions & 0 deletions src/transcription-utils.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,9 @@
#include <algorithm>
#include <vector>

// UTF-8 byte classification helpers. Each macro masks its argument against
// the UTF-8 prefix bit patterns, so pass a single byte value.
// Formatting is disabled around the macros only (presumably to keep the
// unspaced `(c)&0x..` form stable) and re-enabled right after them so the
// rest of the file is still formatted.
// clang-format off
// True when c starts a 2-byte (110xxxxx), 3-byte (1110xxxx) or 4-byte
// (11110xxx) sequence. Note: c is evaluated up to three times.
#define is_lead_byte(c) (((c)&0xe0) == 0xc0 || ((c)&0xf0) == 0xe0 || ((c)&0xf8) == 0xf0)
// True when c is a continuation byte (10xxxxxx).
#define is_trail_byte(c) (((c)&0xc0) == 0x80)
// clang-format on

inline int lead_byte_length(const uint8_t c)
Expand Down
2 changes: 2 additions & 0 deletions src/whisper-utils/silero-vad-onnx.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,8 @@

// #define __DEBUG_SPEECH_PROB___

// prevent clang-format from reformatting the code
// clang-format off
// Constructs a timestamp by storing start_ and end_ in the `start` and `end`
// members. (The stray `;` after the body was an empty declaration and has
// been dropped.)
timestamp_t::timestamp_t(int start_, int end_) : start(start_), end(end_) {}

// assignment operator modifies object, therefore non-const
Expand Down
Loading

0 comments on commit 845c1a8

Please sign in to comment.