Skip to content

Commit

Permalink
Update README and refactor timed metadata handling for improved trans…
Browse files Browse the repository at this point in the history
…lation modes
  • Loading branch information
royshil committed Dec 5, 2024
1 parent 8c8113d commit 250b3e0
Show file tree
Hide file tree
Showing 4 changed files with 34 additions and 12 deletions.
19 changes: 14 additions & 5 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -31,15 +31,15 @@ Current Features:
- Support for 100+ languages with dialect recognition
- Streaming-optimized performance with minimal latency
- Multiple cloud provider options for transcription and translation
- Caption output in multiple formats (.txt, .srt, .vtt)
- Caption output in multiple formats (.txt, .srt)
- Sync'ed captions with OBS recording timestamps
- Direct streaming to platforms (YouTube, Twitch) with embedded captions
- Advanced text filtering and customization options
- Partial transcriptions for a streaming-captions experience
- Custom vocabulary and pronunciation support
- Professional terminology handling for specific industries

Roadmap:
- Custom vocabulary and pronunciation support
- Professional terminology handling for specific industries
- Advanced text filtering and customization options
- Speaker diarization for multi-speaker environments
- Advanced profanity filtering options
- Custom translation glossaries
Expand Down Expand Up @@ -86,8 +86,17 @@ $ ./.github/scripts/build-linux

### Windows

Windows builds also need Conan to provide OpenSSL. Use `conan` to fetch the dependency (make sure to run it against `conanfile_win.txt`):
```powershell
> pip install conan
> conan profile detect --force
> conan install .\conanfile_win.txt --output-folder=./build_conan --build=missing -g CMakeDeps
```

Build the plugin:

```powershell
> .github/scripts/Build-Windows.ps1 -Configuration Release
> .\.github\scripts\Build-Windows.ps1 -Configuration Release
```

If you're developing the plugin, I find this command useful for direct deployment into OBS after building:
Expand Down
14 changes: 14 additions & 0 deletions src/cloudvocal-callbacks.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
#include "cloudvocal-callbacks.h"
#include "cloudvocal-utils.h"
#include "plugin-support.h"
#include "timed-metadata/timed-metadata-utils.h"

void send_caption_to_source(const std::string &target_source_name, const std::string &caption,
struct cloudvocal_data *gf)
Expand Down Expand Up @@ -217,6 +218,8 @@ void set_text_callback(struct cloudvocal_data *gf, const DetectionResultWithText
}
}

// should translate if translation is enabled and the result is full
// or if partial translations are enabled
bool should_translate = (gf->translate_only_full_sentences
? result.result == DETECTION_RESULT_SPEECH
: true) &&
Expand All @@ -239,7 +242,18 @@ void set_text_callback(struct cloudvocal_data *gf, const DetectionResultWithText
translated_sentence_cloud,
gf->target_lang);
}
if (gf->send_timed_metadata) {
send_timed_metadata_to_server(gf, SOURCE_AND_TARGET,
result.text, result.language,
translated_sentence_cloud,
gf->target_lang);
}
});
} else {
if (gf->send_timed_metadata) {
send_timed_metadata_to_server(gf, ONLY_SOURCE, result.text, result.language,
"", "");
}
}

// send the original text to the output
Expand Down
11 changes: 5 additions & 6 deletions src/timed-metadata/timed-metadata-utils.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -130,9 +130,8 @@ void send_timed_metadata_to_ivs_endpoint(struct cloudvocal_data *gf, Translation

// Construct the inner JSON string
nlohmann::json inner_meta_data;
if (mode == NON_WHISPER_TRANSLATE) {
obs_log(gf->log_level,
"send_timed_metadata_to_ivs_endpoint - NON_WHISPER_TRANSLATE");
if (mode == SOURCE_AND_TARGET) {
obs_log(gf->log_level, "send_timed_metadata_to_ivs_endpoint - SOURCE_AND_TARGET");
nlohmann::json array;
if (!source_text.empty()) {
array.push_back({{"language", source_lang}, {"text", source_text}});
Expand All @@ -146,13 +145,13 @@ void send_timed_metadata_to_ivs_endpoint(struct cloudvocal_data *gf, Translation
return;
}
inner_meta_data = {{"captions", array}};
} else if (mode == WHISPER_TRANSLATE) {
} else if (mode == ONLY_TARGET) {
if (target_text.empty()) {
obs_log(gf->log_level,
"send_timed_metadata_to_ivs_endpoint - target text empty");
return;
}
obs_log(gf->log_level, "send_timed_metadata_to_ivs_endpoint - WHISPER_TRANSLATE");
obs_log(gf->log_level, "send_timed_metadata_to_ivs_endpoint - ONLY_TARGET");
inner_meta_data = {
{"captions", {{{"language", target_lang}, {"text", target_text}}}}};
} else {
Expand All @@ -161,7 +160,7 @@ void send_timed_metadata_to_ivs_endpoint(struct cloudvocal_data *gf, Translation
"send_timed_metadata_to_ivs_endpoint - source text empty");
return;
}
obs_log(gf->log_level, "send_timed_metadata_to_ivs_endpoint - transcription mode");
obs_log(gf->log_level, "send_timed_metadata_to_ivs_endpoint - ONLY_SOURCE");
inner_meta_data = {
{"captions", {{{"language", source_lang}, {"text", source_text}}}}};
}
Expand Down
2 changes: 1 addition & 1 deletion src/timed-metadata/timed-metadata-utils.h
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@

#include "cloudvocal-data.h"

enum Translation_Mode { WHISPER_TRANSLATE, NON_WHISPER_TRANSLATE, TRANSCRIBE };
enum Translation_Mode { ONLY_TARGET, SOURCE_AND_TARGET, ONLY_SOURCE };

void send_timed_metadata_to_server(struct cloudvocal_data *gf, Translation_Mode mode,
const std::string &source_text, const std::string &source_lang,
Expand Down

0 comments on commit 250b3e0

Please sign in to comment.