Merge commit '4e3fdcd6ef47cf4b012f3d3dfa19ac0f005dc080' into roy.timed_metadata
locaal-ai · Jul 17, 2024 · 9055ebe · 9055ebe
2 parents 71f2fea + 4e3fdcd
commit 9055ebe
Show file tree

Hide file tree

Showing 6 changed files with 600 additions and 505 deletions.
diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -90,6 +90,7 @@ target_sources(
           src/transcription-filter.cpp
           src/transcription-filter.c
           src/transcription-filter-callbacks.cpp
+          src/transcription-filter-properties.cpp
           src/transcription-filter-utils.cpp
           src/transcription-utils.cpp
           src/model-utils/model-downloader.cpp

diff --git a/data/locale/en-US.ini b/data/locale/en-US.ini
@@ -53,7 +53,7 @@ whisper_translate="Translate to English (Whisper)"
 buffer_size_msec="Buffer size (ms)"
 overlap_size_msec="Overlap size (ms)"
 suppress_sentences="Suppress sentences (each line)"
-translate_output="Translation output"
+translate_output="Output Destination"
 dtw_token_timestamps="DTW token timestamps"
 buffered_output="Buffered output (Experimental)"
 translate_model="Model"
@@ -87,3 +87,4 @@ translate_explaination="Enabling translation will increase the processing load o
 log_group="Logging"
 advanced_group="Advanced Configuration"
 buffered_output_parameters="Buffered Output Configuration"
+file_output_info="Note: Translation output will be saved to a file in the same directory with the target language added to the name, e.g. 'output_es.srt'."
diff --git a/src/transcription-filter-callbacks.cpp b/src/transcription-filter-callbacks.cpp
@@ -64,32 +64,38 @@ std::string send_sentence_to_translation(const std::string &sentence,
 				obs_log(LOG_INFO, "Translation: '%s' -> '%s'", sentence.c_str(),
 					translated_text.c_str());
 			}
-
-			send_timed_metadata_to_server(gf, NON_WHISPER_TRANSLATE, sentence,
-						      translated_text);
-
-			if (gf->translation_output == "none") {
-				// overwrite the original text with the translated text
-				return translated_text;
-			} else {
-				// send the translation to the selected source
-				send_caption_to_source(gf->translation_output, translated_text, gf);
-			}
+			return translated_text;
 		} else {
 			obs_log(gf->log_level, "Failed to translate text");
 		}
 	}
-	return sentence;
+	return "";
 }
 
 void send_sentence_to_file(struct transcription_filter_data *gf,
-			   const DetectionResultWithText &result, const std::string &str_copy)
+			   const DetectionResultWithText &result, const std::string &str_copy,
+			   const std::string &translated_sentence)
 {
 	// Check if we should save the sentence
 	if (gf->save_only_while_recording && !obs_frontend_recording_active()) {
 		// We are not recording, do not save the sentence to file
 		return;
 	}
+
+	std::string translated_file_path = "";
+	bool write_translations = gf->translate && !translated_sentence.empty();
+
+	// if translation is enabled, save the translated sentence to another file
+	if (write_translations) {
+		// add a postfix to the file name (without extension) with the translation target language
+		std::string output_file_path = gf->output_file_path;
+		std::string file_extension =
+			output_file_path.substr(output_file_path.find_last_of(".") + 1);
+		std::string file_name =
+			output_file_path.substr(0, output_file_path.find_last_of("."));
+		translated_file_path = file_name + "_" + gf->target_lang + "." + file_extension;
+	}
+
 	// should the file be truncated?
 	std::ios_base::openmode openmode = std::ios::out;
 	if (gf->truncate_output_file) {
@@ -102,6 +108,11 @@ void send_sentence_to_file(struct transcription_filter_data *gf,
 		std::ofstream output_file(gf->output_file_path, openmode);
 		output_file << str_copy << std::endl;
 		output_file.close();
+		if (write_translations) {
+			std::ofstream translated_output_file(translated_file_path, openmode);
+			translated_output_file << translated_sentence << std::endl;
+			translated_output_file.close();
+		}
 	} else {
 		if (result.start_timestamp_ms == 0 && result.end_timestamp_ms == 0) {
 			// No timestamps, do not save the sentence to srt
@@ -114,27 +125,45 @@ void send_sentence_to_file(struct transcription_filter_data *gf,
 		std::ofstream output_file(gf->output_file_path, openmode);
 		output_file << gf->sentence_number << std::endl;
 		// use the start and end timestamps to calculate the start and end time in srt format
-		auto format_ts_for_srt = [&output_file](uint64_t ts) {
+		auto format_ts_for_srt = [](std::ofstream &output_stream, uint64_t ts) {
 			uint64_t time_s = ts / 1000;
 			uint64_t time_m = time_s / 60;
 			uint64_t time_h = time_m / 60;
 			uint64_t time_ms_rem = ts % 1000;
 			uint64_t time_s_rem = time_s % 60;
 			uint64_t time_m_rem = time_m % 60;
 			uint64_t time_h_rem = time_h % 60;
-			output_file << std::setfill('0') << std::setw(2) << time_h_rem << ":"
-				    << std::setfill('0') << std::setw(2) << time_m_rem << ":"
-				    << std::setfill('0') << std::setw(2) << time_s_rem << ","
-				    << std::setfill('0') << std::setw(3) << time_ms_rem;
+			output_stream << std::setfill('0') << std::setw(2) << time_h_rem << ":"
+				      << std::setfill('0') << std::setw(2) << time_m_rem << ":"
+				      << std::setfill('0') << std::setw(2) << time_s_rem << ","
+				      << std::setfill('0') << std::setw(3) << time_ms_rem;
 		};
-		format_ts_for_srt(result.start_timestamp_ms);
+		format_ts_for_srt(output_file, result.start_timestamp_ms);
 		output_file << " --> ";
-		format_ts_for_srt(result.end_timestamp_ms);
+		format_ts_for_srt(output_file, result.end_timestamp_ms);
 		output_file << std::endl;
 
 		output_file << str_copy << std::endl;
 		output_file << std::endl;
 		output_file.close();
+
+		if (write_translations) {
+			obs_log(gf->log_level, "Saving translation to file %s, sentence #%d",
+				translated_file_path.c_str(), gf->sentence_number);
+
+			// Append translated sentence to file in .srt format
+			std::ofstream translated_output_file(translated_file_path, openmode);
+			translated_output_file << gf->sentence_number << std::endl;
+			format_ts_for_srt(translated_output_file, result.start_timestamp_ms);
+			translated_output_file << " --> ";
+			format_ts_for_srt(translated_output_file, result.end_timestamp_ms);
+			translated_output_file << std::endl;
+
+			translated_output_file << translated_sentence << std::endl;
+			translated_output_file << std::endl;
+			translated_output_file.close();
+		}
+
 		gf->sentence_number++;
 	}
 }
@@ -190,13 +219,28 @@ void set_text_callback(struct transcription_filter_data *gf,
 		}
 	}
 
+	// send the sentence to translation (if enabled)
+	std::string translated_sentence = send_sentence_to_translation(str_copy, gf);
+
+	if (gf->translate) {
+		if (gf->translation_output == "none") {
+			// overwrite the original text with the translated text
+			str_copy = translated_sentence;
+		} else {
+			if (gf->buffered_output) {
+				gf->translation_monitor.addSentence(translated_sentence);
+			} else {
+				// non-buffered output - send the sentence to the selected source
+				send_caption_to_source(gf->translation_output, translated_sentence,
+						       gf);
+			}
+		}
+	}
+
 	if (gf->buffered_output) {
 		gf->captions_monitor.addSentence(str_copy);
 	} else {
-		// non-buffered output
-		// send the sentence to translation (if enabled)
-		str_copy = send_sentence_to_translation(str_copy, gf);
-		// send the sentence to the selected source
+		// non-buffered output - send the sentence to the selected source
 		send_caption_to_source(gf->text_source_name, str_copy, gf);
 
 		if (!gf->translate) {
@@ -209,7 +253,7 @@ void set_text_callback(struct transcription_filter_data *gf,
 	}
 
 	if (gf->save_to_file && gf->output_file_path != "") {
-		send_sentence_to_file(gf, result, str_copy);
+		send_sentence_to_file(gf, result, str_copy, translated_sentence);
 	}
 };
 

diff --git a/src/transcription-filter-data.h b/src/transcription-filter-data.h
@@ -94,6 +94,7 @@ struct transcription_filter_data {
 	// Output file path to write the subtitles
 	std::string output_file_path;
 	std::string whisper_model_file_currently_loaded;
+	bool whisper_model_loaded_new;
 
 	// Use std for thread and mutex
 	std::thread whisper_thread;
@@ -109,6 +110,7 @@ struct transcription_filter_data {
 
 	bool buffered_output = false;
 	TokenBufferThread captions_monitor;
+	TokenBufferThread translation_monitor;
 	int buffered_output_num_lines = 2;
 	int buffered_output_num_chars = 30;
 	TokenBufferSegmentation buffered_output_output_type =