locaal-ai · royshil · Oct 12, 2023 · Oct 7, 2023 · Oct 7, 2023 · Oct 12, 2023
diff --git a/README.md b/README.md
@@ -57,6 +57,12 @@ The plugin was built and tested on Mac OSX  (Intel & Apple silicon), Windows and
 
 Start by cloning this repo to a directory of your choice.
 
+Remember to sync and fetch the submodules before building, e.g.
+```sh
+$ git submodule sync --recursive
+$ git submodule update --init --recursive
+```
+
 ### Mac OSX
 
 Using the CI pipeline scripts, locally you would just call the zsh script. By default this builds a universal binary for both Intel and Apple Silicon. To build for a specific architecture please see `.github/scripts/.build.zsh` for the `-arch` options.

diff --git a/cmake/linux/defaults.cmake b/cmake/linux/defaults.cmake
@@ -32,7 +32,7 @@ set(CPACK_SOURCE_IGNORE_FILES
     # cmake-format: sortable
     ".*~$"
     \\.git/
-    \\.github/
+    # \\.github/
     \\.gitignore
     build_.*
     cmake/\\.CMakeBuildNumber

diff --git a/data/locale/en-US.ini b/data/locale/en-US.ini
@@ -36,4 +36,7 @@ suppress_blank="Suppress blank"
 suppress_non_speech_tokens="Suppress non-speech tokens"
 temperature="Temperature"
 max_initial_ts="Max initial timestamps"
-length_penalty="Length penalty"
+length_penalty="Length penalty"
+save_srt="Save in SRT format (no file truncation)"
+only_while_recording="Write output only while recording"
+process_while_muted="Process speech while source is muted"
diff --git a/src/transcription-filter-data.h b/src/transcription-filter-data.h
@@ -19,8 +19,21 @@
 
 #define MT_ obs_module_text
 
+enum DetectionResult {
+	DETECTION_RESULT_UNKNOWN = 0,
+	DETECTION_RESULT_SILENCE = 1,
+	DETECTION_RESULT_SPEECH = 2,
+};
+
+struct DetectionResultWithText {
+	DetectionResult result;
+	std::string text;
+	uint64_t start_timestamp_ms;
+	uint64_t end_timestamp_ms;
+};
+
 struct transcription_filter_data {
-	obs_source_t *context; // obs input source
+	obs_source_t *context; // obs filter source (this filter)
 	size_t channels;       // number of channels
 	uint32_t sample_rate;  // input sample rate
 	// How many input frames (in input sample rate) are needed for the next whisper frame
@@ -32,6 +45,10 @@ struct transcription_filter_data {
 	size_t last_num_frames;
 	// Milliseconds per processing step (e.g. rest of the whisper buffer may be filled with silence)
 	size_t step_size_msec;
+	// Start timestamp in ms since epoch
+	uint64_t start_timestamp_ms;
+	// Sentence counter for srt
+	size_t sentence_number;
 
 	/* PCM buffers */
 	float *copy_buffers[MAX_PREPROC_CHANNELS];
@@ -54,13 +71,16 @@ struct transcription_filter_data {
 	bool log_words;
 	bool caption_to_stream;
 	bool active = false;
+	bool save_srt = false;
+	bool save_only_while_recording = false;
+	bool process_while_muted = false;
 
 	// Text source to output the subtitles
 	obs_weak_source_t *text_source = nullptr;
 	char *text_source_name = nullptr;
 	std::mutex *text_source_mutex = nullptr;
 	// Callback to set the text in the output text source (subtitles)
-	std::function<void(const std::string &str)> setTextCallback;
+	std::function<void(const DetectionResultWithText &result)> setTextCallback;
 	// Output file path to write the subtitles
 	std::string output_file_path = "";
 	std::string whisper_model_file_currently_loaded = "";
@@ -79,6 +99,6 @@ struct transcription_filter_audio_info {
 	uint64_t timestamp;
 };
 
-void set_text_callback(struct transcription_filter_data *gf, const std::string &str);
+void set_text_callback(struct transcription_filter_data *gf, const DetectionResultWithText &str);
 
 #endif /* TRANSCRIPTION_FILTER_DATA_H */
diff --git a/src/transcription-filter.cpp b/src/transcription-filter.cpp
@@ -41,6 +41,13 @@ inline enum speaker_layout convert_speaker_layout(uint8_t channels)
 	}
 }
 
+inline uint64_t now_ms()
+{
+	return std::chrono::duration_cast<std::chrono::milliseconds>(
+		       std::chrono::system_clock::now().time_since_epoch())
+		.count();
+}
+
 bool add_sources_to_list(void *list_property, obs_source_t *source)
 {
 	auto source_id = obs_source_get_id(source);
@@ -71,6 +78,13 @@ struct obs_audio_data *transcription_filter_filter_audio(void *data, struct obs_
 		return audio;
 	}
 
+	// Check if the parent source is muted
+	obs_source_t *parent_source = obs_filter_get_parent(gf->context);
+	if (gf->process_while_muted == false && obs_source_muted(parent_source)) {
+		// Source is muted, do not process audio
+		return audio;
+	}
+
 	if (gf->whisper_context == nullptr) {
 		// Whisper not initialized, just pass through
 		return audio;
@@ -179,35 +193,75 @@ void acquire_weak_text_source_ref(struct transcription_filter_data *gf)
 	}
 }
 
-void set_text_callback(struct transcription_filter_data *gf, const std::string &str)
+void set_text_callback(struct transcription_filter_data *gf, const DetectionResultWithText &result)
 {
 #ifdef _WIN32
 	// Russian UTF8 charset on Windows output has a bug, instead of 0xd? it outputs
 	// 0xf?, so we need to replace it. This doesn't affect any other charset, which
 	// outputs the correct UTF8 output. (Except maybe for Greek?)
-	std::string str_copy = str;
+	std::string str_copy = result.text;
 	for (size_t i = 0; i < str_copy.size(); ++i) {
 		// if the char MSBs starts with 0xf replace the MSBs with 0xd
 		if ((str_copy.c_str()[i] & 0xf0) == 0xf0) {
 			str_copy[i] = (str_copy.c_str()[i] & 0x0f) | 0xd0;
 		}
 	}
 #else
-	std::string str_copy = str;
+	std::string str_copy = result.text;
 #endif
 
 	if (gf->caption_to_stream) {
 		obs_output_t *streaming_output = obs_frontend_get_streaming_output();
 		if (streaming_output) {
-			obs_output_output_caption_text1(streaming_output, str.c_str());
+			obs_output_output_caption_text1(streaming_output, result.text.c_str());
 			obs_output_release(streaming_output);
 		}
 	}
+
 	if (gf->output_file_path != "" && !gf->text_source_name) {
-		// Write to file, do not append
-		std::ofstream output_file(gf->output_file_path, std::ios::out | std::ios::trunc);
-		output_file << str;
-		output_file.close();
+		// Check if we should save the sentence
+		if (gf->save_only_while_recording && !obs_frontend_recording_active()) {
+			// We are not recording, do not save the sentence to file
+			return;
+		}
+		if (!gf->save_srt) {
+			// Write raw sentence to file, do not append
+			std::ofstream output_file(gf->output_file_path,
+						  std::ios::out | std::ios::trunc);
+			output_file << result.text << std::endl;
+			output_file.close();
+		} else {
+			obs_log(gf->log_level, "Saving sentence to file %s, sentence #%d",
+				gf->output_file_path.c_str(), gf->sentence_number);
+			// Append sentence to file in .srt format
+			std::ofstream output_file(gf->output_file_path,
+						  std::ios::out | std::ios::app);
+			output_file << gf->sentence_number << std::endl;
+			// use the start and end timestamps to calculate the start and end time in srt format
+			auto format_ts_for_srt = [&output_file](uint64_t ts) {
+				uint64_t time_s = ts / 1000;
+				uint64_t time_m = time_s / 60;
+				uint64_t time_h = time_m / 60;
+				uint64_t time_ms_rem = ts % 1000;
+				uint64_t time_s_rem = time_s % 60;
+				uint64_t time_m_rem = time_m % 60;
+				uint64_t time_h_rem = time_h % 60;
+				output_file << std::setfill('0') << std::setw(2) << time_h_rem
+					    << ":" << std::setfill('0') << std::setw(2)
+					    << time_m_rem << ":" << std::setfill('0')
+					    << std::setw(2) << time_s_rem << ","
+					    << std::setfill('0') << std::setw(3) << time_ms_rem;
+			};
+			format_ts_for_srt(result.start_timestamp_ms);
+			output_file << " --> ";
+			format_ts_for_srt(result.end_timestamp_ms);
+			output_file << std::endl;
+
+			output_file << result.text << std::endl;
+			output_file << std::endl;
+			output_file.close();
+			gf->sentence_number++;
+		}
 	} else {
 		if (!gf->text_source_mutex) {
 			obs_log(LOG_ERROR, "text_source_mutex is null");
@@ -292,6 +346,12 @@ void transcription_filter_update(void *data, obs_data_t *s)
 	bool step_by_step_processing = obs_data_get_bool(s, "step_by_step_processing");
 	gf->step_size_msec = step_by_step_processing ? (int)obs_data_get_int(s, "step_size_msec")
 						     : BUFFER_SIZE_MSEC;
+	gf->save_srt = obs_data_get_bool(s, "subtitle_save_srt");
+	gf->save_only_while_recording = obs_data_get_bool(s, "only_while_recording");
+	// Get the current timestamp using the system clock
+	gf->start_timestamp_ms = now_ms();
+	gf->sentence_number = 1;
+	gf->process_while_muted = obs_data_get_bool(s, "process_while_muted");
 
 	obs_log(gf->log_level, "transcription_filter: update text source");
 	// update the text source
@@ -468,6 +528,9 @@ void *transcription_filter_create(obs_data_t *settings, obs_source_t *filter)
 				     ? (int)obs_data_get_int(settings, "step_size_msec")
 				     : BUFFER_SIZE_MSEC;
 	gf->log_level = (int)obs_data_get_int(settings, "log_level");
+	gf->save_srt = obs_data_get_bool(settings, "subtitle_save_srt");
+	gf->save_only_while_recording = obs_data_get_bool(settings, "only_while_recording");
+	gf->process_while_muted = obs_data_get_bool(settings, "process_while_muted");
 
 	for (size_t i = 0; i < MAX_AUDIO_CHANNELS; i++) {
 		circlebuf_init(&gf->input_buffers[i]);
@@ -525,6 +588,28 @@ void *transcription_filter_create(obs_data_t *settings, obs_source_t *filter)
 
 	gf->active = true;
 
+	// handle the event OBS_FRONTEND_EVENT_RECORDING_STARTING to reset the srt sentence number
+	// to match the subtitles with the recording
+	obs_frontend_add_event_callback(
+		[](enum obs_frontend_event event, void *private_data) {
+			if (event == OBS_FRONTEND_EVENT_RECORDING_STARTING) {
+				struct transcription_filter_data *gf_ =
+					static_cast<struct transcription_filter_data *>(
+						private_data);
+				if (gf_->save_srt && gf_->save_only_while_recording) {
+					obs_log(gf_->log_level,
+						"Recording started. Resetting srt file.");
+					// truncate file if it exists
+					std::ofstream output_file(gf_->output_file_path,
+								  std::ios::out | std::ios::trunc);
+					output_file.close();
+					gf_->sentence_number = 1;
+					gf_->start_timestamp_ms = now_ms();
+				}
+			}
+		},
+		gf);
+
 	obs_log(gf->log_level, "transcription_filter: filter created.");
 	return gf;
 }
@@ -557,6 +642,9 @@ void transcription_filter_defaults(obs_data_t *s)
 	obs_data_set_default_string(s, "whisper_language_select", "en");
 	obs_data_set_default_string(s, "subtitle_sources", "none");
 	obs_data_set_default_bool(s, "step_by_step_processing", false);
+	obs_data_set_default_bool(s, "process_while_muted", false);
+	obs_data_set_default_bool(s, "subtitle_save_srt", false);
+	obs_data_set_default_bool(s, "only_while_recording", false);
 	obs_data_set_default_int(s, "step_size_msec", 1000);
 
 	// Whisper parameters
@@ -617,6 +705,7 @@ obs_properties_t *transcription_filter_properties(void *data)
 		return true;
 	});
 
+	obs_properties_add_bool(ppts, "process_while_muted", MT_("process_while_muted"));
 	obs_property_t *subs_output =
 		obs_properties_add_list(ppts, "subtitle_sources", MT_("subtitle_sources"),
 					OBS_COMBO_TYPE_LIST, OBS_COMBO_FORMAT_STRING);
@@ -628,21 +717,21 @@ obs_properties_t *transcription_filter_properties(void *data)
 
 	obs_properties_add_path(ppts, "subtitle_output_filename", MT_("output_filename"),
 				OBS_PATH_FILE_SAVE, "Text (*.txt)", NULL);
+	obs_properties_add_bool(ppts, "subtitle_save_srt", MT_("save_srt"));
+	obs_properties_add_bool(ppts, "only_while_recording", MT_("only_while_recording"));
 
 	obs_property_set_modified_callback(subs_output, [](obs_properties_t *props,
 							   obs_property_t *property,
 							   obs_data_t *settings) {
 		UNUSED_PARAMETER(property);
+		// Show or hide the output filename selection input
 		const char *new_output = obs_data_get_string(settings, "subtitle_sources");
-		if (strcmp(new_output, "text_file") == 0) {
-			// Show the output filename selection input
-			obs_property_set_visible(
-				obs_properties_get(props, "subtitle_output_filename"), true);
-		} else {
-			// Hide the output filename selection input
-			obs_property_set_visible(
-				obs_properties_get(props, "subtitle_output_filename"), false);
-		}
+		const bool show_hide = (strcmp(new_output, "text_file") == 0);
+		obs_property_set_visible(obs_properties_get(props, "subtitle_output_filename"),
+					 show_hide);
+		obs_property_set_visible(obs_properties_get(props, "subtitle_save_srt"), show_hide);
+		obs_property_set_visible(obs_properties_get(props, "only_while_recording"),
+					 show_hide);
 		return true;
 	});