locaal-ai · royshil · Aug 14, 2024 · Aug 14, 2024 · Aug 14, 2024 · Aug 14, 2024
diff --git a/src/tests/localvocal-offline-test.cpp b/src/tests/localvocal-offline-test.cpp
@@ -46,6 +46,8 @@ void obs_log(int log_level, const char *format, ...)
 
 	auto diff = now - start;
 
+	static std::mutex log_mutex;
+	auto lock = std::lock_guard(log_mutex);
 	// print timestamp
 	printf("[%02d:%02d:%02d.%03d] [%02d:%02lld.%03lld] ", now_tm.tm_hour, now_tm.tm_min,
 	       now_tm.tm_sec, (int)(epoch.count() % 1000),
@@ -194,6 +196,11 @@ create_context(int sample_rate, int channels, const std::string &whisper_model_p
 	return gf;
 }
 
+std::mutex json_segments_input_mutex;
+std::condition_variable json_segments_input_cv;
+std::vector<nlohmann::json> json_segments_input;
+bool json_segments_input_finished = false;
+
 void audio_chunk_callback(struct transcription_filter_data *gf, const float *pcm32f_data,
 			  size_t frames, int vad_state, const DetectionResultWithText &result)
 {
@@ -214,33 +221,56 @@ void audio_chunk_callback(struct transcription_filter_data *gf, const float *pcm
 	// obs_log(gf->log_level, "Saving %lu frames to %s", frames, filename.c_str());
 	// write_audio_wav_file(filename.c_str(), pcm32f_data, frames);
 
-	// append a row to the array in the segments.json file
-	std::string segments_filename = "segments.json";
-	nlohmann::json segments_json;
-
-	// Read existing segments from file
-	std::ifstream segments_file(segments_filename);
-	if (segments_file.is_open()) {
-		segments_file >> segments_json;
-		segments_file.close();
-	}
-
 	// Create a new segment object
 	nlohmann::json segment;
 	segment["start_time"] = result.start_timestamp_ms / 1000.0;
 	segment["end_time"] = result.end_timestamp_ms / 1000.0;
 	segment["segment_label"] = result.text;
 
-	// Add the new segment to the segments array
-	segments_json.push_back(segment);
+	{
+		auto lock = std::lock_guard(json_segments_input_mutex);
+
+		// Add the new segment to the segments array
+		json_segments_input.push_back(segment);
+	}
+	json_segments_input_cv.notify_one();
+}
+
+void json_segments_saver_thread_function()
+{
+	std::string segments_filename = "segments.json";
+	nlohmann::json segments_json;
+
+	decltype(json_segments_input) json_segments_input_local;
+
+	for (;;) {
+		{
+			auto lock = std::unique_lock(json_segments_input_mutex);
+			while (json_segments_input.empty()) {
+				if (json_segments_input_finished)
+					return;
+				json_segments_input_cv.wait(lock, [&] {
+					return json_segments_input_finished ||
+					       !json_segments_input.empty();
+				});
+			}
+
+			std::swap(json_segments_input, json_segments_input_local);
+			json_segments_input.clear();
+		}
+
+		for (auto &elem : json_segments_input_local) {
+			segments_json.push_back(std::move(elem));
+		}
 
-	// Write the updated segments back to the file
-	std::ofstream segments_file_out(segments_filename);
-	if (segments_file_out.is_open()) {
-		segments_file_out << std::setw(4) << segments_json << std::endl;
-		segments_file_out.close();
-	} else {
-		obs_log(gf->log_level, "Failed to open %s", segments_filename.c_str());
+		// Write the updated segments back to the file
+		std::ofstream segments_file_out(segments_filename);
+		if (segments_file_out.is_open()) {
+			segments_file_out << std::setw(4) << segments_json << std::endl;
+			segments_file_out.close();
+		} else {
+			obs_log(LOG_INFO, "Failed to open %s", segments_filename.c_str());
+		}
 	}
 }
 
@@ -361,6 +391,7 @@ int wmain(int argc, wchar_t *argv[])
 
 	std::cout << "LocalVocal Offline Test" << std::endl;
 	transcription_filter_data *gf = nullptr;
+	std::optional<std::thread> audio_chunk_saver_thread;
 
 	std::vector<std::vector<uint8_t>> audio =
 		read_audio_file(filenameStr.c_str(), [&](int sample_rate, int channels) {
@@ -419,6 +450,10 @@ int wmain(int argc, wchar_t *argv[])
 		return 1;
 	}
 
+	if (gf->enable_audio_chunks_callback) {
+		audio_chunk_saver_thread.emplace(json_segments_saver_thread_function);
+	}
+
 	// truncate the output file
 	obs_log(LOG_INFO, "Truncating output file");
 	std::ofstream output_file(gf->output_file_path, std::ios::trunc);
@@ -437,10 +472,10 @@ int wmain(int argc, wchar_t *argv[])
 
 		obs_log(LOG_INFO, "Sending samples to whisper buffer");
 		// 25 ms worth of frames
-		int frames = gf->sample_rate * window_size_in_ms.count() / 1000;
+		size_t frames = gf->sample_rate * window_size_in_ms.count() / 1000;
 		const int frame_size_bytes = sizeof(float);
-		int frames_size_bytes = frames * frame_size_bytes;
-		int frames_count = 0;
+		size_t frames_size_bytes = frames * frame_size_bytes;
+		size_t frames_count = 0;
 		int64_t start_time = std::chrono::duration_cast<std::chrono::nanoseconds>(
 					     std::chrono::system_clock::now().time_since_epoch())
 					     .count();
@@ -464,12 +499,13 @@ int wmain(int argc, wchar_t *argv[])
 						if (false && now > max_wait)
 							break;
 
+						if (gf->input_buffers->size == 0)
+							break;
+
 						gf->input_cv->wait_for(
-							lock, std::chrono::milliseconds(10), [&] {
+							lock, std::chrono::milliseconds(1), [&] {
 								return gf->input_buffers->size == 0;
 							});
-						if (gf->input_buffers->size == 0)
-							break;
 					}
 					// push back current audio data to input circlebuf
 					for (size_t c = 0; c < gf->channels; c++) {
@@ -533,6 +569,15 @@ int wmain(int argc, wchar_t *argv[])
 		}
 	}
 
+	if (audio_chunk_saver_thread.has_value()) {
+		{
+			auto lock = std::lock_guard(json_segments_input_mutex);
+			json_segments_input_finished = true;
+		}
+		json_segments_input_cv.notify_one();
+		audio_chunk_saver_thread->join();
+	}
+
 	release_context(gf);
 
 	obs_log(LOG_INFO, "LocalVocal Offline Test Done");