Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Offline test improvements #150

Merged
merged 9 commits into from
Aug 9, 2024
3 changes: 2 additions & 1 deletion src/tests/audio-file-utils.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -108,7 +108,8 @@ read_audio_file(const char *filename, std::function<void(int, int)> initializati
for (int j = 0; j < codecContext->channels; j++) {
buffer[j].insert(buffer[j].end(), frame->data[j],
frame->data[j] +
frame->linesize[0]);
frame->nb_samples *
sizeof(float));
}
}
}
Expand Down
28 changes: 15 additions & 13 deletions src/tests/copy_dlls.ps1
Original file line number Diff line number Diff line change
Expand Up @@ -20,21 +20,23 @@ $obsDlls = @(
".\release\Release\obs-plugins\64bit\onnxruntime_providers_shared.dll",
".\release\Release\obs-plugins\64bit\onnxruntime.dll",
".\release\Release\obs-plugins\64bit\whisper.dll",
".deps\obs-deps-2023-11-03-x64\bin\avcodec-60.dll",
".deps\obs-deps-2023-11-03-x64\bin\avdevice-60.dll",
".deps\obs-deps-2023-11-03-x64\bin\avfilter-9.dll",
".deps\obs-deps-2023-11-03-x64\bin\avformat-60.dll",
".deps\obs-deps-2023-11-03-x64\bin\avutil-58.dll",
".deps\obs-deps-2023-11-03-x64\bin\libx264-164.dll",
".deps\obs-deps-2023-11-03-x64\bin\swresample-4.dll",
".deps\obs-deps-2023-11-03-x64\bin\swscale-7.dll",
".deps\obs-deps-2023-11-03-x64\bin\zlib.dll"
".deps\obs-studio-30.0.2\build_x64\rundir\Debug\bin\64bit\obs-frontend-api.dll",
".deps\obs-studio-30.0.2\build_x64\rundir\Debug\bin\64bit\obs.dll",
".deps\obs-studio-30.0.2\build_x64\rundir\Debug\bin\64bit\w32-pthreads.dll"
".\release\Release\obs-plugins\64bit\ggml.dll",
".deps\obs-deps-2024-03-19-x64\bin\avcodec-60.dll",
".deps\obs-deps-2024-03-19-x64\bin\avdevice-60.dll",
".deps\obs-deps-2024-03-19-x64\bin\avfilter-9.dll",
".deps\obs-deps-2024-03-19-x64\bin\avformat-60.dll",
".deps\obs-deps-2024-03-19-x64\bin\avutil-58.dll",
".deps\obs-deps-2024-03-19-x64\bin\libx264-164.dll",
".deps\obs-deps-2024-03-19-x64\bin\swresample-4.dll",
".deps\obs-deps-2024-03-19-x64\bin\swscale-7.dll",
".deps\obs-deps-2024-03-19-x64\bin\zlib.dll"
".deps\obs-deps-2024-03-19-x64\bin\librist.dll"
".deps\obs-deps-2024-03-19-x64\bin\srt.dll"
".deps\obs-studio-30.1.2\build_x64\rundir\Debug\bin\64bit\obs-frontend-api.dll",
".deps\obs-studio-30.1.2\build_x64\rundir\Debug\bin\64bit\obs.dll",
".deps\obs-studio-30.1.2\build_x64\rundir\Debug\bin\64bit\w32-pthreads.dll"
)

$obsDlls | ForEach-Object {
Copy-Item -Force -Path $_ -Destination $testToolPath
}

74 changes: 52 additions & 22 deletions src/tests/localvocal-offline-test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@

void obs_log(int log_level, const char *format, ...)
{
static auto start = std::chrono::system_clock::now();
if (log_level == LOG_DEBUG) {
return;
}
Expand All @@ -43,9 +44,14 @@ void obs_log(int log_level, const char *format, ...)
std::time_t now_time_t = std::chrono::system_clock::to_time_t(now);
std::tm now_tm = *std::localtime(&now_time_t);
Comment on lines 44 to 45
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think we don't need this anymore, right?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I kind of like having both timestamps available: local time to orient myself on when a particular run happened, and "running time" to compare relative timing within a run.


auto diff = now - start;

// print timestamp
printf("[%02d:%02d:%02d.%03d] ", now_tm.tm_hour, now_tm.tm_min, now_tm.tm_sec,
(int)(epoch.count() % 1000));
printf("[%02d:%02d:%02d.%03d] [%02d:%02lld.%03lld] ", now_tm.tm_hour, now_tm.tm_min,
now_tm.tm_sec, (int)(epoch.count() % 1000),
std::chrono::duration_cast<std::chrono::minutes>(diff).count(),
std::chrono::duration_cast<std::chrono::seconds>(diff).count() % 60,
std::chrono::duration_cast<std::chrono::milliseconds>(diff).count() % 1000);

// print log level
switch (log_level) {
Expand Down Expand Up @@ -95,12 +101,14 @@ create_context(int sample_rate, int channels, const std::string &whisper_model_p
gf->process_while_muted = false;
gf->buffered_output = false;
gf->fix_utf8 = true;
gf->input_cv.emplace();

for (size_t i = 0; i < gf->channels; i++) {
circlebuf_init(&gf->input_buffers[i]);
}
circlebuf_init(&gf->info_buffer);
circlebuf_init(&gf->whisper_buffer);
circlebuf_init(&gf->resampled_buffer);

// allocate copy buffers
gf->copy_buffers[0] =
Expand Down Expand Up @@ -307,6 +315,7 @@ void release_context(transcription_filter_data *gf)
}
circlebuf_free(&gf->info_buffer);
circlebuf_free(&gf->whisper_buffer);
circlebuf_free(&gf->resampled_buffer);

delete gf;
}
Expand Down Expand Up @@ -420,19 +429,23 @@ int wmain(int argc, wchar_t *argv[])
std::remove("segments.json");
}

const auto window_size_in_ms = std::chrono::milliseconds(25);

// fill up the whisper buffer
{
gf->start_timestamp_ms = now_ms();

obs_log(LOG_INFO, "Sending samples to whisper buffer");
// 25 ms worth of frames
int frames = gf->sample_rate * 25 / 1000;
int frames = gf->sample_rate * window_size_in_ms.count() / 1000;
const int frame_size_bytes = sizeof(float);
int frames_size_bytes = frames * frame_size_bytes;
int frames_count = 0;
int64_t start_time = std::chrono::duration_cast<std::chrono::nanoseconds>(
std::chrono::system_clock::now().time_since_epoch())
.count();
auto start_time_time = std::chrono::system_clock::now();
uint64_t window_number = 0;
while (true) {
// check if there are enough frames left in the audio buffer
if ((frames_count + frames) > (audio[0].size() / frame_size_bytes)) {
Expand All @@ -441,31 +454,48 @@ int wmain(int argc, wchar_t *argv[])
frames_size_bytes = frames * frame_size_bytes;
}
{
std::lock_guard<std::mutex> lock(gf->whisper_buf_mutex);

// push back current audio data to input circlebuf
for (size_t c = 0; c < gf->channels; c++) {
circlebuf_push_back(&gf->input_buffers[c],
audio[c].data() +
frames_count * frame_size_bytes,
frames_size_bytes);
{
auto max_wait = start_time_time +
(window_number * window_size_in_ms);
std::unique_lock<std::mutex> lock(gf->whisper_buf_mutex);
for (;;) {
// sleep up to window size in case whisper is processing, so the buffer builds up similar to OBS
auto now = std::chrono::system_clock::now();
if (false && now > max_wait)
break;

gf->input_cv->wait_for(
lock, std::chrono::milliseconds(10), [&] {
return gf->input_buffers->size == 0;
});
if (gf->input_buffers->size == 0)
break;
}
// push back current audio data to input circlebuf
for (size_t c = 0; c < gf->channels; c++) {
circlebuf_push_back(
&gf->input_buffers[c],
audio[c].data() +
frames_count * frame_size_bytes,
frames_size_bytes);
}
// push audio packet info (timestamp/frame count) to info circlebuf
struct transcription_filter_audio_info info = {0};
info.frames = frames; // number of frames in this packet
// make a timestamp from the current position in the audio buffer
info.timestamp_offset_ns =
start_time + (int64_t)(((float)frames_count /
(float)gf->sample_rate) *
1e9);
circlebuf_push_back(&gf->info_buffer, &info, sizeof(info));
}
// push audio packet info (timestamp/frame count) to info circlebuf
struct transcription_filter_audio_info info = {0};
info.frames = frames; // number of frames in this packet
// make a timestamp from the current position in the audio buffer
info.timestamp_offset_ns =
start_time +
(int64_t)(((float)frames_count / (float)gf->sample_rate) *
1e9);
circlebuf_push_back(&gf->info_buffer, &info, sizeof(info));
gf->wshiper_thread_cv.notify_one();
}
frames_count += frames;
window_number += 1;
if (frames_count >= audio[0].size() / frame_size_bytes) {
break;
}
// sleep for 25 ms
std::this_thread::sleep_for(std::chrono::milliseconds(25));
}
// push a second of silence to the input circlebuf
frames = 2 * gf->sample_rate;
Expand Down
1 change: 1 addition & 0 deletions src/transcription-filter-data.h
Original file line number Diff line number Diff line change
Expand Up @@ -104,6 +104,7 @@ struct transcription_filter_data {
std::mutex whisper_buf_mutex;
std::mutex whisper_ctx_mutex;
std::condition_variable wshiper_thread_cv;
std::optional<std::condition_variable> input_cv;

// translation context
struct translation_context translation_ctx;
Expand Down
15 changes: 10 additions & 5 deletions src/whisper-utils/whisper-processing.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -305,11 +305,11 @@ void run_inference_and_callbacks(transcription_filter_data *gf, uint64_t start_o
float *pcm32f_data = (float *)bzalloc(pcm32f_size_with_silence * sizeof(float));
if (vad_state == VAD_STATE_PARTIAL) {
// peek instead of pop, since this is a partial run that keeps the data in the buffer
circlebuf_peek_back(&gf->whisper_buffer, pcm32f_data + WHISPER_SAMPLE_RATE / 100,
pcm32f_size * sizeof(float));
circlebuf_peek_front(&gf->whisper_buffer, pcm32f_data + WHISPER_SAMPLE_RATE / 100,
pcm32f_size * sizeof(float));
} else {
circlebuf_pop_back(&gf->whisper_buffer, pcm32f_data + WHISPER_SAMPLE_RATE / 100,
pcm32f_size * sizeof(float));
circlebuf_pop_front(&gf->whisper_buffer, pcm32f_data + WHISPER_SAMPLE_RATE / 100,
pcm32f_size * sizeof(float));
}

struct DetectionResultWithText inference_result =
Expand Down Expand Up @@ -599,11 +599,16 @@ void whisper_loop(void *data)
}
}

if (gf->input_cv.has_value())
gf->input_cv->notify_one();

// Sleep using the condition variable wshiper_thread_cv
// This will wake up the thread if there is new data in the input buffer
// or if the whisper context is null
std::unique_lock<std::mutex> lock(gf->whisper_ctx_mutex);
gf->wshiper_thread_cv.wait_for(lock, std::chrono::milliseconds(50));
if (gf->input_buffers->size == 0) {
gf->wshiper_thread_cv.wait_for(lock, std::chrono::milliseconds(50));
}
}

obs_log(gf->log_level, "Exiting whisper thread");
Expand Down
Loading