diff --git a/buildspec.json b/buildspec.json
index 7dd0fa6..c7455fe 100644
--- a/buildspec.json
+++ b/buildspec.json
@@ -45,7 +45,7 @@
     }
   },
   "name": "obs-localvocal",
-  "version": "0.0.6",
+  "version": "0.0.7",
   "author": "Roy Shilkrot",
   "website": "https://github.com/obs-ai/obs-localvocal",
   "email": "roy.shil@gmail.com",
diff --git a/src/transcription-filter.cpp b/src/transcription-filter.cpp
index d2b2d44..4eaefd8 100644
--- a/src/transcription-filter.cpp
+++ b/src/transcription-filter.cpp
@@ -167,7 +167,7 @@ void transcription_filter_destroy(void *data)
 	delete gf->wshiper_thread_cv;
 	delete gf->text_source_mutex;
 
-	bfree(gf);
+	delete gf;
 }
 
 void acquire_weak_text_source_ref(struct transcription_filter_data *gf)
diff --git a/src/whisper-processing.cpp b/src/whisper-processing.cpp
index 844aff5..75be45c 100644
--- a/src/whisper-processing.cpp
+++ b/src/whisper-processing.cpp
@@ -8,6 +8,7 @@
 #include <...>
 
 #include <...>
+#include <float.h>
 
 #ifdef _WIN32
 #include <...>
@@ -45,6 +46,37 @@
 	}
 }
 
+float calculate_segment_energy(const float *pcmf32, size_t pcm32f_size)
+{
+	float energy = 0.0f;
+	for (size_t i = 0; i < pcm32f_size; i++) {
+		energy += fabsf(pcmf32[i]);
+	}
+	return energy / (float)pcm32f_size;
+}
+
+size_t find_tail_word_cutoff(const float *pcmf32, size_t pcm32f_size, uint32_t sample_rate_hz)
+{
+	// segment size: 10ms worth of samples
+	const size_t segment_size = 10 * sample_rate_hz / 1000;
+	// overlap size in samples
+	const size_t overlap_size = OVERLAP_SIZE_MSEC * sample_rate_hz / 1000;
+	// tail lookup window starting point
+	const size_t tail_lookup_start = pcm32f_size - overlap_size;
+
+	size_t tail_word_cutoff = pcm32f_size;
+	float lowest_energy = FLT_MAX;
+	for (size_t i = tail_lookup_start; i < pcm32f_size - segment_size; i += segment_size / 2) {
+		const float energy = calculate_segment_energy(pcmf32 + i, segment_size);
+		if (energy < 0.0001 && energy < lowest_energy) {
+			tail_word_cutoff = i;
+			lowest_energy = energy;
+		}
+	}
+
+	return tail_word_cutoff;
+}
+
 // VAD (voice activity detection), return true if speech detected
 bool vad_simple(float *pcmf32, size_t pcm32f_size, uint32_t sample_rate, float vad_thold,
 		float freq_thold, bool verbose)
@@ -278,9 +310,16 @@ void process_audio_from_buffer(struct transcription_filter_data *gf)
 	}
 
 	if (!skipped_inference) {
+		// find the tail word cutoff
+		const size_t tail_word_cutoff =
+			find_tail_word_cutoff(output[0], out_frames, WHISPER_SAMPLE_RATE);
+		if (tail_word_cutoff < out_frames)
+			obs_log(gf->log_level, "tail word cutoff: %d frames",
+				(int)tail_word_cutoff);
+
 		// run inference
 		const struct DetectionResultWithText inference_result =
-			run_whisper_inference(gf, output[0], out_frames);
+			run_whisper_inference(gf, output[0], tail_word_cutoff);
 
 		if (inference_result.result == DETECTION_RESULT_SPEECH) {
 			// output inference result to a text source