Allow to analyse by 10/20/30 milliseconds. Change the internals such …

…that this is passed on instead of the frame_length
bnosac · Jan 23, 2024 · 9ae3f26 · 9ae3f26
1 parent e39f620
commit 9ae3f26
Show file tree

Hide file tree

Showing 6 changed files with 69 additions and 37 deletions.
diff --git a/R/RcppExports.R b/R/RcppExports.R
@@ -1,7 +1,7 @@
 # Generated by using Rcpp::compileAttributes() -> do not edit by hand
 # Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393
 
-vad_webrtc_detection <- function(file, mode = 3L, frame_length = 160L) {
-    .Call('_audio_vadwebrtc_vad_webrtc_detection', PACKAGE = 'audio.vadwebrtc', file, mode, frame_length)
+vad_webrtc_detection <- function(file, mode = 3L, milliseconds = 10L) {
+    .Call('_audio_vadwebrtc_vad_webrtc_detection', PACKAGE = 'audio.vadwebrtc', file, mode, milliseconds)
 }
 
diff --git a/R/vad.R b/R/vad.R
@@ -1,27 +1,37 @@
 #' @title Voice Activity Detection
 #' @description Detect the location of active Voice Activity Detection using a Gaussian Mixture Model implemented in the "webrtc" framework. 
 #' @param file the path to an audio file which should be a file in 16 bit with mono PCM samples (pcm_s16le codec) with a sampling rate of either 8Khz, 16KHz or 32Khz
-#' @param mode the type of voice detection, either 'normal', 'lowbitrate', 'aggressive' or 'veryaggressive'
+#' @param mode the type of voice detection, either 'normal', 'lowbitrate', 'aggressive' or 'veryaggressive' where 'veryaggressive' means more silences are detected
+#' @param milliseconds integer with the duration in milliseconds of the frames on which the VAD signal is computed, one voice/non-voice decision per frame. Can only be 10, 20 or 30. Defaults to 10.
 #' @param type character string with the type of VAD model. Only 'webrtc' currently.
 #' @return a list with elements
 #' \itemize{
 #' \item{file: the path to the file}
-#' \item{sample_rate: the sample rate in Hz}
-#' \item{channels: the number of channels in the audio, as the algorithm requires mono audio this should only be 1}
+#' \item{sample_rate: the sample rate of the audio file in Hz}
+#' \item{channels: the number of channels in the audio - as the algorithm requires mono audio this should only be 1}
 #' \item{samples: the number of samples in the data}
-#' \item{format: the format - should}
-#' \item{type: the type of model - currently only 'webrtc-gmm'}
+#' \item{bitsPerSample: the number of bits per sample}
+#' \item{bytesPerSample: the number of bytes per sample}
+#' \item{type: the type of VAD model - currently only 'webrtc-gmm'}
 #' \item{mode: the provided VAD mode}
-#' \item{bitsPerSample}
-#' \item{bytesPerSample}
+#' \item{milliseconds: the provided frame duration in milliseconds - either 10, 20 or 30}
+#' \item{frame_length: the number of audio samples in one frame, corresponding to the provided milliseconds (sample_rate / 1000 * milliseconds)}
 #' \item{vad: a data.frame with columns millisecond, has_voice and vad_segment indicating if the audio contains an active voice signal at that millisecond}
 #' \item{vad_segments: a data.frame with columns vad_segment, start, end and has_voice where the start/end values are in seconds}
 #' \item{vad_stats: a list with elements n_segments, n_segments_has_voice, n_segments_has_no_voice, seconds_has_voice, seconds_has_no_voice, pct_has_voice indicating the number of segments with voice and the duration of the voice/non-voice in the audio}
 #' }
 #' @export
 #' @examples 
 #' file <- system.file(package = "audio.vadwebrtc", "extdata", "test_wav.wav")
-#' vad  <- VAD(file, mode = "normal")
+#' vad  <- VAD(file, mode = "normal", milliseconds = 30)
+#' vad
+#' vad  <- VAD(file, mode = "lowbitrate", milliseconds = 20)
+#' vad
+#' vad  <- VAD(file, mode = "aggressive", milliseconds = 20)
+#' vad
+#' vad  <- VAD(file, mode = "veryaggressive", milliseconds = 20)
+#' vad
+#' vad  <- VAD(file, mode = "normal", milliseconds = 10)
 #' vad
 #' vad$vad_segments
 #' \dontrun{
@@ -41,17 +51,21 @@
 #' vad  <- VAD(file, mode = "normal")
 #' vad
 #' vad$vad_segments
-#' vad$vad_segments_info
-VAD <- function(file, mode = c("normal", "lowbitrate", "aggressive", "veryaggressive"), type = "webrtc"){
+#' vad$vad_stats
+VAD <- function(file, mode = c("normal", "lowbitrate", "aggressive", "veryaggressive"), milliseconds = 10L, type = "webrtc"){
     type <- match.arg(type)
     mode <- match.arg(mode)
     stopifnot(file.exists(file))
-    msg <- vad_webrtc_detection(file, mode = switch(mode,
-                                                    normal = 0,
-                                                    lowbitrate = 1,
-                                                    aggressive = 2,
-                                                    veryaggressive = 3))
-    ## Get groups
+    milliseconds <- as.integer(milliseconds)
+    stopifnot(milliseconds %in% c(10, 20, 30))
+    msg <- vad_webrtc_detection(file, 
+                                mode = switch(mode,
+                                              normal = 0,
+                                              lowbitrate = 1,
+                                              aggressive = 2,
+                                              veryaggressive = 3),
+                                milliseconds = milliseconds)
+    ## Get groups of sequences of voice/non-voice
     grp <- rle(msg$vad$has_voice)
     msg$type <- "webrtc-gmm"
     msg$mode <- mode
@@ -62,6 +76,7 @@ VAD <- function(file, mode = c("normal", "lowbitrate", "aggressive", "veryaggres
                            end = vapply(segments, FUN = function(x) x[2], FUN.VALUE = integer(1), USE.NAMES = FALSE) / 1000,
                            has_voice = grp$values)
     msg$vad_segments <- segments
+    ## Calculate the percentage of voiced signal
     vad_segments_info <- list(
         n_segments = nrow(segments), 
         n_segments_has_voice = sum(segments$has_voice, na.rm = TRUE), 
@@ -79,7 +94,7 @@ print.VAD <- function(x, ...){
     cat("Voice Activity Detection", "\n")
     cat("  - file:", x$file, "\n")
     cat("  - sample rate:", x$sample_rate, "\n")
-    cat("  - VAD type: ", x$type, ", VAD mode: ", x$mode, "\n", sep = "")
+    cat("  - VAD type: ", x$type, ", VAD mode: ", x$mode, ", VAD by milliseconds: ", x$milliseconds, ", VAD frame_length: ", x$frame_length, "\n", sep = "")
     cat("    - Percent of audio containing a voiced signal:", paste(round(100*x$vad_stats$pct_has_voice, digits = 1), "%", sep = ""), "\n")
     cat("    - Seconds voiced:", round(x$vad_stats$seconds_has_voice, digits = 1), "\n")
     cat("    - Seconds unvoiced:", round(x$vad_stats$seconds_has_no_voice, digits = 1), "\n")

diff --git a/README.md b/README.md
@@ -29,7 +29,7 @@ vad
 Voice Activity Detection 
   - file: D:/Jan/R/win-library/4.1/audio.vadwebrtc/extdata/test_wav.wav 
   - sample rate: 16000 
-  - VAD type: webrtc-gmm, VAD mode: normal
+  - VAD type: webrtc-gmm, VAD mode: normal, VAD by milliseconds: 10, VAD frame_length: 160
     - Percent of audio containing a voiced signal: 90.2% 
     - Seconds voiced: 6.3 
     - Seconds unvoiced: 0.7

diff --git a/man/VAD.Rd b/man/VAD.Rd
diff --git a/src/RcppExports.cpp b/src/RcppExports.cpp
@@ -11,15 +11,15 @@ Rcpp::Rostream<false>& Rcpp::Rcerr = Rcpp::Rcpp_cerr_get();
 #endif
 
 // vad_webrtc_detection
-Rcpp::List vad_webrtc_detection(std::string file, int mode, size_t frame_length);
-RcppExport SEXP _audio_vadwebrtc_vad_webrtc_detection(SEXP fileSEXP, SEXP modeSEXP, SEXP frame_lengthSEXP) {
+Rcpp::List vad_webrtc_detection(std::string file, int mode, int milliseconds);
+RcppExport SEXP _audio_vadwebrtc_vad_webrtc_detection(SEXP fileSEXP, SEXP modeSEXP, SEXP millisecondsSEXP) {
 BEGIN_RCPP
     Rcpp::RObject rcpp_result_gen;
     Rcpp::RNGScope rcpp_rngScope_gen;
     Rcpp::traits::input_parameter< std::string >::type file(fileSEXP);
     Rcpp::traits::input_parameter< int >::type mode(modeSEXP);
-    Rcpp::traits::input_parameter< size_t >::type frame_length(frame_lengthSEXP);
-    rcpp_result_gen = Rcpp::wrap(vad_webrtc_detection(file, mode, frame_length));
+    Rcpp::traits::input_parameter< int >::type milliseconds(millisecondsSEXP);
+    rcpp_result_gen = Rcpp::wrap(vad_webrtc_detection(file, mode, milliseconds));
     return rcpp_result_gen;
 END_RCPP
 }

diff --git a/src/rcpp_webrtc.cpp b/src/rcpp_webrtc.cpp
@@ -62,7 +62,7 @@ VadInst *vad_init(VadInst *vadptr){
 }
 
 // [[Rcpp::export]]
-Rcpp::List vad_webrtc_detection(std::string file, int mode = 3, size_t frame_length = 160) {
+Rcpp::List vad_webrtc_detection(std::string file, int mode = 3, int milliseconds = 10) {
     VadInst* vad = vad_create();
     vad = vad_init(vad);
     if (vad) {
@@ -76,16 +76,19 @@ Rcpp::List vad_webrtc_detection(std::string file, int mode = 3, size_t frame_len
     // Check on the combination of the frame length and the sample_rate
     Rcpp::LogicalVector detections;
     Rcpp::IntegerVector detections_ms;
+
+    size_t frame_length = 160;
+    frame_length = (size_t)((Rcpp::as<int>(output["sample_rate"]) / 1000) * milliseconds);
     int ok = WebRtcVad_ValidRateAndFrameLength(output["sample_rate"], frame_length);
     if (ok < 0) {
         Rcpp::stop("Invalid combination of Hz and frame_length. We support 10, 20 and 30 ms frames and the rates 8000, 16000 and 32000 Hz.");
     }
     //int step = 160;
     const int16_t * temp = pcm16.data();
     // currently checking in 10ms frames, most likely to change
-    for(unsigned int i=0, ms=0; i < pcm16.size(); i+=160, ms+=10){
-        int isActive = WebRtcVad_Process(vad, output["sample_rate"], temp, frame_length); // 1 = voice , 0 = not voice
-        temp = temp + 160;
+    for(unsigned int i=0, ms=0; i < pcm16.size(); i+=frame_length, ms+=milliseconds){
+        int isActive = WebRtcVad_Process(vad, output["sample_rate"], temp, frame_length); // 1 = voice , 0 = not voice, -1 = error
+        temp = temp + frame_length;
         if(isActive < 0){
             detections.push_back(NA_LOGICAL);
         }else{
@@ -94,6 +97,8 @@ Rcpp::List vad_webrtc_detection(std::string file, int mode = 3, size_t frame_len
         detections_ms.push_back(ms);
     }
     vad_free(vad);
+    output["milliseconds"] = milliseconds;
+    output["frame_length"] = frame_length;
     output["vad"] = Rcpp::DataFrame::create(
         Rcpp::Named("millisecond") = detections_ms, 
         Rcpp::Named("has_voice") = detections);