add is.voiced to collapse voiced/non-voiced segments which are too short

bnosac · Feb 6, 2024 · 8ce252e · 8ce252e
1 parent 005faed
commit 8ce252e
Show file tree

Hide file tree

Showing 7 changed files with 106 additions and 11 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -17,7 +17,8 @@ Encoding: UTF-8
 Depends:
     R (>= 2.10)
 Imports:
-    Rcpp (>= 0.11.5)
+    Rcpp (>= 0.11.5),
+    utils
 Suggests:
     av
 LinkingTo: 

diff --git a/NAMESPACE b/NAMESPACE
@@ -1,6 +1,9 @@
 # Generated by roxygen2: do not edit by hand
 
+S3method(is.voiced,"webrtc-gmm")
 S3method(print,VAD)
 export(VAD)
+export(is.voiced)
 importFrom(Rcpp,evalCpp)
+importFrom(utils,head)
 useDynLib(audio.vadwebrtc)
diff --git a/NEWS.md b/NEWS.md
@@ -1,6 +1,7 @@
 ## CHANGES IN audio.vadwebrtc VERSION 0.1
 
 - Added function VAD to detect voice in audio
+- Added is.voiced generic and is.voiced.webrtc-gmm to collapse voiced/non-voiced segments which are too short
 - Initial version based on webrtc commit 7f457533a2ee582865f50210e7460af90f78f0b6 (Wed Jan 17 19:03:20 2024)
 
 
diff --git a/R/pkg.R b/R/pkg.R
@@ -1,3 +1,4 @@
 #' @importFrom Rcpp evalCpp
+#' @importFrom utils head
 #' @useDynLib audio.vadwebrtc
 NULL
diff --git a/R/vad.R b/R/vad.R
@@ -110,12 +110,66 @@ print.VAD <- function(x, ...){
 
 
 
-# collapse_segments <- function(x, collapse_silence_secs = 1){
-#     x$has_voice <- ifelse(x$has_voice, x$has_voice, ifelse((x$end - x$start) < collapse_silence_secs, TRUE, x$has_voice))
-#     grp <- rle(x$has_voice)
-#     x$vad_segment <- rep(seq_along(grp$lengths), grp$lengths)
-#     x <- data.table::as.data.table(x)
-#     x <- x[, list(start = min(start), end = max(end)), by = list(vad_segment, has_voice)]
-#     x
-# }
-#voiced <- collapse_segments(vad$vad_segments)
+#' @title Get from a Voice Activity Detection (VAD object) the segments which are voiced
+#' @description Postprocessing of the Voice Activity Detection. Sequences of 
+#' voiced/non-voiced segments are collapsed in 2 steps: 
+#' \enumerate{
+#' \item first, all non-voiced segments which are small in duration (default < 1 second) are considered voiced
+#' \item next, voiced segments with a length less than a number of seconds (default < 1 second) are considered non-voiced
+#' }
+#' @param x an object of class VAD
+#' @param units character string with the units to use - either 'seconds' or 'milliseconds'  
+#' @param ... further arguments passed on to the methods. The method for objects 
+#' of class 'webrtc-gmm' has arguments \code{silence_min} and \code{voiced_min}, the 
+#' duration thresholds used in respectively the first and the second step
+#' @return A data.frame with columns vad_segment, start, end, duration, has_voice indicating where in the audio voice is detected
+#' @export
+#' @examples 
+#' file   <- system.file(package = "audio.vadwebrtc", "extdata", "test_wav.wav")
+#' vad    <- VAD(file, mode = "normal", milliseconds = 30)
+#' vad$vad_segments
+#' voiced <- is.voiced(vad, silence_min = 0.2)
+#' voiced
+#' voiced <- is.voiced(vad, silence_min = 200, units = "milliseconds")
+#' voiced
+is.voiced <- function(x, ...) UseMethod("is.voiced")
+
+## Method of is.voiced for objects of class 'webrtc-gmm'
+##  - x: a list with element vad_segments, a data.frame with columns start, end and has_voice
+##  - silence_min: non-voiced segments with a duration below this threshold are considered voiced
+##  - voiced_min: after that step, voiced segments with a duration below this threshold are considered non-voiced
+##  - units: units of silence_min / voiced_min and of the returned start, end, duration ('seconds' or 'milliseconds')
+## Returns a data.frame with columns vad_segment, start, end, duration, has_voice
+#' @export
+"is.voiced.webrtc-gmm" <- function(x, silence_min = ifelse(units == "milliseconds", 1000, 1), voiced_min = ifelse(units == "milliseconds", 1000, 1), units = c("seconds", "milliseconds"), ...){
+    ## Use the full element name instead of relying on partial matching of $ 
+    x <- x$vad_segments
+    ## units has to be resolved before silence_min / voiced_min are used for the first time:
+    ## their default values are evaluated lazily and depend on units
+    units <- match.arg(units)
+    ## start / end are handled in seconds below, so thresholds are only rescaled when they were given in milliseconds
+    if(units == "milliseconds"){
+        silence_min <- silence_min / 1000
+        voiced_min  <- voiced_min / 1000
+    }
+    ## Renumber the segments based on consecutive runs of has_voice and merge the rows of each run
+    renumber_and_collapse <- function(segments){
+        runs                 <- rle(segments$has_voice)
+        segments$vad_segment <- rep(seq_along(runs$lengths), runs$lengths)
+        segment_collapse(segments)
+    }
+    ## Consider silences shorter than silence_min as voiced
+    x$has_voice <- x$has_voice | (x$end - x$start) < silence_min
+    x           <- renumber_and_collapse(x)
+    ## Consider voiced elements shorter than voiced_min as silences
+    x$has_voice <- x$has_voice & (x$end - x$start) >= voiced_min
+    x           <- renumber_and_collapse(x)
+    ## Report in the requested units
+    if(units == "milliseconds"){
+        x$start <- x$start * 1000
+        x$end   <- x$end * 1000
+    }
+    x$duration <- x$end - x$start
+    x <- x[, c("vad_segment", "start", "end", "duration", "has_voice"), drop = FALSE]
+    x
+}
+
+
+## Merge the rows of x which belong to the same (vad_segment, has_voice) combination into 1 row 
+## spanning from the smallest start to the largest end of these rows
+##  - x: a data.frame with columns vad_segment, start, end and has_voice
+## Returns a data.frame with columns vad_segment, start, end, has_voice ordered by vad_segment
+segment_collapse <- function(x){
+    fields <- c("vad_segment", "start", "end", "has_voice")
+    ## Without rows there is nothing to split and rbind would give NULL: return an empty data.frame instead
+    if(nrow(x) == 0){
+        x <- x[, fields, drop = FALSE]
+        rownames(x) <- NULL
+        return(x)
+    }
+    groups <- split(x, list(x$vad_segment, x$has_voice), drop = TRUE)
+    x <- do.call(rbind, lapply(groups, FUN = function(x){
+        data.frame(vad_segment = head(x$vad_segment, n = 1), 
+                   start       = min(x$start), 
+                   end         = max(x$end), 
+                   has_voice   = head(x$has_voice, n = 1), 
+                   stringsAsFactors = FALSE)
+    }))
+    x <- x[order(x$vad_segment, decreasing = FALSE), ]
+    rownames(x) <- NULL
+    x
+}
diff --git a/man/VAD.Rd b/man/VAD.Rd
diff --git a/man/is.voiced.Rd b/man/is.voiced.Rd