Skip to content

Commit

Permalink
Allow to analyse by 10/20/30 milliseconds. Change the internals such …
Browse files Browse the repository at this point in the history
…that this is passed on instead of the frame_length
  • Loading branch information
jwijffels committed Jan 23, 2024
1 parent e39f620 commit 9ae3f26
Show file tree
Hide file tree
Showing 6 changed files with 69 additions and 37 deletions.
4 changes: 2 additions & 2 deletions R/RcppExports.R
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
# Generated by using Rcpp::compileAttributes() -> do not edit by hand
# Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393

vad_webrtc_detection <- function(file, mode = 3L, frame_length = 160L) {
.Call('_audio_vadwebrtc_vad_webrtc_detection', PACKAGE = 'audio.vadwebrtc', file, mode, frame_length)
vad_webrtc_detection <- function(file, mode = 3L, milliseconds = 10L) {
.Call('_audio_vadwebrtc_vad_webrtc_detection', PACKAGE = 'audio.vadwebrtc', file, mode, milliseconds)
}

49 changes: 32 additions & 17 deletions R/vad.R
Original file line number Diff line number Diff line change
@@ -1,27 +1,37 @@
#' @title Voice Activity Detection
#' @description Detect the location of voice activity in audio using a Gaussian Mixture Model implemented in the "webrtc" framework.
#' @param file the path to an audio file, which should contain 16-bit mono PCM samples (pcm_s16le codec) at a sampling rate of either 8 kHz, 16 kHz or 32 kHz
#' @param mode the type of voice detection, either 'normal', 'lowbitrate', 'aggressive' or 'veryaggressive'
#' @param mode the type of voice detection, either 'normal', 'lowbitrate', 'aggressive' or 'veryaggressive' where 'veryaggressive' means more silences are detected
#' @param milliseconds integer with the length in milliseconds of the frames for which the VAD signal is computed. Can only be 10, 20 or 30. Defaults to 10.
#' @param type character string with the type of VAD model. Only 'webrtc' currently.
#' @return a list with elements
#' \itemize{
#' \item{file: the path to the file}
#' \item{sample_rate: the sample rate in Hz}
#' \item{channels: the number of channels in the audio, as the algorithm requires mono audio this should only be 1}
#' \item{sample_rate: the sample rate of the audio file in Hz}
#' \item{channels: the number of channels in the audio - as the algorithm requires mono audio this should only be 1}
#' \item{samples: the number of samples in the data}
#' \item{format: the format - should}
#' \item{type: the type of model - currently only 'webrtc-gmm'}
#' \item{bitsPerSample: the number of bits per sample}
#' \item{bytesPerSample: the number of bytes per sample}
#' \item{type: the type of VAD model - currently only 'webrtc-gmm'}
#' \item{mode: the provided VAD mode}
#' \item{bitsPerSample: the number of bits per sample}
#' \item{bytesPerSample: the number of bytes per sample}
#' \item{milliseconds: the provided milliseconds - either by 10, 20 or 30 ms frames}
#' \item{frame_length: the frame length corresponding to the provided milliseconds}
#' \item{vad: a data.frame with columns millisecond, has_voice and vad_segment indicating if the audio contains an active voice signal at that millisecond}
#' \item{vad_segments: a data.frame with columns vad_segment, start, end and has_voice where the start/end values are in seconds}
#' \item{vad_stats: a list with elements n_segments, n_segments_has_voice, n_segments_has_no_voice, seconds_has_voice, seconds_has_no_voice, pct_has_voice indicating the number of segments with voice and the duration of the voice/non-voice in the audio}
#' }
#' @export
#' @examples
#' file <- system.file(package = "audio.vadwebrtc", "extdata", "test_wav.wav")
#' vad <- VAD(file, mode = "normal")
#' vad <- VAD(file, mode = "normal", milliseconds = 30)
#' vad
#' vad <- VAD(file, mode = "lowbitrate", milliseconds = 20)
#' vad
#' vad <- VAD(file, mode = "aggressive", milliseconds = 20)
#' vad
#' vad <- VAD(file, mode = "veryaggressive", milliseconds = 20)
#' vad
#' vad <- VAD(file, mode = "normal", milliseconds = 10)
#' vad
#' vad$vad_segments
#' \dontrun{
Expand All @@ -41,17 +51,21 @@
#' vad <- VAD(file, mode = "normal")
#' vad
#' vad$vad_segments
#' vad$vad_segments_info
VAD <- function(file, mode = c("normal", "lowbitrate", "aggressive", "veryaggressive"), type = "webrtc"){
#' vad$vad_stats
VAD <- function(file, mode = c("normal", "lowbitrate", "aggressive", "veryaggressive"), milliseconds = 10L, type = "webrtc"){
type <- match.arg(type)
mode <- match.arg(mode)
stopifnot(file.exists(file))
msg <- vad_webrtc_detection(file, mode = switch(mode,
normal = 0,
lowbitrate = 1,
aggressive = 2,
veryaggressive = 3))
## Get groups
milliseconds <- as.integer(milliseconds)
stopifnot(milliseconds %in% c(10, 20, 30))
msg <- vad_webrtc_detection(file,
mode = switch(mode,
normal = 0,
lowbitrate = 1,
aggressive = 2,
veryaggressive = 3),
milliseconds = milliseconds)
## Get groups of sequences of voice/non-voice
grp <- rle(msg$vad$has_voice)
msg$type <- "webrtc-gmm"
msg$mode <- mode
Expand All @@ -62,6 +76,7 @@ VAD <- function(file, mode = c("normal", "lowbitrate", "aggressive", "veryaggres
end = vapply(segments, FUN = function(x) x[2], FUN.VALUE = integer(1), USE.NAMES = FALSE) / 1000,
has_voice = grp$values)
msg$vad_segments <- segments
## Calculate the percentage of voiced signal
vad_segments_info <- list(
n_segments = nrow(segments),
n_segments_has_voice = sum(segments$has_voice, na.rm = TRUE),
Expand All @@ -79,7 +94,7 @@ print.VAD <- function(x, ...){
cat("Voice Activity Detection", "\n")
cat(" - file:", x$file, "\n")
cat(" - sample rate:", x$sample_rate, "\n")
cat(" - VAD type: ", x$type, ", VAD mode: ", x$mode, "\n", sep = "")
cat(" - VAD type: ", x$type, ", VAD mode: ", x$mode, ", VAD by milliseconds: ", x$milliseconds, ", VAD frame_length: ", x$frame_length, "\n", sep = "")
cat(" - Percent of audio containing a voiced signal:", paste(round(100*x$vad_stats$pct_has_voice, digits = 1), "%", sep = ""), "\n")
cat(" - Seconds voiced:", round(x$vad_stats$seconds_has_voice, digits = 1), "\n")
cat(" - Seconds unvoiced:", round(x$vad_stats$seconds_has_no_voice, digits = 1), "\n")
Expand Down
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ vad
Voice Activity Detection
- file: D:/Jan/R/win-library/4.1/audio.vadwebrtc/extdata/test_wav.wav
- sample rate: 16000
- VAD type: webrtc-gmm, VAD mode: normal
- VAD type: webrtc-gmm, VAD mode: normal, VAD by milliseconds: 10, VAD frame_length: 160
- Percent of audio containing a voiced signal: 90.2%
- Seconds voiced: 6.3
- Seconds unvoiced: 0.7
Expand Down
30 changes: 21 additions & 9 deletions man/VAD.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

8 changes: 4 additions & 4 deletions src/RcppExports.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -11,15 +11,15 @@ Rcpp::Rostream<false>& Rcpp::Rcerr = Rcpp::Rcpp_cerr_get();
#endif

// vad_webrtc_detection
Rcpp::List vad_webrtc_detection(std::string file, int mode, size_t frame_length);
RcppExport SEXP _audio_vadwebrtc_vad_webrtc_detection(SEXP fileSEXP, SEXP modeSEXP, SEXP frame_lengthSEXP) {
Rcpp::List vad_webrtc_detection(std::string file, int mode, int milliseconds);
RcppExport SEXP _audio_vadwebrtc_vad_webrtc_detection(SEXP fileSEXP, SEXP modeSEXP, SEXP millisecondsSEXP) {
BEGIN_RCPP
Rcpp::RObject rcpp_result_gen;
Rcpp::RNGScope rcpp_rngScope_gen;
Rcpp::traits::input_parameter< std::string >::type file(fileSEXP);
Rcpp::traits::input_parameter< int >::type mode(modeSEXP);
Rcpp::traits::input_parameter< size_t >::type frame_length(frame_lengthSEXP);
rcpp_result_gen = Rcpp::wrap(vad_webrtc_detection(file, mode, frame_length));
Rcpp::traits::input_parameter< int >::type milliseconds(millisecondsSEXP);
rcpp_result_gen = Rcpp::wrap(vad_webrtc_detection(file, mode, milliseconds));
return rcpp_result_gen;
END_RCPP
}
Expand Down
13 changes: 9 additions & 4 deletions src/rcpp_webrtc.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,7 @@ VadInst *vad_init(VadInst *vadptr){
}

// [[Rcpp::export]]
Rcpp::List vad_webrtc_detection(std::string file, int mode = 3, size_t frame_length = 160) {
Rcpp::List vad_webrtc_detection(std::string file, int mode = 3, int milliseconds = 10) {
VadInst* vad = vad_create();
vad = vad_init(vad);
if (vad) {
Expand All @@ -76,16 +76,19 @@ Rcpp::List vad_webrtc_detection(std::string file, int mode = 3, size_t frame_len
// Check on the combination of the frame length and the sample_rate
Rcpp::LogicalVector detections;
Rcpp::IntegerVector detections_ms;

size_t frame_length = 160;
frame_length = (size_t)((Rcpp::as<int>(output["sample_rate"]) / 1000) * milliseconds);
int ok = WebRtcVad_ValidRateAndFrameLength(output["sample_rate"], frame_length);
if (ok < 0) {
Rcpp::stop("Invalid combination of Hz and frame_length. We support 10, 20 and 30 ms frames and the rates 8000, 16000 and 32000 Hz.");
}
//int step = 160;
const int16_t * temp = pcm16.data();
// process the audio in consecutive frames of `milliseconds` ms (frame_length samples per frame)
for(unsigned int i=0, ms=0; i < pcm16.size(); i+=160, ms+=10){
int isActive = WebRtcVad_Process(vad, output["sample_rate"], temp, frame_length); // 1 = voice , 0 = not voice
temp = temp + 160;
for(unsigned int i=0, ms=0; i < pcm16.size(); i+=frame_length, ms+=milliseconds){
int isActive = WebRtcVad_Process(vad, output["sample_rate"], temp, frame_length); // 1 = voice , 0 = not voice, -1 = error
temp = temp + frame_length;
if(isActive < 0){
detections.push_back(NA_LOGICAL);
}else{
Expand All @@ -94,6 +97,8 @@ Rcpp::List vad_webrtc_detection(std::string file, int mode = 3, size_t frame_len
detections_ms.push_back(ms);
}
vad_free(vad);
output["milliseconds"] = milliseconds;
output["frame_length"] = frame_length;
output["vad"] = Rcpp::DataFrame::create(
Rcpp::Named("millisecond") = detections_ms,
Rcpp::Named("has_voice") = detections);
Expand Down

0 comments on commit 9ae3f26

Please sign in to comment.