Merge pull request #1057 from wadpac/issue1056-ad-hoc-header

fix handling of ad hoc csv file header
wadpac · Feb 22, 2024 · 4f38ec0 · 4f38ec0
2 parents c6c7375 + bd129f9
commit 4f38ec0
Show file tree

Hide file tree

Showing 7 changed files with 165 additions and 47 deletions.
diff --git a/NEWS.md b/NEWS.md
@@ -1,5 +1,6 @@
 # CHANGES IN GGIR VERSION 3.0-6
 
+- Part 1: Fix handling of ad hoc csv file header in g.inspectfile() #1057
 
 - Part 1: Improve g.calibrate to better handle scenario when no non-movement periods are found in the entire recording #1032
 

diff --git a/R/check_params.R b/R/check_params.R
@@ -56,7 +56,7 @@ check_params = function(params_sleep = c(), params_metrics = c(),
     numeric_params = c("chunksize", "spherecrit", "minloadcrit", "minimumFileSizeMB", "dynrange",
                        "rmc.col.acc", "interpolationType",
                        "rmc.firstrow.acc", "rmc.firstrow.header", "rmc.header.length",
-                       "rmc.col.temp", "rmc.col.time", "rmc.bitrate", "rmc.dynamic_range",
+                       "rmc.col.temp", "rmc.col.time",
                        "rmc.sf", "rmc.col.wear", "rmc.noise", "frequency_tol", "rmc.scalefactor.acc")
     boolean_params = c("printsummary", "do.cal", "rmc.unsignedbit", "rmc.check4timegaps", "rmc.doresample",
                        "imputeTimegaps")

diff --git a/R/g.getmeta.R b/R/g.getmeta.R
@@ -25,16 +25,6 @@ g.getmeta = function(datafile, params_metrics = c(), params_rawdata = c(),
     params_general = params$params_general
     params_cleaning = params$params_cleaning
   }
-  #get input variables
-  if (length(input) > 0) {
-    for (i in 1:length(names(input))) {
-      txt = paste0(names(input)[i], "=", input[i])
-      if (is(unlist(input[i]), "character")) {
-        txt = paste0(names(input)[i], "='", unlist(input[i]), "'")
-      }
-      eval(parse(text = txt))
-    }
-  }
 
   metrics2do = data.frame(do.bfen = params_metrics[["do.bfen"]],
                           do.enmo = params_metrics[["do.enmo"]],

diff --git a/R/g.inspectfile.R b/R/g.inspectfile.R
@@ -13,17 +13,6 @@ g.inspectfile = function(datafile, desiredtz = "", params_rawdata = c(),
     rm(params)
   }
 
-  #get input variables (relevant when read.myacc.csv is used
-  if (length(input) > 0) {
-    for (i in 1:length(names(input))) {
-      txt = paste0(names(input)[i], "=", input[i])
-      if (is(unlist(input[i]), "character")) {
-        txt = paste0(names(input)[i], "='", unlist(input[i]), "'")
-      }
-      eval(parse(text = txt))
-    }
-  }
-
   # note that if the file is an RData file then this function will not be called
   # the output of this function for the original datafile is stored inside the RData file in the form of object I
   getbrand = function(filename = c(), datafile = c()) {
@@ -208,16 +197,15 @@ g.inspectfile = function(datafile, desiredtz = "", params_rawdata = c(),
                                     rmc.scalefactor.acc = params_rawdata[["rmc.scalefactor.acc"]],
                                     desiredtz = desiredtz,
                                     configtz = configtz)
-    if (Pusercsvformat$header == "no header" || is.null(Pusercsvformat$header$sample_rate)) {
-
+    if (class(Pusercsvformat$header) == "character" && Pusercsvformat$header == "no header") {      
       sf = params_rawdata[["rmc.sf"]]
-      if (is.null(sf)) {
-        stop("\nFile header doesn't specify sample rate. Please provide rmc.sf value to process ", datafile)
-      } else if (sf == 0) {
-        stop("\nFile header doesn't specify sample rate. Please provide a non-zero rmc.sf value to process ", datafile)
-      }
     } else {
-      sf = Pusercsvformat$header$sample_rate
+      sf = as.numeric(Pusercsvformat$header["sample_rate",1])
+    }
+    if (is.null(sf) || is.na(sf)) {
+      stop("\nFile header doesn't specify sample rate. Please provide rmc.sf value to process ", datafile)
+    } else if (sf == 0) {
+      stop("\nFile header doesn't specify sample rate. Please provide a non-zero rmc.sf value to process ", datafile)
     }
   }
 
@@ -264,15 +252,7 @@ g.inspectfile = function(datafile, desiredtz = "", params_rawdata = c(),
     H = PP$header
 
   } else if (dformat == FORMAT$AD_HOC_CSV) { # csv data in a user-specified format
-
-    H = header = Pusercsvformat$header
-    if (Pusercsvformat$header != "no header") {
-      H = data.frame(name = row.names(header), value = header, stringsAsFactors = TRUE)
-    }
-    sf = params_rawdata[["rmc.sf"]]
-    if (sf == 0) {
-      stop("\nPlease provide a non-zero rmc.sf value to process ", datafile)
-    }
+    header = Pusercsvformat$header
   } else if (dformat == FORMAT$GT3X) { # gt3x
     info = try(expr = {read.gt3x::parse_gt3x_info(datafile, tz = desiredtz)},silent = TRUE)
     if (inherits(info, "try-error") == TRUE || is.null(info)) {
@@ -300,7 +280,7 @@ g.inspectfile = function(datafile, desiredtz = "", params_rawdata = c(),
     stop(paste0("\nSample frequency not recognised in ", datafile), call. = FALSE)
   }
 
-  if (is.null(sf) == FALSE) {
+  if (dformat != FORMAT$AD_HOC_CSV && is.null(sf) == FALSE) {
     H = as.matrix(H)
     if (ncol(H) == 3 && dformat == FORMAT$CSV && mon == MONITOR$ACTIGRAPH) {
       if (length(which(is.na(H[,2]) == FALSE)) == 0) {
@@ -333,7 +313,7 @@ g.inspectfile = function(datafile, desiredtz = "", params_rawdata = c(),
       if ((mon == MONITOR$GENEACTIV && dformat == FORMAT$BIN) || (mon == MONITOR$MOVISENS && length(H) > 0)) {
         varname = rownames(as.matrix(H))
         H = data.frame(varname = varname,varvalue = as.character(H), stringsAsFactors = TRUE)
-      } else {
+      } else if (dformat != FORMAT$AD_HOC_CSV) {
         if (length(H) > 1 && class(H)[1] == "matrix") H = data.frame(varname = H[,1],varvalue = H[,2], stringsAsFactors = TRUE)
       }
     }

diff --git a/R/read.myacc.csv.R b/R/read.myacc.csv.R
@@ -92,6 +92,9 @@ read.myacc.csv = function(rmc.file=c(), rmc.nrow=Inf, rmc.skip=c(), rmc.dec=".",
                                      dec = rmc.dec, showProgress = FALSE, header = FALSE,
                                      blank.lines.skip = TRUE,
                                      data.table=FALSE, stringsAsFactors=FALSE)
+      validrows = which(is.na(header_tmp[,1]) == FALSE & header_tmp[,1] != "")
+      header_tmp = header_tmp[validrows,1:2]
+
       options(warn = 0)
       if (length(rmc.header.structure) != 0) { # header is stored in 1 column, with strings that need to be split
         if (length(header_tmp) == 1) { # one header item
@@ -119,8 +122,6 @@ read.myacc.csv = function(rmc.file=c(), rmc.nrow=Inf, rmc.skip=c(), rmc.dec=".",
         header = header_tmp2
       } else { # column 1 is header name, column 2 is header value
         colnames(header_tmp) = NULL
-        validrows = which(is.na(header_tmp[,1]) == FALSE & header_tmp[,1] != "")
-        header_tmp = header_tmp[validrows,1:2]
         header_tmp2 = as.data.frame(header_tmp[,2], stringsAsFactors = FALSE)
         row.names(header_tmp2) = header_tmp[,1]
         colnames(header_tmp2) = NULL
@@ -142,7 +143,7 @@ read.myacc.csv = function(rmc.file=c(), rmc.nrow=Inf, rmc.skip=c(), rmc.dec=".",
         # first see if maybe sf *is* in the header, just not under the rmc.headername.sf name
         sf = as.numeric(header[which(row.names(header) == "sample_rate"),1])
         # if sf isn't in the header under the default name either, then use the default value
-        if (is.na(sf)) {
+        if (is.na(sf) && !is.null(rmc.sf)) {
           sf = rmc.sf
           header = rbind(header, sf) # add it also to the header
           row.names(header)[nrow(header)] = "sample_rate"

diff --git a/tests/testthat/test_greadaccfile.R b/tests/testthat/test_greadaccfile.R
@@ -1,6 +1,6 @@
 library(GGIR)
 context("g.readaccfile")
-test_that("g.readaccfile and g.inspectfile can read movisens, gt3x, cwa, Axivity csv, and actigraph csv files correctly", {
+test_that("g.readaccfile and g.inspectfile can read movisens, gt3x, cwa, Axivity csv, actigraph csv, and ad-hoc csv files correctly", {
   skip_on_cran()
 
   desiredtz = "Pacific/Auckland"
@@ -218,11 +218,25 @@ test_that("g.readaccfile and g.inspectfile can read movisens, gt3x, cwa, Axivity
   timestamps = as.POSIXlt(x, origin="1970-1-1", tz = configtz)
   mydata = data.frame(Xcol = rnorm(N), timecol = timestamps, Ycol = rnorm(N), Zcol = rnorm(N),
             tempcol = rnorm(N) + 20)
-  testfile = "testcsv1.csv"
+  testfile = "testcsv.csv"
   on.exit({if (file.exists(testfile)) file.remove(testfile)}, add = TRUE)
 
   write.csv(mydata, file = testfile, row.names = FALSE)
 
+  # check that for files with no header, g.inspectfile() errors out if sampling rate is not specified as rmc.sf, or if rmc.sf == 0
+  expect_error(g.inspectfile(testfile, 
+                             rmc.dec=".", rmc.unit.time="POSIX",
+                             rmc.firstrow.acc = 1, rmc.firstrow.header=c(),
+                             rmc.col.acc = c(1,3,4), rmc.col.temp = 5, rmc.col.time=2,
+                             rmc.unit.acc = "g", rmc.unit.temp = "C", rmc.origin = "1970-01-01"),
+              regexp = "File header doesn't specify sample rate. Please provide rmc.sf value to process")
+  expect_error(g.inspectfile(testfile, 
+                             rmc.dec=".", rmc.sf=0, rmc.unit.time="POSIX",
+                             rmc.firstrow.acc = 1, rmc.firstrow.header=c(),
+                             rmc.col.acc = c(1,3,4), rmc.col.temp = 5, rmc.col.time=2,
+                             rmc.unit.acc = "g", rmc.unit.temp = "C", rmc.origin = "1970-01-01"),
+              regexp = "File header doesn't specify sample rate. Please provide a non-zero rmc.sf value to process")
+
   AHcsv = g.inspectfile(testfile, 
                         rmc.dec=".", rmc.sf=30, rmc.unit.time="POSIX",
                         rmc.firstrow.acc = 1, rmc.firstrow.header=c(),
@@ -289,6 +303,137 @@ test_that("g.readaccfile and g.inspectfile can read movisens, gt3x, cwa, Axivity
   expect_equal(nrow(csv_read4$P$data), 3000)
   expect_equal(sum(csv_read3$P$data[c("x","y","z")]), sum(csv_read4$P$data[c("x","y","z")]), tolerance = .01, scale = 1)
 
+  # Create test file: 2-column header, with time,
+  # but sample rate not specified in the header
+
+  N = 6000
+  sf = 30
+  x = Sys.time()+((0:(N-1))/sf)
+  timestamps = as.POSIXlt(x, origin="1970-1-1", tz = configtz)
+  mydata = data.frame(Xcol = rnorm(N), timecol = timestamps, Ycol = rnorm(N), Zcol = rnorm(N))
+  S1 = as.matrix(mydata)
+
+  hd_NR = 10
+  hd = matrix("", hd_NR + 1, ncol(S1))
+  hd[1, 1:2] = c("ID","12345")
+  hd[2, 1:2] = c("serial_number","30")
+  hd[3, 1:2] = c("bit","8")
+  hd[4, 1:2] = c("dynamic_range","6")
+
+  S1 = rbind(hd, S1)
+  S1[hd_NR + 1,] = colnames(S1)
+  colnames(S1) = NULL
+
+  testfile_two_col = "testcsv2col.csv"
+  on.exit({if (file.exists(testfile_two_col)) file.remove(testfile_two_col)}, add = TRUE)
+  write.table(S1, file = testfile_two_col, col.names = FALSE, row.names = FALSE)
+
+  # Create test file: 1-column header, with time,
+  # but sample rate not specified in the header
+  S1 = as.matrix(mydata)
+  hd = matrix("", hd_NR + 1, ncol(S1))
+  hd[1, 1:2] = c("ID: 12345", "")
+  hd[2, 1:2] = c("serial_number: 4321", "")
+  hd[3, 1:2] = c("bit: 8", "")
+  hd[4, 1:2] = c("dynamic_range: 6", "")
+
+  S1 = as.matrix(mydata)
+  S1 = rbind(hd, S1)
+  S1[hd_NR + 1,] = colnames(S1)
+  colnames(S1) = NULL
+
+  testfile_one_col = "testcsv1col.csv"
+  on.exit({if (file.exists(testfile_one_col)) file.remove(testfile_one_col)}, add = TRUE)
+  write.table(S1, file = testfile_one_col, col.names = FALSE, row.names = FALSE)
+
+  for (testfile in c(testfile_one_col, testfile_two_col)) {
+    # check that for a file whose header doesn't specify sampling rate,
+    # g.inspectfile() errors out if sampling rate is not specified as rmc.sf, or if rmc.sf==0
+    expect_error(g.inspectfile(testfile, 
+                               rmc.dec=".", rmc.unit.time="POSIX",
+                               rmc.firstrow.acc = 11, rmc.firstrow.header = 1,
+                               rmc.col.acc = c(1,3,4), rmc.col.time=2,
+                               rmc.unit.acc = "g", rmc.origin = "1970-01-01"),
+                regexp = "File header doesn't specify sample rate. Please provide rmc.sf value to process")
+    expect_error(g.inspectfile(testfile, 
+                               rmc.dec=".", rmc.sf = 0, rmc.unit.time="POSIX",
+                               rmc.firstrow.acc = 11, rmc.firstrow.header = 1,
+                               rmc.col.acc = c(1,3,4), rmc.col.time=2,
+                               rmc.unit.acc = "g", rmc.origin = "1970-01-01"),
+                regexp = "File header doesn't specify sample rate. Please provide a non-zero rmc.sf value to process")
+
+    # check that for a file whose header doesn't specify sampling rate,
+    # g.inspectfile() returns sf == rmc.sf if the latter was specified
+    I = g.inspectfile(testfile, 
+                      rmc.dec=".", rmc.sf = 80, rmc.unit.time="POSIX",
+                      rmc.firstrow.acc = 11, rmc.firstrow.header = 1,
+                      rmc.col.acc = c(1,3,4), rmc.col.time=2,
+                      rmc.unit.acc = "g", rmc.origin = "1970-01-01")
+    expect_equal(I$sf, 80)
+  }
+
+  # Create test file: 2-column header, with time (no temperature column),
+  # and sample rate correctly specified in the header
+  S1 = as.matrix(mydata)
+  hd_NR = 10
+  hd = matrix("", hd_NR + 1, ncol(S1))
+  hd[1, 1:2] = c("ID","12345")
+  hd[2, 1:2] = c("sample_freq","40")
+  hd[3, 1:2] = c("serial_number","9876")
+  hd[4, 1:2] = c("bit","8")
+  hd[5, 1:2] = c("dynamic_range","6")
+  S1 = as.matrix(mydata)
+  S1 = rbind(hd, S1)
+  S1[hd_NR + 1,] = colnames(S1)
+  colnames(S1) = NULL
+
+  testfile_two_col = "testcsv2col.csv"
+  on.exit({if (file.exists(testfile_two_col)) file.remove(testfile_two_col)}, add = TRUE)
+  write.table(S1, file = testfile_two_col, col.names = FALSE, row.names = FALSE)
+
+  # Create test file: 1-column header, with time,
+  # and sample rate correctly specified in the header
+  S1 = as.matrix(mydata)
+  hd = matrix("", hd_NR + 1, ncol(S1))
+  hd[1, 1:2] = c("ID: 12345", "")
+  hd[2, 1:2] = c("sample_freq: 40", "")
+  hd[3, 1:2] = c("serial_number: 4321", "")
+  hd[4, 1:2] = c("bit: 8", "")
+  hd[5, 1:2] = c("dynamic_range: 6", "")
+  S1 = rbind(hd, S1)
+  S1[hd_NR + 1,] = colnames(S1)
+  colnames(S1) = NULL
+
+  testfile_one_col = "testcsv1col.csv"
+  on.exit({if (file.exists(testfile_one_col)) file.remove(testfile_one_col)}, add = TRUE)
+  write.table(S1, file = testfile_one_col, col.names = FALSE, row.names = FALSE)
+
+  for (csvData in list(list(testfile_one_col, ": "),
+                       list(testfile_two_col, c()))) {
+    # check that g.inspectfile() returns sf value that was specified in the header, even if rmc.sf was also specified
+    I = g.inspectfile(csvData[[1]], 
+                      rmc.dec=".", rmc.sf = 80, rmc.headername.sf = "sample_freq", 
+                      rmc.unit.time="POSIX",
+                      rmc.firstrow.acc = 11, rmc.firstrow.header=1,
+                      rmc.col.acc = c(1,3,4), rmc.col.time=2,
+                      rmc.unit.acc = "g", rmc.origin = "1970-01-01",
+                      rmc.headername.sn = "serial_number",
+                      rmc.headername.recordingid = "ID",
+                      rmc.bitrate = "bit", rmc.dynamic_range = "dynamic_range",
+                      rmc.header.structure = csvData[[2]])
+
+    expect_equal(I$sf, 40)
+
+    # check that g.inspectfile() correctly reads the sf value from the header
+    I = g.inspectfile(csvData[[1]], 
+                      rmc.dec=".", rmc.headername.sf = "sample_freq",
+                      rmc.unit.time="POSIX",
+                      rmc.firstrow.acc = 11, rmc.firstrow.header=1,
+                      rmc.col.acc = c(1,3,4), rmc.col.time=2,
+                      rmc.unit.acc = "g", rmc.origin = "1970-01-01",
+                      rmc.header.structure = csvData[[2]])
+    expect_equal(I$sf, 40)
+  }
   # test decimal separator recognition extraction
   decn =  g.dotorcomma(Ax3CwaFile,dformat = FORMAT$CWA, mon = MONITOR$AXIVITY, desiredtz = desiredtz)
   expect_equal(decn,".")

diff --git a/tests/testthat/test_read.myacc.csv.R b/tests/testthat/test_read.myacc.csv.R
@@ -265,14 +265,15 @@ test_that("read.myacc.csv can handle header and bit-value acceleration", {
                       rmc.format.time = "%Y-%m-%d %H:%M:%OS",
                       rmc.origin = "1970-01-01",
                       desiredtz = "Europe/London",
-                      rmc.sf = sf,
                       rmc.headername.sf = "sample_frequency",
                       rmc.headername.sn = "serial_number",
                       rmc.headername.recordingid = "ID")
   expect_that(nrow(D1$data), equals(20))
   expect_that(ncol(D1$data), equals(5))
   expect_that(nrow(D1$header), equals(5))
   expect_that(ncol(D1$header), equals(1))
+  expect_equal(as.numeric(D1$header["sample_rate",1]), 30)
+
   # Test 2 - 2 column header, bit-valued acceleration
   D2 = read.myacc.csv(rmc.file = testfile[2], rmc.nrow = 20, rmc.dec = ".",
                       rmc.firstrow.acc = 11, rmc.firstrow.header = 1,