From 577b2995244c3e4b80d75032d269aaafb03b3f8d Mon Sep 17 00:00:00 2001 From: danlu1 Date: Mon, 16 Sep 2024 17:30:29 +0000 Subject: [PATCH] CHOP|PROV|JHU only if checking for missingness in NAACCR code columns --- scripts/case_selection/shared_fxns.R | 12 +++-- .../case_selection/tests/test_shared_fxns.R | 51 +++++++------------ 2 files changed, 27 insertions(+), 36 deletions(-) diff --git a/scripts/case_selection/shared_fxns.R b/scripts/case_selection/shared_fxns.R index 08688c9c..c70e9d5f 100644 --- a/scripts/case_selection/shared_fxns.R +++ b/scripts/case_selection/shared_fxns.R @@ -294,15 +294,19 @@ remap_patient_characteristics <- function(clinical, existing_patients, ethnicity #' @param data The data frame to check against #' @param columns The target columns check_for_missing_values <- function(data, columns) { - # filter out CHOP, PROV, JHU centers with known NAs - data <- data[!grepl("CHOP|PROV|JHU", data$genie_patient_id), ] # Check for NA values or empty strings missingness_col <- c() for (col in columns) { - if (any(is.na(data[[col]]) | data[[col]] == "" )){ + if (col %in% c("naaccr_race_code_tertiary", "naaccr_race_code_secondary")) { + # filter out CHOP, PROV, JHU centers with known NAs in NAACCR code columns + relevant_rows <- data[!grepl("CHOP|PROV|JHU", data$genie_patient_id), ] + } else{ + relevant_rows <- data + } + if (any(is.na(relevant_rows[[col]]) | relevant_rows[[col]] == "" )){ missingness_col <- c(col, missingness_col) } - } + } if (length(missingness_col) > 0) { warning(paste0("Warning: Missing or empty values found in column(s): ", paste(missingness_col,collapse=", "))) } diff --git a/scripts/case_selection/tests/test_shared_fxns.R b/scripts/case_selection/tests/test_shared_fxns.R index 93e5ecf7..fdd7fdb3 100644 --- a/scripts/case_selection/tests/test_shared_fxns.R +++ b/scripts/case_selection/tests/test_shared_fxns.R @@ -104,51 +104,38 @@ test_that("remap_patient_characteristics works as expected", { expect_equal(result, expected_output) 
}) -test_that("check_for_missing_values - no missing or empty values in centers other than CHOP, PROV, JHU", { +test_that("check_for_missing_values - no missing or empty values", { data <- data.frame( - col1 = c(1, 2, 3, NA), - col2 = c("a", "b", "c", ""), - genie_patient_id = c('a', 'b', 'c', 'CHOP123') + col1 = c(1, 2, 3), + col2 = c("a", "b", "c"), + genie_patient_id = c("a", "b", "CHOP123"), + naaccr_race_code_tertiary = c("a", "b", "c"), + naaccr_race_code_secondary = c("a", "b", "c") ) - expect_warning(check_for_missing_values(data, c("col1", "col2")), NA) + expect_no_warning(check_for_missing_values(data, c("col1", "col2", "naaccr_race_code_tertiary", "naaccr_race_code_secondary"))) }) -test_that("check_for_missing_values - NAs are detected in centers other than CHOP, PROV, JHU", { +test_that("check_for_missing_values - missingness values are detected in NAACCR code columns in centers other than CHOP, PROV, JHU", { data <- data.frame( - col1 = c(1, NA, 3), + col1 = c(1, NA, ""), col2 = c("a", "b", "c"), - genie_patient_id = c('CHOP123', 'b', 'PROV234') + genie_patient_id = c("CHOP123", "b", "PROV234"), + naaccr_race_code_tertiary = c("a", "", "c"), + naaccr_race_code_secondary = c("a", "b", "c") ) - expect_warning(check_for_missing_values(data, c("col1", "col2")), - "Warning: Missing or empty values found in column\\(s\\): col1") + expect_warning(check_for_missing_values(data, c("col1", "col2", "naaccr_race_code_tertiary", "naaccr_race_code_secondary")), + "Warning: Missing or empty values found in column\\(s\\): naaccr_race_code_tertiary, col1") }) -test_that("check_for_missing_values - empty string values are detected in centers other than CHOP, PROV, JHU", { - data <- data.frame( - col1 = c(1, 2, 3), - col2 = c("a", "", "c"), - genie_patient_id = c('CHOP123', 'b', 'PROV234') - ) - expect_warning(check_for_missing_values(data, c("col1", "col2")), - "Warning: Missing or empty values found in column\\(s\\): col2") -}) - -test_that("check_for_missing_values - 
multiple missing and empty values are detected in centers other than CHOP, PROV, JHU", { +test_that("check_for_missing_values - missingness values are detected in NAACCR code columns in CHOP, PROV, JHU centers", { data <- data.frame( col1 = c(1, NA, ""), col2 = c("a", "", "c"), - genie_patient_id = c('CHOP123', 'b', 'PROV234') + genie_patient_id = c("CHOP123", "b", "PROV234"), + naaccr_race_code_tertiary = c("", "b", "c"), + naaccr_race_code_secondary = c("a", "b", NA) ) - expect_warning(check_for_missing_values(data, c("col1", "col2")), + expect_warning(check_for_missing_values(data, c("col1", "col2", "naaccr_race_code_tertiary", "naaccr_race_code_secondary")), "Warning: Missing or empty values found in column\\(s\\): col2, col1") }) - -test_that("check_for_missing_values - multiple missing and empty values are detected in CHOP, PROV, JHU centers", { - data <- data.frame( - col1 = c(1, NA, 2), - col2 = c("a", "", "c"), - genie_patient_id = c('a', 'CHOP123', 'PROV234') - ) - expect_warning(check_for_missing_values(data, c("col1", "col2")), NA) -}) \ No newline at end of file