diff --git a/R/compare_sessions.R b/R/compare_sessions.R index 10da6112..7b68ff96 100644 --- a/R/compare_sessions.R +++ b/R/compare_sessions.R @@ -4,8 +4,8 @@ #' It compares csv outputs from two sessions, finds their differences, and asks for a consensus. \cr \cr #' #' @param session_dir This directory should contain 2 csv files for each session (LOG_ and OUTPUT_), 4 csv files in total. -#' @param session1_base Base file name for session 1 e.g. 'NationalCommunityChildHealthDatabase(NCCHD)_BLOOD_TEST_2024-07-05_16-07-38.599493' -#' @param session2_base Base file name for session 1 e.g. 'NationalCommunityChildHealthDatabase(NCCHD)_BLOOD_TEST_2024-07-08_12-03-30.429336' +#' @param session1_base Base file name for session 1 e.g. 'NationalCommunityChildHealthDatabase(NCCHD)_BLOOD_TEST_2024-07-05-16-07-38' +#' @param session2_base Base file name for session 2 e.g. 'NationalCommunityChildHealthDatabase(NCCHD)_BLOOD_TEST_2024-07-08-12-03-30' #' @param json_file The full path to the metadata file used when running domain_mapping (should be the same for session 1 and session 2) #' @param domain_file The full path to the domain file used when running domain_mapping (should be the same for session 1 and session 2) #' @return It returns a csv output, which represents the consensus decisions between session 1 and session 2 diff --git a/R/domain_mapping.R b/R/domain_mapping.R index 9cb50155..ed77265c 100755 --- a/R/domain_mapping.R +++ b/R/domain_mapping.R @@ -2,14 +2,15 @@ #' #' This function will read in the metadata file for a chosen dataset, loop through all the data elements, and ask the user to catergorise/label each data element as belonging to one or more domains.\cr \cr #' The domains will appear in the Plots tab and dataset information will be printed to the R console, for the user's reference in making these categorisations. \cr \cr -#' A log file will be saved with the catergorisations made. 
-#' To speed up this process, some auto-categorisations will be made by the function for commonly occurring data elements. \cr \cr +#' These categorisations will be saved to a csv file, alongside a log file which summarises the session details. +#' To speed up this process, some auto-categorisations will be made by the function for commonly occurring data elements and categorisations for the same data element can be copied from one table to another. \cr \cr #' Example inputs are provided within the package data, for the user to run this function in a demo mode. #' @param json_file The metadata file. This should be downloaded from the metadata catalogue as a json file. See 'data-raw/maternity_indicators_dataset_(mids)_20240105T132210.json' for an example download. #' @param domain_file The domain list file. This should be a csv file created by the user, with each domain listed on a separate line. See 'data-raw/domain_list_demo.csv' for a template. #' @param look_up_file The look-up table file, with auto-categorisations. By default, the code uses 'data/look-up.rda'. The user can provide their own look-up table in the same format as 'data-raw/look-up.csv'. -#' @param output_dir The path to the directory where the csv output log will be saved. By default, the current working directory is used. -#' @return The function will return a log file with the mapping between data elements and domains, alongside details about the dataset. +#' @param output_dir The path to the directory where the two csv output files will be saved. By default, the current working directory is used. +#' @param table_copy Turn on copying between tables (TRUE or FALSE, default TRUE). If TRUE, categorisations you make for the last table you processed will be carried over to another, as long as the csv files share an output_dir. +#' @return The function will return two csv files: 'OUTPUT_' which contains the mappings and 'LOG_' which contains details about the dataset and session. 
#' @examples #' # Run in demo mode by providing no inputs: domain_mapping() #' # Demo mode will use the /data files provided in this package @@ -20,7 +21,7 @@ #' @importFrom utils read.csv write.csv #' @importFrom dplyr %>% arrange count group_by -domain_mapping <- function(json_file = NULL, domain_file = NULL, look_up_file = NULL, output_dir = NULL) { +domain_mapping <- function(json_file = NULL, domain_file = NULL, look_up_file = NULL, output_dir = NULL, table_copy = TRUE) { ## Load data: Check if demo data should be used ---- @@ -30,7 +31,7 @@ domain_mapping <- function(json_file = NULL, domain_file = NULL, look_up_file = domains <- get("domain_list") DomainListDesc <- "DemoList" cat("\n") - cli_alert_success("Running domain_mapping in demo mode using package data files") + cli_alert_info("Running domain_mapping in demo mode using package data files") demo_mode = TRUE } else if (is.null(json_file) || is.null(domain_file)) { # If only one of json_file and domain_file is NULL, throw error @@ -48,14 +49,18 @@ domain_mapping <- function(json_file = NULL, domain_file = NULL, look_up_file = # Check if user has provided a look-up table if (is.null(look_up_file)) { - cli_alert_success("Using the default look-up table in data/look-up.rda") + cli_alert_info("Using the default look-up table in data/look-up.rda") lookup <- get("look_up") } else { lookup <- read.csv(look_up_file) - cli_alert_success("Using look up file inputted by user") + cli_alert_info("Using look up file inputted by user") print(lookup) - } + } + + # If user has not provider output_dir, use current working dir: + if (is.null(output_dir)) { + output_dir = getwd() } ## Present domains plots panel for user's reference ---- colnames(domains)[1] = "Domain Name" @@ -127,6 +132,20 @@ domain_mapping <- function(json_file = NULL, domain_file = NULL, look_up_file = cli_h1("Table Last Updated") cat(meta_json$dataModel$childDataClasses[[dc]]$lastUpdated, "\n", fill = TRUE) + # Check if previous table output exists in 
this output_dir (for table copying) + if (table_copy == TRUE){ + dataset_search = paste0("^OUTPUT_",gsub(" ", "", meta_json$dataModel$label),'*') + csv_list <- data.frame(file = list.files(output_dir,pattern = dataset_search)) + if (nrow(csv_list) != 0){ + csv_list$date <- as.POSIXct(substring(csv_list$file,nchar(csv_list$file)-22,nchar(csv_list$file)-4), format="%Y-%m-%d-%H-%M-%S") + csv_last_filename <- csv_list[which.min(csv_list$date),] + csv_last <- read.csv(paste0(output_dir,'/',csv_last_filename$file)) + csv_last_exist <- TRUE + cat("\n") + cli_alert_info(paste0("Copying from previous session: ",csv_last_filename$file)) + } else {csv_last_exist <- FALSE} + } else {csv_last_exist <- FALSE} + table_desc <- "" while (table_desc != "Y" & table_desc != "y" & table_desc != "N" & table_desc != "n") { cat("\n \n") @@ -155,8 +174,7 @@ domain_mapping <- function(json_file = NULL, domain_file = NULL, look_up_file = selectTable_df <- selectTable_df[order(selectTable_df$Label), ] # Create unique output csv to log the results ---- - timestamp_now <- gsub(" ", "_", Sys.time()) - timestamp_now <- gsub(":", "-", timestamp_now) + timestamp_now <- format(Sys.time(),"%Y-%m-%d-%H-%M-%S") output_fname_csv <- paste0("OUTPUT_", gsub(" ", "", meta_json$dataModel$label), "_", gsub(" ", "", meta_json$dataModel$childDataClasses[[dc]]$label), "_", timestamp_now, ".csv") output_fname_log_csv <- paste0("LOG_", gsub(" ", "", meta_json$dataModel$label), "_", gsub(" ", "", meta_json$dataModel$childDataClasses[[dc]]$label), "_", timestamp_now, ".csv") @@ -200,31 +218,36 @@ domain_mapping <- function(json_file = NULL, domain_file = NULL, look_up_file = for (datavar in start_var:end_var) { cat("\n \n") cli_alert_info(paste(length(datavar:end_var),'left to process in this session')) - cli_alert_success("Processing data element {datavar} of {nrow(selectTable_df)}") + cli_alert_info("Processing data element {datavar} of {nrow(selectTable_df)}") + # prepare output + this_Output <- row_Output + 
this_Output[nrow(this_Output) + 1 , ] <- NA + this_Output$DataElement[1] <- selectTable_df$Label[datavar] + this_Output$DataElement_N[1] <- paste(as.character(datavar),'of',as.character(nrow(selectTable_df))) + # search if this data element matches with auto categorisations in lookup datavar_index <- which(lookup$DataElement == selectTable_df$Label[datavar]) #we should code this to ignore the case lookup_subset <- lookup[datavar_index,] - if (nrow(lookup_subset) == 1) { - # auto categorisations - this_Output <- row_Output - this_Output[nrow(this_Output) + 1 , ] <- NA - this_Output$DataElement[1] <- selectTable_df$Label[datavar] - this_Output$DataElement_N[1] <- paste(as.character(datavar),'of',as.character(nrow(selectTable_df))) + # search if this data element matches with any data elements processed in previous table + if (csv_last_exist == TRUE) { + datavar_index <- which(csv_last$DataElement == selectTable_df$Label[datavar]) + csv_last_subset <- csv_last[datavar_index,] + } else {csv_last_subset <- data.frame()} + # decide how to process the data element out of 3 options + if (nrow(lookup_subset) == 1) { # 1 - auto categorisation this_Output$Domain_code[1] <- lookup_subset$DomainCode this_Output$Note[1] <- "AUTO CATEGORISED" Output <- rbind(Output,this_Output) - } else { - # collect user responses - decision_output <- user_categorisation(selectTable_df$Label[datavar],selectTable_df$Description[datavar],selectTable_df$Type[datavar],max(Code$Code)) - # input user responses into output - this_Output <- row_Output - this_Output[nrow(this_Output) + 1 , ] <- NA - this_Output$DataElement[1] <- selectTable_df$Label[datavar] - this_Output$DataElement_N[1] <- paste(as.character(datavar),'of',as.character(nrow(selectTable_df))) - this_Output$Domain_code[1] <- decision_output$decision - this_Output$Note[1] <- decision_output$decision_note - Output <- rbind(Output,this_Output) - } - } # end of loop for DataElement + } else if (csv_last_exist == TRUE & nrow(csv_last_subset) 
== 1){ # 2 - copy from previous table + this_Output$Domain_code[1] <- csv_last_subset$Domain_code + suppressWarnings(this_Output$Note[1] <- paste0("COPIED FROM: ",csv_last_filename)) + Output <- rbind(Output,this_Output) + } else { # 3 - collect user responses + decision_output <- user_categorisation(selectTable_df$Label[datavar],selectTable_df$Description[datavar],selectTable_df$Type[datavar],max(Code$Code)) + this_Output$Domain_code[1] <- decision_output$decision + this_Output$Note[1] <- decision_output$decision_note + Output <- rbind(Output,this_Output) + } + } # end of loop for DataElement ## Print the AUTO CATEGORISED responses for this Table and request review ---- Output_auto <- subset(Output, Note == 'AUTO CATEGORISED') @@ -270,8 +293,9 @@ domain_mapping <- function(json_file = NULL, domain_file = NULL, look_up_file = if (review_cats == 'Y' | review_cats == 'y') { Output_not_auto <- subset(Output, Note != 'AUTO CATEGORISED') + Output_not_auto['Note (first 12 chars)'] <- substring(Output_not_auto$Note,1,11) cat("\n \n") - print(Output_not_auto[, c("DataElement", "Domain_code","Note")]) + print(Output_not_auto[, c("DataElement", "Domain_code","Note (first 12 chars)")]) cat("\n \n") # extract the rows to edit @@ -316,9 +340,6 @@ domain_mapping <- function(json_file = NULL, domain_file = NULL, look_up_file = ## Save final categorisations for this Table ---- Output$timestamp <- timestamp_now - if (is.null(output_dir)) { - output_dir = getwd() } - utils::write.csv(Output, paste(output_dir,output_fname_csv,sep='/'), row.names = FALSE) utils::write.csv(log_Output, paste(output_dir,output_fname_log_csv,sep='/'), row.names = FALSE) cat("\n") diff --git a/README.md b/README.md index 76884e83..644c3569 100644 --- a/README.md +++ b/README.md @@ -39,9 +39,13 @@ This `R` package takes a metadata file as input and facilitates the process of browsing through each table within a chosen dataset. 
The user is asked to categorise each data element (variable) within a table into a domain related to their research question, and these categorisations get saved in a csv file -for later reference. To speed up this process, the function automatically -categorises some variables that regularly appear in health datasets -(e.g. ID, Sex, Age). +for later reference. + +To speed up this process, the function automatically categorises some variables +that regularly appear in health datasets (e.g. ID, Sex, Age). The function also +accounts for the same data element appearing in multiple tables across a dataset, +and allows the user to activate a table copying function which copies categorisations +they've done for one table, onto the current table they are processing. 🚧 :warning: This package is in early development, and has only been tested on a limited number of metadata files. In theory, this package @@ -78,6 +82,10 @@ Read the documentation: ``` ?domain_mapping ``` +Set your working directory to be an empty folder you just created: +``` +setwd("/Users/your-username/test-browseMetadata") +``` Run the function in demo mode: ``` r @@ -104,8 +112,8 @@ For a research study, your domains are likely to be much more specific e.g. 'Pre The 4 default domains are always included [0-3], appended on to any domain list given. ``` -✔ Running domain_mapping in demo mode using package data files -✔ Using the default look-up table in data/look-up.rda +ℹ Running domain_mapping in demo mode using package data files +ℹ Using the default look-up table in data/look-up.rda Enter your initials: RS ``` @@ -194,7 +202,9 @@ Would you like to read a description of the table? (y/n): y Enter Y after the prompt to read the description, for the purpose of the demo. -It will now start looping through the data elements. If it skips over one it means it was auto-categorised (more on that later). +You can provide an optional free text note about this table, which will be saved in the log file. 
+ +It will now start looping through the data elements. If it skips over one it means it was auto-categorised or copied from a previous table already processed (more on that later). For this demo, it will only process 20 data elements (out of the 35 total). @@ -258,17 +268,17 @@ Press enter for now. It will then ask you if you want to review the categorisati ``` Would you like to review your categorisations? (y/n): y - DataElement Domain_code Note + DataElement Domain_code Note (first 12 chars) 4 APGAR_1 7 5 APGAR_2 7 -7 BIRTH_ORDER 7 10% missingness -8 BIRTH_TM 1,7 20% missingness +7 BIRTH_ORDER 7 10% missingness +8 BIRTH_TM 1,7 20% missingness 9 BIRTH_WEIGHT 7 10 BIRTH_WEIGHT_DEC 7 11 BREASTFEED_8_WKS_FLG 7 12 BREASTFEED_BIRTH_FLG 7 13 CHILD_ID_E 2 -14 CURR_LHB_CD_BIRTH 5,7 Place of birth +14 CURR_LHB_CD_BIRTH 5,7 Place of birth 15 DEL_CD 7 16 DOD 3,7 17 ETHNIC_GRP_CD 3 @@ -290,11 +300,11 @@ All finished! Take a look at the outputs: ``` ✔ Your final categorisations have been saved: -OUTPUT_NationalCommunityChildHealthDatabase(NCCHD)_CHILD_2024-04-05_14-37-36.csv +OUTPUT_NationalCommunityChildHealthDatabase(NCCHD)_CHILD_2024-04-05-14-37-36.csv ✔ Your session log has been saved: -LOG_NationalCommunityChildHealthDatabase(NCCHD)_CHILD_2024-04-05_14-37-36.csv +LOG_NationalCommunityChildHealthDatabase(NCCHD)_CHILD_2024-04-05-14-37-36.csv ✔ A summary plot has been saved: -PLOT_NationalCommunityChildHealthDatabase(NCCHD)_CHILD_2024-04-05_14-37-36.png +PLOT_NationalCommunityChildHealthDatabase(NCCHD)_CHILD_2024-04-05-14-37-36.png ``` The OUTPUT csv contains the categorisations you made. The LOG csv contains information about the session as a whole, including various metadata. @@ -303,34 +313,78 @@ These two csv files contain the same timestamp column. 
The PLOT png file saves a ### Using your own input files ```r -domain_mapping(json_file, domain_file, look_up_file) +domain_mapping(json_file, domain_file, look_up_file, output_dir, table_copy) ``` This code is in early development. To see known bugs or sub-optimal features refer to the [Issues](https://github.com/aim-rsf/browseMetadata/issues). -Run the code the same as the demo, using your own input files. +First, change the json file and domain file inputs. Later, consider changing the other 3 inputs, depending on your use-case. For example: -It will ask you to specify the range of variables you want to process (start variable:end variable), because you can choose to process a table across multiple sessions (particularly useful if the table has a large number of data elements). +```r +domain_mapping(json_file = 'path/your-json.json', domain_file = 'path/your-domains.csv') +``` -The json file: +Unlike in demo mode, it will ask you to specify the range of variables you want to process (start variable:end variable), because you can choose to process a table across multiple sessions (particularly useful if the table has a large number of data elements). 
+#### json file: - contains metadata about datasets of interest - downloaded from the metadata catalogue - see [data-raw/national_community_child_health_database_(ncchd)_20240405T130125.json](data-raw/national_community_child_health_database_(ncchd)_20240405T130125.json) for an example download -The domain_file: - +#### domain_file: - a csv file created by the user, with each domain listed on a separate line, no header - see [data-raw/domain_list_demo.csv](data-raw/domain_list_demo.csv) for a template - the first 4 domains will be auto populated (see demo above) -The lookup file: - +#### lookup file: - a [default lookup file](dataraw/look_up.csv) is used by the domain_mapping function - optional: a csv can be created by the user (using the same format as the default) and provided as the input - the lookup file makes auto-categorisations intended for variables that come up regularly in health datasets (e.g. IDs and demographics) - the lookup file only works for 1:1 mappings right now, i.e. the DataElement should only be listed once in the lookup file + #### output dir: + - the path to the directory where the two csv output files will be saved. 
By default, the current working directory is used + +#### table_copy: +- default is TRUE, so set this to FALSE if you want to deactivate table copying +- table copying means that the categorisations you make for the last table you processed will be carried over to this table, as long as the csv files share an output_dir +- this can be useful because the same data elements (variables) appear across multiple tables within one dataset +- copying from one table to the next will save the user time, and ensure consistency of categorisations across tables +- the 'Note' column in the output csv file will indicate that the categorisation has been copied and where from +- a typical session could look like this: + +*Run 1, select table 'EXAM'* + +``` + domain_mapping() + +ℹ Running domain_mapping in demo mode using package data files +ℹ Using the default look-up table in data/look-up.rda +``` + +*Run 2, select table 'CHILD' (the function notices we have already run the table 'EXAM')* + +``` + domain_mapping() + +ℹ Running domain_mapping in demo mode using package data files +ℹ Using the default look-up table in data/look-up.rda +ℹ Copying from previous session: OUTPUT_NationalCommunityChildHealthDatabase(NCCHD)_EXAM_[datetime].csv +``` + +*Run 3, select table 'REFR_IMM_VAC' (the function notices we have already run the table 'CHILD')* + +``` + domain_mapping() + +ℹ Running domain_mapping in demo mode using package data files +ℹ Using the default look-up table in data/look-up.rda +ℹ Copying from previous session: OUTPUT_NationalCommunityChildHealthDatabase(NCCHD)_CHILD_[datetime].csv +``` + +*And so on ...* Each run has the potential to be shorter for the user to complete because, when the same data elements appear across tables, the user will not be asked to categorise them twice. 
+ + ### Potential use-cases for the output files The csv output file containing the categorisation for each data element could be used as an input in later analysis steps to filter variables and visualise how each variable maps to research domains of interest. diff --git a/man/compare_sessions.Rd b/man/compare_sessions.Rd index 568114dd..6f718a34 100644 --- a/man/compare_sessions.Rd +++ b/man/compare_sessions.Rd @@ -15,9 +15,9 @@ compare_sessions( \arguments{ \item{session_dir}{This directory should contain 2 csv files for each session (LOG_ and OUTPUT_), 4 csv files in total.} -\item{session1_base}{Base file name for session 1 e.g. 'NationalCommunityChildHealthDatabase(NCCHD)_BLOOD_TEST_2024-07-05_16-07-38.599493'} +\item{session1_base}{Base file name for session 1 e.g. 'NationalCommunityChildHealthDatabase(NCCHD)_BLOOD_TEST_2024-07-05-16-07-38'} -\item{session2_base}{Base file name for session 1 e.g. 'NationalCommunityChildHealthDatabase(NCCHD)_BLOOD_TEST_2024-07-08_12-03-30.429336'} +\item{session2_base}{Base file name for session 2 e.g. 'NationalCommunityChildHealthDatabase(NCCHD)_BLOOD_TEST_2024-07-08-12-03-30'} \item{json_file}{The full path to the metadata file used when running domain_mapping (should be the same for session 1 and session 2)} diff --git a/man/domain_mapping.Rd b/man/domain_mapping.Rd index ef5c391f..74adc116 100644 --- a/man/domain_mapping.Rd +++ b/man/domain_mapping.Rd @@ -8,7 +8,8 @@ domain_mapping( json_file = NULL, domain_file = NULL, look_up_file = NULL, - output_dir = NULL + output_dir = NULL, + table_copy = TRUE ) } \arguments{ @@ -18,16 +19,18 @@ domain_mapping( \item{look_up_file}{The look-up table file, with auto-categorisations. By default, the code uses 'data/look-up.rda'. The user can provide their own look-up table in the same format as 'data-raw/look-up.csv'.} -\item{output_dir}{The path to the directory where the csv output log will be saved. 
By default, the current working directory is used.} +\item{output_dir}{The path to the directory where the two csv output files will be saved. By default, the current working directory is used.} + +\item{table_copy}{Turn on copying between tables (TRUE or FALSE, default TRUE). If TRUE, categorisations you make for the last table you processed will be carried over to another, as long as the csv files share an output_dir.} } \value{ -The function will return a log file with the mapping between data elements and domains, alongside details about the dataset. +The function will return two csv files: 'OUTPUT_' which contains the mappings and 'LOG_' which contains details about the dataset and session. } \description{ This function will read in the metadata file for a chosen dataset, loop through all the data elements, and ask the user to catergorise/label each data element as belonging to one or more domains.\cr \cr The domains will appear in the Plots tab and dataset information will be printed to the R console, for the user's reference in making these categorisations. \cr \cr -A log file will be saved with the catergorisations made. -To speed up this process, some auto-categorisations will be made by the function for commonly occurring data elements. \cr \cr +These categorisations will be saved to a csv file, alongside a log file which summarises the session details. +To speed up this process, some auto-categorisations will be made by the function for commonly occurring data elements and categorisations for the same data element can be copied from one table to another. \cr \cr Example inputs are provided within the package data, for the user to run this function in a demo mode. } \examples{