diff --git a/.Rbuildignore b/.Rbuildignore index e13c4051..0f5a3369 100644 --- a/.Rbuildignore +++ b/.Rbuildignore @@ -1,3 +1,9 @@ ^.*\.Rproj$ ^\.Rproj\.user$ ^LICENSE\.md$ +^doc$ +^Meta$ +^_pkgdown\.yml$ +^docs$ +^pkgdown$ +^\.github$ diff --git a/.github/.gitignore b/.github/.gitignore new file mode 100644 index 00000000..2d19fc76 --- /dev/null +++ b/.github/.gitignore @@ -0,0 +1 @@ +*.html diff --git a/.github/workflows/pkgdown.yaml b/.github/workflows/pkgdown.yaml new file mode 100644 index 00000000..ed7650c7 --- /dev/null +++ b/.github/workflows/pkgdown.yaml @@ -0,0 +1,48 @@ +# Workflow derived from https://github.com/r-lib/actions/tree/v2/examples +# Need help debugging build failures? Start at https://github.com/r-lib/actions#where-to-find-help +on: + push: + branches: [main, master] + pull_request: + branches: [main, master] + release: + types: [published] + workflow_dispatch: + +name: pkgdown + +jobs: + pkgdown: + runs-on: ubuntu-latest + # Only restrict concurrency for non-PR jobs + concurrency: + group: pkgdown-${{ github.event_name != 'pull_request' || github.run_id }} + env: + GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }} + permissions: + contents: write + steps: + - uses: actions/checkout@v3 + + - uses: r-lib/actions/setup-pandoc@v2 + + - uses: r-lib/actions/setup-r@v2 + with: + use-public-rspm: true + + - uses: r-lib/actions/setup-r-dependencies@v2 + with: + extra-packages: any::pkgdown, local::. 
+ needs: website + + - name: Build site + run: pkgdown::build_site_github_pages(new_process = FALSE, install = FALSE) + shell: Rscript {0} + + - name: Deploy to GitHub pages 🚀 + if: github.event_name != 'pull_request' + uses: JamesIves/github-pages-deploy-action@v4.4.1 + with: + clean: false + branch: gh-pages + folder: docs diff --git a/.gitignore b/.gitignore index 43876b2d..41d45466 100644 --- a/.gitignore +++ b/.gitignore @@ -29,4 +29,8 @@ venv.bak/ # Test/Input files input_files/* output_files/* -test_code/* \ No newline at end of file +test_code/* +inst/doc +/doc/ +/Meta/ +docs diff --git a/DESCRIPTION b/DESCRIPTION index 5c6d2bcf..5fea4475 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -24,3 +24,8 @@ Imports: grid, gridExtra, rjson +Suggests: + knitr, + rmarkdown +VignetteBuilder: knitr +URL: https://aim-rsf.github.io/browseMetadata/ diff --git a/R/data-domain_list.R b/R/data-domain_list.R index 83ae615b..868720e6 100644 --- a/R/data-domain_list.R +++ b/R/data-domain_list.R @@ -11,6 +11,6 @@ #' @usage data(domain_list) #' #' @format A data frame with 5 rows and 1 column -#' +#' #' @source The csv was manually created -"domain_list" \ No newline at end of file +"domain_list" diff --git a/R/data-json_metadata.R b/R/data-json_metadata.R index c5a9b54c..b7ffc7d1 100644 --- a/R/data-json_metadata.R +++ b/R/data-json_metadata.R @@ -3,7 +3,7 @@ #' Example metadata for a health dataset, to demo the function domain_mapping.R \cr \cr #' This data was created with these five steps: #' \enumerate{ -#' \item Go to https://modelcatalogue.cs.ox.ac.uk/hdruk_live/#/catalogue/dataModel/17e86f3f-ec29-4c8e-9efc-8793a74b107d +#' \item Go to https://modelcatalogue.cs.ox.ac.uk/hdruk_live/#/catalogue/dataModel/17e86f3f-ec29-4c8e-9efc-8793a74b107d #' \item Download json metadata file by selecting the 'Export as JSON' option on the download button #' \item \code{install.packages("rjson")} #' \item \code{json_metadata <- rjson::fromJSON(file = 
'/browseMetadata/data-raw/maternity_indicators_dataset_(mids)_20240105T132210.json')} diff --git a/R/domain_mapping.R b/R/domain_mapping.R index 539131b2..56c26266 100755 --- a/R/domain_mapping.R +++ b/R/domain_mapping.R @@ -1,261 +1,250 @@ #' domain_mapping #' -#'This function will read in the metadata file for a chosen dataset, loop through all the variables, and ask the user to catergorise/label each variable as belonging to one or more domains.\cr \cr -#'The domains will appear in the Plots tab and dataset information will be printed to the R console, for the user's reference in making these categorisations. \cr \cr -#'A log file will be saved with the catergorisations made. -#'To speed up this process, some auto-categorisations will be made by the function for commonly occurring variables; -#'these auto-categorisations should be verified by the user by checking the csv log file. \cr \cr -#'Example inputs are provided within the package data, for the user to run this function in a demo mode. -#'@param json_file The metadata file. This should be downloaded from the metadata catalogue as a json file. See 'data-raw/maternity_indicators_dataset_(mids)_20240105T132210.json' for an example download. -#'@param domain_file The domain list file. This should be a csv file created by the user, with each domain listed on a separate line. See 'data-raw/domain_list_demo.csv' for a template. -#'@return The function will return a log file with the mapping between dataset variables and domains, alongside details about the dataset. -#'@examples -#'# Run in demo mode by providing no inputs: domain_mapping() -#'# Demo mode will use the /data files provided in this package -#'# Respond with your initials when prompted. -#'# Respond 'Demo List ' for the description of domain list. -#'# Respond 'Y' if you want to see the descriptions printed out. -#'# Respond '1,10' to the RANGE OF VARIABLES prompt (or process the full 93 variables if you like!) 
-#'# Reference the plot tab and categorise each variable into a single ('1') or multiple ('1,2') domain. -#'# Write a note explaining your category choice (optional). -#'@export -#'@importFrom graphics plot.new -#'@importFrom utils read.csv write.csv - -domain_mapping <- function(json_file= NULL,domain_file= NULL) { - - - # Load data: Check if demo data should be used +#' This function will read in the metadata file for a chosen dataset, loop through all the variables, and ask the user to catergorise/label each variable as belonging to one or more domains.\cr \cr +#' The domains will appear in the Plots tab and dataset information will be printed to the R console, for the user's reference in making these categorisations. \cr \cr +#' A log file will be saved with the catergorisations made. +#' To speed up this process, some auto-categorisations will be made by the function for commonly occurring variables; +#' these auto-categorisations should be verified by the user by checking the csv log file. \cr \cr +#' Example inputs are provided within the package data, for the user to run this function in a demo mode. +#' @param json_file The metadata file. This should be downloaded from the metadata catalogue as a json file. See 'data-raw/maternity_indicators_dataset_(mids)_20240105T132210.json' for an example download. +#' @param domain_file The domain list file. This should be a csv file created by the user, with each domain listed on a separate line. See 'data-raw/domain_list_demo.csv' for a template. +#' @return The function will return a log file with the mapping between dataset variables and domains, alongside details about the dataset. +#' @examples +#' # Run in demo mode by providing no inputs: domain_mapping() +#' # Demo mode will use the /data files provided in this package +#' # Respond with your initials when prompted. +#' # Respond 'Demo List ' for the description of domain list. +#' # Respond 'Y' if you want to see the descriptions printed out. 
+#' # Respond '1,10' to the RANGE OF VARIABLES prompt (or process the full 93 variables if you like!) +#' # Reference the plot tab and categorise each variable into a single ('1') +#' # or multiple ('1,2') domain. +#' # Write a note explaining your category choice (optional). +#' @export +#' @importFrom graphics plot.new +#' @importFrom utils read.csv write.csv + +domain_mapping <- function(json_file = NULL, domain_file = NULL) { + # Load data: Check if demo data should be used if (is.null(json_file) && is.null(domain_file)) { # If both json_file and domain_file are NULL, use demo data - meta_json <- get('json_metadata') - domains <- get('domain_list') - DomainListDesc <- 'DemoList' - cat('\n') - cli_alert_info('Running domain_mapping in demo mode using package data files') + meta_json <- get("json_metadata") + domains <- get("domain_list") + DomainListDesc <- "DemoList" + cat("\n") + cli_alert_info("Running domain_mapping in demo mode using package data files") } else if (is.null(json_file) || is.null(domain_file)) { # If only one of json_file and domain_file is NULL, throw error - cat('\n') - cli_alert_danger('Please provide both json_file and domain_file (or neither file, to run in demo mode)') + cat("\n") + cli_alert_danger("Please provide both json_file and domain_file (or neither file, to run in demo mode)") stop() } else { # Read in the json file containing the meta data meta_json <- rjson::fromJSON(file = json_file) # Read in the domain file containing the meta data - domains <- read.csv(domain_file,header = FALSE) + domains <- read.csv(domain_file, header = FALSE) DomainListDesc <- tools::file_path_sans_ext(basename(domain_file)) } # Present domains plots panel for user's reference ---- graphics::plot.new() - domains_extend <- rbind(c('*NO MATCH / UNSURE*'),c('*METADATA*'), c('*ALF ID*'),c('*OTHER ID*'),c('*DEMOGRAPHICS*'),domains) - gridExtra::grid.table(domains_extend[1],cols='Domain',rows=0:(nrow(domains_extend)-1)) + domains_extend <- rbind(c("*NO 
MATCH / UNSURE*"), c("*METADATA*"), c("*ALF ID*"), c("*OTHER ID*"), c("*DEMOGRAPHICS*"), domains) + gridExtra::grid.table(domains_extend[1], cols = "Domain", rows = 0:(nrow(domains_extend) - 1)) # Get user and demo list info for log file ---- User_Initials <- "" while (User_Initials == "") { cat("\n \n") - User_Initials <- readline(prompt="ENTER INITIALS: ") + User_Initials <- readline(prompt = "ENTER INITIALS: ") } # Print information about Data Asset ---- cli_h1("Data Asset Name") - cat(meta_json$dataModel$label,fill=TRUE) + cat(meta_json$dataModel$label, fill = TRUE) cli_h1("Data Asset Last Updated") - cat(meta_json$dataModel$lastUpdated,fill=TRUE) + cat(meta_json$dataModel$lastUpdated, fill = TRUE) cli_h1("Data Asset File Exported By") - cat(meta_json$exportMetadata$exportedBy, "at", meta_json$exportMetadata$exportedOn,fill=TRUE) + cat(meta_json$exportMetadata$exportedBy, "at", meta_json$exportMetadata$exportedOn, fill = TRUE) nDataClasses <- length(meta_json$dataModel$childDataClasses) - cat('\n') + cat("\n") cli_alert_info("Found {nDataClasses} Data Class{?es} ({nDataClasses} table{?s}) in this Data Asset") - cat('\n') + cat("\n") dataasset_desc <- "" while (dataasset_desc != "Y" & dataasset_desc != "N") { cat("\n \n") - dataasset_desc <- readline(prompt="Would you like to read a description of the Data Asset? (Y/N) ") + dataasset_desc <- readline(prompt = "Would you like to read a description of the Data Asset? 
(Y/N) ") } if (dataasset_desc == "Y") { cli_h1("Data Asset Description") - cat(meta_json$dataModel$description,fill=TRUE) - readline(prompt="Press [enter] to proceed") + cat(meta_json$dataModel$description, fill = TRUE) + readline(prompt = "Press [enter] to proceed") } # Extract each DataClass (Table) for (dc in 1:nDataClasses) { - cat('\n') + cat("\n") cli_alert_info("Processing Data Class (Table) {dc} of {nDataClasses}") cli_h1("Data Class Name") - cat(meta_json$dataModel$childDataClasses[[dc]]$label,fill=TRUE) + cat(meta_json$dataModel$childDataClasses[[dc]]$label, fill = TRUE) cli_h1("Data Class Last Updated") - cat(meta_json$dataModel$childDataClasses[[dc]]$lastUpdated,'\n',fill=TRUE) + cat(meta_json$dataModel$childDataClasses[[dc]]$lastUpdated, "\n", fill = TRUE) dataclass_desc <- "" while (dataclass_desc != "Y" & dataclass_desc != "N") { cat("\n \n") - dataclass_desc <- readline(prompt="Would you like to read a description of the Data Class (Table)? (Y/N) ") + dataclass_desc <- readline(prompt = "Would you like to read a description of the Data Class (Table)? (Y/N) ") } if (dataclass_desc == "Y") { cli_h1("Data Class Description") - cat(meta_json$dataModel$childDataClasses[[dc]]$description,fill=TRUE) - readline(prompt="Press [enter] to proceed") + cat(meta_json$dataModel$childDataClasses[[dc]]$description, fill = TRUE) + readline(prompt = "Press [enter] to proceed") } thisDataClass <- meta_json$dataModel$childDataClasses[[dc]]$childDataElements # probably a better way of dealing with complex json files in R ... 
- thisDataClass_df <- data.frame(do.call(rbind,thisDataClass)) # nested list to dataframe - dataType_df <- data.frame(do.call(rbind,thisDataClass_df$dataType)) # nested list to dataframe + thisDataClass_df <- data.frame(do.call(rbind, thisDataClass)) # nested list to dataframe + dataType_df <- data.frame(do.call(rbind, thisDataClass_df$dataType)) # nested list to dataframe - selectDataClass_df <- data.frame (Label = unlist(thisDataClass_df$label), - Description = unlist(thisDataClass_df$description), - Type = unlist(dataType_df$label) + selectDataClass_df <- data.frame( + Label = unlist(thisDataClass_df$label), + Description = unlist(thisDataClass_df$description), + Type = unlist(dataType_df$label) ) - selectDataClass_df <- selectDataClass_df[order(selectDataClass_df$Label),] + selectDataClass_df <- selectDataClass_df[order(selectDataClass_df$Label), ] # Create unique output csv to log the results ---- - timestamp_now <- gsub(" ", "_",Sys.time()) - timestamp_now <- gsub(":", "-",timestamp_now) - - output_fname <- paste0("LOG_",gsub(" ", "",meta_json$dataModel$label),'_',gsub(" ", "",meta_json$dataModel$childDataClasses[[dc]]$label),'_',timestamp_now,".csv") - - Output <- data.frame(Initials = c(""), - MetaDataVersion = c(""), - MetaDataLastUpdated = c(""), - DomainListDesc = c(""), - DataAsset = c(""), - DataClass = c(""), - DataElement = c(""), - Domain_code = c(""), - Note = c("") + timestamp_now <- gsub(" ", "_", Sys.time()) + timestamp_now <- gsub(":", "-", timestamp_now) + + output_fname <- paste0("LOG_", gsub(" ", "", meta_json$dataModel$label), "_", gsub(" ", "", meta_json$dataModel$childDataClasses[[dc]]$label), "_", timestamp_now, ".csv") + + Output <- data.frame( + Initials = c(""), + MetaDataVersion = c(""), + MetaDataLastUpdated = c(""), + DomainListDesc = c(""), + DataAsset = c(""), + DataClass = c(""), + DataElement = c(""), + Domain_code = c(""), + Note = c("") ) # User inputs ---- cat("\n \n") - select_vars_n <- readline(prompt="RANGE OF VARIABLES 
(DATA ELEMENTS) TO PROCESS (write as 'start_var,end_var' or press Enter to process all): ") + select_vars_n <- readline(prompt = "RANGE OF VARIABLES (DATA ELEMENTS) TO PROCESS (write as 'start_var,end_var' or press Enter to process all): ") if (select_vars_n == "") { start_var <- 1 end_var <- length(thisDataClass) } else { - seperate_vars <- unlist(strsplit(select_vars_n,",")) + seperate_vars <- unlist(strsplit(select_vars_n, ",")) start_var <- as.numeric(seperate_vars[1]) end_var <- as.numeric(seperate_vars[2]) } # Loop through each variable, request response from the user to match to a domain ---- - for (datavar in start_var:end_var ) { - + for (datavar in start_var:end_var) { # auto categorise (full string and partial string matches) if (selectDataClass_df$Label[datavar] == "NA") { - - Output [ nrow(Output) + 1 , ] <- NA + Output[nrow(Output) + 1, ] <- NA Output$DataElement[datavar] Output$DataElement[datavar] <- selectDataClass_df$Label[datavar] - Output$Domain_code[datavar] <- '0' - Output$Note[datavar] <- 'AUTO CATEGORISED' - + Output$Domain_code[datavar] <- "0" + Output$Note[datavar] <- "AUTO CATEGORISED" } else if (selectDataClass_df$Label[datavar] == "AVAIL_FROM_DT") { - - Output [ nrow(Output) + 1 , ] <- NA + Output[nrow(Output) + 1, ] <- NA Output$DataElement[datavar] Output$DataElement[datavar] <- selectDataClass_df$Label[datavar] - Output$Domain_code[datavar] <- '1' - Output$Note[datavar] <- 'AUTO CATEGORISED' - - } else if ((selectDataClass_df$Label[datavar] == "ALF_E") - || (selectDataClass_df$Label[datavar] == "RALF") - || (selectDataClass_df$Label[datavar] == "ALF_STS_CD") - || (selectDataClass_df$Label[datavar] == "ALF_MTCH_PCT") - || (grepl("_ALF_E", selectDataClass_df$Label[datavar],ignore.case = TRUE)) # grepl because of MOTHER_ALF_E and CHILD_ALF_E etc. 
- || (grepl("_RALF", selectDataClass_df$Label[datavar],ignore.case = TRUE)) - || (grepl("_ALF_STS_CD", selectDataClass_df$Label[datavar],ignore.case = TRUE)) - || (grepl("_ALF_MTCH_PCT", selectDataClass_df$Label[datavar],ignore.case = TRUE))) - { - Output [ nrow(Output) + 1 , ] <- NA + Output$Domain_code[datavar] <- "1" + Output$Note[datavar] <- "AUTO CATEGORISED" + } else if ((selectDataClass_df$Label[datavar] == "ALF_E") || + (selectDataClass_df$Label[datavar] == "RALF") || + (selectDataClass_df$Label[datavar] == "ALF_STS_CD") || + (selectDataClass_df$Label[datavar] == "ALF_MTCH_PCT") || + (grepl("_ALF_E", selectDataClass_df$Label[datavar], ignore.case = TRUE)) # grepl because of MOTHER_ALF_E and CHILD_ALF_E etc. + || (grepl("_RALF", selectDataClass_df$Label[datavar], ignore.case = TRUE)) || + (grepl("_ALF_STS_CD", selectDataClass_df$Label[datavar], ignore.case = TRUE)) || + (grepl("_ALF_MTCH_PCT", selectDataClass_df$Label[datavar], ignore.case = TRUE))) { + Output[nrow(Output) + 1, ] <- NA Output$DataElement[datavar] <- selectDataClass_df$Label[datavar] - Output$Domain_code[datavar] <- '2' - Output$Note[datavar] <- 'AUTO CATEGORISED' + Output$Domain_code[datavar] <- "2" + Output$Note[datavar] <- "AUTO CATEGORISED" + } else if (grepl("_ID_", selectDataClass_df$Label[datavar], ignore.case = TRUE)) { # picking up generic IDs - } else if (grepl("_ID_", selectDataClass_df$Label[datavar],ignore.case = TRUE)) { # picking up generic IDs - - Output [ nrow(Output) + 1 , ] <- NA + Output[nrow(Output) + 1, ] <- NA Output$DataElement[datavar] <- selectDataClass_df$Label[datavar] - Output$Domain_code[datavar] <- '3' - Output$Note[datavar] <- 'AUTO CATEGORISED' - + Output$Domain_code[datavar] <- "3" + Output$Note[datavar] <- "AUTO CATEGORISED" } else if ((selectDataClass_df$Label[datavar] == "AGE") # likely to be a better way to code this section with fewer lines - || (selectDataClass_df$Label[datavar] == "DOB") - || (selectDataClass_df$Label[datavar] == "WOB") - || 
(selectDataClass_df$Label[datavar] == "SEX") - || (selectDataClass_df$Label[datavar] == "GENDER") - || (selectDataClass_df$Label[datavar] == "GNDR") - || (grepl("_AGE",selectDataClass_df$Label[datavar],ignore.case = TRUE)) - || (grepl("_DOB",selectDataClass_df$Label[datavar],ignore.case = TRUE)) - || (grepl("_WOB",selectDataClass_df$Label[datavar],ignore.case = TRUE)) - || (grepl("_SEX",selectDataClass_df$Label[datavar],ignore.case = TRUE)) - || (grepl("_GENDER",selectDataClass_df$Label[datavar],ignore.case = TRUE)) - || (grepl("_GNDR",selectDataClass_df$Label[datavar],ignore.case = TRUE)) - || (grepl("AGE_",selectDataClass_df$Label[datavar],ignore.case = TRUE)) - || (grepl("DOB_",selectDataClass_df$Label[datavar],ignore.case = TRUE)) - || (grepl("WOB_",selectDataClass_df$Label[datavar],ignore.case = TRUE)) - || (grepl("SEX_",selectDataClass_df$Label[datavar],ignore.case = TRUE)) - || (grepl("GENDER_",selectDataClass_df$Label[datavar],ignore.case = TRUE)) - || (grepl("GNDR_",selectDataClass_df$Label[datavar],ignore.case = TRUE))) - { - Output [ nrow(Output) + 1 , ] <- NA + || (selectDataClass_df$Label[datavar] == "DOB") || + (selectDataClass_df$Label[datavar] == "WOB") || + (selectDataClass_df$Label[datavar] == "SEX") || + (selectDataClass_df$Label[datavar] == "GENDER") || + (selectDataClass_df$Label[datavar] == "GNDR") || + (grepl("_AGE", selectDataClass_df$Label[datavar], ignore.case = TRUE)) || + (grepl("_DOB", selectDataClass_df$Label[datavar], ignore.case = TRUE)) || + (grepl("_WOB", selectDataClass_df$Label[datavar], ignore.case = TRUE)) || + (grepl("_SEX", selectDataClass_df$Label[datavar], ignore.case = TRUE)) || + (grepl("_GENDER", selectDataClass_df$Label[datavar], ignore.case = TRUE)) || + (grepl("_GNDR", selectDataClass_df$Label[datavar], ignore.case = TRUE)) || + (grepl("AGE_", selectDataClass_df$Label[datavar], ignore.case = TRUE)) || + (grepl("DOB_", selectDataClass_df$Label[datavar], ignore.case = TRUE)) || + (grepl("WOB_", 
selectDataClass_df$Label[datavar], ignore.case = TRUE)) || + (grepl("SEX_", selectDataClass_df$Label[datavar], ignore.case = TRUE)) || + (grepl("GENDER_", selectDataClass_df$Label[datavar], ignore.case = TRUE)) || + (grepl("GNDR_", selectDataClass_df$Label[datavar], ignore.case = TRUE))) { + Output[nrow(Output) + 1, ] <- NA Output$DataElement[datavar] <- selectDataClass_df$Label[datavar] - Output$Domain_code[datavar] <- '4' - Output$Note[datavar] <- 'AUTO CATEGORISED' - + Output$Domain_code[datavar] <- "4" + Output$Note[datavar] <- "AUTO CATEGORISED" } else { - # user response - cat(paste("\nDATA ELEMENT -----> ",selectDataClass_df$Label[datavar], - "\n\nDESCRIPTION -----> ",selectDataClass_df$Description[datavar], - "\n\nDATA TYPE -----> ",selectDataClass_df$Type[datavar],"\n")) + cat(paste( + "\nDATA ELEMENT -----> ", selectDataClass_df$Label[datavar], + "\n\nDESCRIPTION -----> ", selectDataClass_df$Description[datavar], + "\n\nDATA TYPE -----> ", selectDataClass_df$Type[datavar], "\n" + )) decision <- "" while (decision == "") { cat("\n \n") - decision <- readline(prompt="CATEGORISE THIS VARIABLE (input a comma seperated list of domain numbers): ") + decision <- readline(prompt = "CATEGORISE THIS VARIABLE (input a comma seperated list of domain numbers): ") } decision_note <- "" while (decision_note == "") { cat("\n \n") - decision_note <- readline(prompt="NOTES (write 'N' if no notes): ") + decision_note <- readline(prompt = "NOTES (write 'N' if no notes): ") } - Output [ nrow(Output) + 1 , ] <- NA + Output[nrow(Output) + 1, ] <- NA Output$DataElement[datavar] <- selectDataClass_df$Label[datavar] Output$Domain_code[datavar] <- decision Output$Note[datavar] <- decision_note - } - } # Fill in columns that have all rows identical - Output$Initials = User_Initials - Output$MetaDataVersion = meta_json$dataModel$documentationVersion - Output$MetaDataLastUpdated = meta_json$dataModel$lastUpdated - Output$DomainListDesc = DomainListDesc - Output$DataAsset = 
meta_json$dataModel$label - Output$DataClass = meta_json$dataModel$childDataClasses[[dc]]$label + Output$Initials <- User_Initials + Output$MetaDataVersion <- meta_json$dataModel$documentationVersion + Output$MetaDataLastUpdated <- meta_json$dataModel$lastUpdated + Output$DomainListDesc <- DomainListDesc + Output$DataAsset <- meta_json$dataModel$label + Output$DataClass <- meta_json$dataModel$childDataClasses[[dc]]$label # Save file & print the responses to be saved - Output[Output == ''] <- NA - utils::write.csv(Output, output_fname, row.names=FALSE) #save as we go in case session terminates prematurely + Output[Output == ""] <- NA + utils::write.csv(Output, output_fname, row.names = FALSE) # save as we go in case session terminates prematurely cat("\n") cli_alert_info("The below responses will be saved to {output_fname}") cat("\n") - print(Output[,c("DataClass","DataElement","Domain_code","Note")]) + print(Output[, c("DataClass", "DataElement", "Domain_code", "Note")]) } cat("\n \n") cli_alert_warning("Please check the auto categorised data elements are accurate!") cli_alert_warning("Manually edit csv file to correct errors, if needed.") } - diff --git a/README.md b/README.md index 50266c3d..57ef3376 100644 --- a/README.md +++ b/README.md @@ -1,97 +1,151 @@ +--- +editor_options: + markdown: + wrap: 72 +--- + # About `browseMetadata` + -[![All Contributors](https://img.shields.io/badge/all_contributors-3-orange.svg?style=flat-square)](#contributors-) - -This `R` package was created to help a researcher browse the health datasets in the [SAIL databank](https://saildatabank.com). It is intended to be useful in the *earlier* stages of a project, where datasets are being scoped out. 
When a research team has not yet got access to the data they can still browse the metadata, and start to address such questions as: +[![All +Contributors](https://img.shields.io/badge/all_contributors-3-orange.svg?style=flat-square)](#contributors-) + [![Lifecycle: +experimental](https://img.shields.io/badge/lifecycle-experimental-orange.svg)](https://lifecycle.r-lib.org/articles/stages.html#experimental) + +This `R` package was created to help a researcher browse the health +datasets in the [SAIL databank](https://saildatabank.com). It is +intended to be useful in the *earlier* stages of a project, where +datasets are being scoped out. When a research team has not yet got +access to the data they can still browse the metadata, and start to +address such questions as: :question: what datasets are available? :question: what datasets do I need for my research question? -:question: which variables within these datasets map onto my research domains of interest? (e.g. socioeconomic factors, childhood adverse events, medical diagnoses, culture and community) +:question: which variables within these datasets map onto my research +domains of interest? (e.g. socioeconomic factors, childhood adverse +events, medical diagnoses, culture and community) ## What does the R package do? -This `R` package is a planning tool, designed to be used alongside other tools and sources of information about health datasets for research. - -If a researcher wants to access datasets within SAIL databank, how do they know which variables will represent the concepts they care about for their research question? For many health datasets, including SAIL, the metadata is publicly available. This `R` package uses the [Health Data Research Gateway](https://web.www.healthdatagateway.org/search?search=&datasetSort=latest&tab=Datasets) and the connected [Metadata Catalogue](https://modelcatalogue.cs.ox.ac.uk/hdruk_live/). 
This `R` package has a function which takes a metadata file as input and facilitates the process of browsing through each dataset and variable. The user is asked to categorise each variable into a domain related to their research question, and these categorisations get saved in a csv file for later reference. To speed up this process, the function automatically categorises some variables that regularly appear in health datasets (e.g. ID, Sex, Age). - -🚧 :warning: This package is in early development, and has only been tested on a limited number of metadata files. In theory, this package should work for **any dataset listed on the Health Data Research Gateway (not just SAIL)** as long as a json metadata file can be downloaded. In practice, it has only been tested on a limited number of metadata files for SAIL databank. - -## Getting started with metadata - -There are many existing tools that allow you to browse metadata for health datasets. These are listed in the [RESOURCES.md](RESOURCES.md) file in this repository. :bulb: These tools may be sufficient for you to address the example questions listed above. +This `R` package is a planning tool, designed to be used alongside other +tools and sources of information about health datasets for research. + +If a researcher wants to access datasets within SAIL databank, how do +they know which variables will represent the concepts they care about +for their research question? For many health datasets, including SAIL, +the metadata is publicly available. This `R` package uses the [Health +Data Research +Gateway](https://web.www.healthdatagateway.org/search?search=&datasetSort=latest&tab=Datasets) +and the connected [Metadata +Catalogue](https://modelcatalogue.cs.ox.ac.uk/hdruk_live/). This `R` +package has a function which takes a metadata file as input and +facilitates the process of browsing through each dataset and variable. 
+The user is asked to categorise each variable into a domain related to +their research question, and these categorisations get saved in a csv +file for later reference. To speed up this process, the function +automatically categorises some variables that regularly appear in health +datasets (e.g. ID, Sex, Age). + +🚧 :warning: This package is in early development, and has only been +tested on a limited number of metadata files. In theory, this package +should work for **any dataset listed on the Health Data Research Gateway +(not just SAIL)** as long as a json metadata file can be downloaded. In +practice, it has only been tested on a limited number of metadata files +for SAIL databank. + +## Getting started with metadata + +There are many existing tools that allow you to browse metadata for +health datasets. These are listed in the [RESOURCES.md](RESOURCES.md) +file in this repository. :bulb: These tools may be sufficient for you to +address the example questions listed above. ## Getting started with this `R` package `browseMetadata` -### Install +### Install Run in the R console: -```r +``` r install.packages("devtools") devtools::install_github("aim-rsf/browseMetadata") ``` -### Example run through +### Example run through + Execute `?domain_mapping` in the R console to read the documentation. -Execute `domain_mapping()` in the R console to run this function in demo mode. Follow the example in the documentation. +Execute `domain_mapping()` in the R console to run this function in demo +mode. Follow the example in the documentation. -For demo mode, you do not need to provide your own input files. It will use the package data. +For demo mode, you do not need to provide your own input files. It will +use the package data. -Remember to reference the Plots tab in R. The domains will appear in the Plot tab and give you the necessary context for the categorisations. +Remember to reference the Plots tab in R. 
The domains will appear in the +Plot tab and give you the necessary context for the categorisations. -When using your own inputs, take note that these domain categories will be added to your domain list by default: -- NO MATCH / UNSURE -- METADATA -- ALF ID -- OTHER ID -- DEMOGRAPHICS +When using your own inputs, take note that these domain categories will +be added to your domain list by default: - NO MATCH / UNSURE - +METADATA - ALF ID - OTHER ID - DEMOGRAPHICS ### The log file output -Running the function will output a log file with your decisions. An example log file output is shown below (left) with the demo domain list that was used to create it (right). The name of the log file will contain the date and time stamp, as well as Data Class and Data Asset. The log file will contain initials of the person making the catergorisations, as well as metadata about the dataset. For each Data Element (variable) in the DataClass, the log file will contain a 'Domain_code' which labels this variable as mapping onto one or more of the domains of interest. Notice that some have been auto categorised - double check them for accuracy. More than one domain is allowed to map onto each variable. +Running the function will output a log file with your decisions. An +example log file output is shown below (left) with the demo domain list +that was used to create it (right). The name of the log file will +contain the date and time stamp, as well as Data Class and Data Asset. +The log file will contain initials of the person making the +catergorisations, as well as metadata about the dataset. For each Data +Element (variable) in the DataClass, the log file will contain a +'Domain_code' which labels this variable as mapping onto one or more of +the domains of interest. Notice that some have been auto categorised - +double check them for accuracy. More than one domain is allowed to map +onto each variable. 
-logfile-ex +![](https://github-production-user-asset-6210df.s3.amazonaws.com/50215726/268979307-4e2ded4f-f425-418c-b0bc-9a9cec7c6fe7.png?X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Credential=AKIAVCODYLSA53PQK4ZA%2F20240110%2Fus-east-1%2Fs3%2Faws4_request&X-Amz-Date=20240110T150116Z&X-Amz-Expires=300&X-Amz-Signature=e3f02943c068a130dbb6a58e5e17d22afc5425c9235055e73fc9b688ea670c52&X-Amz-SignedHeaders=host&actor_id=53487593&key_id=0&repo_id=675673962) -The idea would be that this log file could be loaded up, compared across users, and used as an input in later analysis steps when working out which variables can be used to represent which domains. +The idea would be that this log file could be loaded up, compared across +users, and used as an input in later analysis steps when working out +which variables can be used to represent which domains. ## License -This project is licensed under the GNU General Public License v3.0 - see the [LICENSE](LICENSE) file for details. +This project is licensed under the GNU General Public License v3.0 - see +the [LICENSE](LICENSE) file for details. -The GNU General Public License is a free, copyleft license for software and other kinds of works. For more information, please refer to . +The GNU General Public License is a free, copyleft license for software +and other kinds of works. For more information, please refer to +. -## Contributing changes +## Contributing changes -You can contribute changes to this repository via submitting an Issue to request a change, or create a Pull Request with your direct changes. +You can contribute changes to this repository via submitting an Issue to +request a change, or create a Pull Request with your direct changes. 
If you are working on changes to the R package: To create the .rda files in the data directory of the package: `usethis::use_data(dataname)` -To view the package data: -`data(package='browseMetadata')` +To view the package data: `data(package='browseMetadata')` -To load the package data: -`data(dataname)` +To load the package data: `data(dataname)` -To build the documentation files: -`library(roxygen2)` -`roxygenise()` +To build the documentation files: `library(roxygen2)` `roxygenise()` ## Citation To cite package ‘browseMetadata’ in publications use: -> Stickland R (2024). browseMetadata: Browses available metadata, to catergorise/label each variable in a dataset. R package version 0.1.0. +> Stickland R (2024). browseMetadata: Browses available metadata, to +> catergorise/label each variable in a dataset. R package version 0.1.0. A BibTeX entry for LaTeX users is -``` +``` @Manual{, title = {browseMetadata: Browses available metadata, to catergorise/label each variable in a dataset}, author = {Rachael Stickland}, @@ -100,24 +154,48 @@ A BibTeX entry for LaTeX users is } ``` - ### Contributors ✨ -This project follows the [all-contributors](https://github.com/all-contributors/all-contributors) specification, using the ([emoji key](https://allcontributors.org/docs/en/emoji-key)). Contributions of any kind welcome! + +This project follows the +[all-contributors](https://github.com/all-contributors/all-contributors) +specification, using the ([emoji +key](https://allcontributors.org/docs/en/emoji-key)). Contributions of +any kind welcome! + + - - - - - - - - -
Rachael Stickland
Rachael Stickland

🖋 📖 🚧 🤔
Batool Almarzouq
Batool Almarzouq

📓 👀 🤔
Mahwish Mohammad
Mahwish Mohammad

📓
+ ++----------------------+----------------------+----------------------+ +| [![Rachael | [![Batool | [![Mahwish | +| Stickland](https | Almarzouq](http | Mohammad](http | +| ://avatars.githubuse | s://avatars.githubus | s://avatars.githubus | +| rcontent.com/u/50215 | ercontent.com/u/5348 | ercontent.com/u/4392 | +| 726?v=4?s=100){alt=" | 7593?v=4?s=100){alt= | 6907?v=4?s=100){alt= | +| Rachael Stickland"}\ | "Batool Almarzouq"}\ | "Mahwish Mohammad"}\ | +| ~**Ra | ~**B | ~**Mahwish\ Mohamma | +| chael\ Stickland**~] | atool\ Almarzouq**~] | d**~](https://github | +| (http://linkedin.com | (https://batool-alma | .com/Rainiefantasy)\ | +| /in/rstickland-phd)\ | rzouq.netlify.app/)\ | [📓](#u | +| [🖋](#content | [📓 | serTesting-Rainiefan | +| -RayStick "Content") | ](#userTesting-Bato | tasy "User Testing") | +| [📖](htt | olMM "User Testing") | | +| ps://github.com/aim- | [👀](https:/ | | +| rsf/browse-metadata/ | /github.com/aim-rsf/ | | +| commits?author=RaySt | browse-metadata/pull | | +| ick "Documentation") | s?q=is%3Apr+reviewed | | +| [ | -by%3ABatoolMM "Revi | | +| 🚧](#maintenance-Ray | ewed Pull Requests") | | +| Stick "Maintenance") | [🤔](#ideas | | +| [🤔](#ideas | -BatoolMM "Ideas, Pl | | +| -RayStick "Ideas, Pl | anning, & Feedback") | | +| anning, & Feedback") | | | ++----------------------+----------------------+----------------------+ + diff --git a/inst/CITATION b/inst/CITATION new file mode 100644 index 00000000..03142a44 --- /dev/null +++ b/inst/CITATION @@ -0,0 +1,9 @@ +bibentry( + bibtype = "Manual", + title = "browseMetadata: Browses available metadata, to catergorise or +label each variable in a dataset", + author = "Rachael Stickland", + year = 2023, + note = "R package version 0.1.0", + url = "https://github.com/aim-rsf/browseMetadata", +) diff --git a/man/browseMetadata-package.Rd b/man/browseMetadata-package.Rd index 21683270..a0ba61ec 100644 --- a/man/browseMetadata-package.Rd +++ b/man/browseMetadata-package.Rd @@ -7,6 
+7,13 @@ \title{browseMetadata} \description{ Browses available metadata, to catergorise/label each variable in a dataset. +} +\seealso{ +Useful links: +\itemize{ + \item \url{https://aim-rsf.github.io/browseMetadata/} +} + } \author{ \strong{Maintainer}: Rachael Stickland \email{rstickland@turing.ac.uk} (\href{https://orcid.org/0000-0003-3398-4272}{ORCID}) diff --git a/man/domain_mapping.Rd b/man/domain_mapping.Rd index 9bee1a1a..bf4ae21a 100644 --- a/man/domain_mapping.Rd +++ b/man/domain_mapping.Rd @@ -17,7 +17,7 @@ The function will return a log file with the mapping between dataset variables a \description{ This function will read in the metadata file for a chosen dataset, loop through all the variables, and ask the user to catergorise/label each variable as belonging to one or more domains.\cr \cr The domains will appear in the Plots tab and dataset information will be printed to the R console, for the user's reference in making these categorisations. \cr \cr -A log file will be saved with the catergorisations made. +A log file will be saved with the catergorisations made. To speed up this process, some auto-categorisations will be made by the function for commonly occurring variables; these auto-categorisations should be verified by the user by checking the csv log file. \cr \cr Example inputs are provided within the package data, for the user to run this function in a demo mode. @@ -29,6 +29,7 @@ Example inputs are provided within the package data, for the user to run this fu # Respond 'Demo List ' for the description of domain list. # Respond 'Y' if you want to see the descriptions printed out. # Respond '1,10' to the RANGE OF VARIABLES prompt (or process the full 93 variables if you like!) -# Reference the plot tab and categorise each variable into a single ('1') or multiple ('1,2') domain. +# Reference the plot tab and categorise each variable into a single ('1') +# or multiple ('1,2') domain. # Write a note explaining your category choice (optional). 
} diff --git a/man/figures/example-log-file.png b/man/figures/example-log-file.png new file mode 100644 index 00000000..ac15856c Binary files /dev/null and b/man/figures/example-log-file.png differ diff --git a/pkgdown/_pkgdown.yml b/pkgdown/_pkgdown.yml new file mode 100644 index 00000000..014f5988 --- /dev/null +++ b/pkgdown/_pkgdown.yml @@ -0,0 +1,4 @@ +url: https://aim-rsf.github.io/browseMetadata/ +template: + bootstrap: 5 + diff --git a/vignettes/.gitignore b/vignettes/.gitignore new file mode 100644 index 00000000..097b2416 --- /dev/null +++ b/vignettes/.gitignore @@ -0,0 +1,2 @@ +*.html +*.R diff --git a/vignettes/browseMetadata.Rmd b/vignettes/browseMetadata.Rmd new file mode 100644 index 00000000..edb79d01 --- /dev/null +++ b/vignettes/browseMetadata.Rmd @@ -0,0 +1,100 @@ +--- +title: "Metadata tools and resources" +output: rmarkdown::html_vignette +vignette: > + %\VignetteIndexEntry{Metadata tools and resources} + %\VignetteEngine{knitr::rmarkdown} + %\VignetteEncoding{UTF-8} +--- + +```{r, include = FALSE} +knitr::opts_chunk$set( + collapse = TRUE, + comment = "#>" +) +``` + +## Getting started with metadata + +There are many existing tools and resources that allow you to browse metadata for health datasets, and we list some of them here: + +### [Health Data Research Innovation Gateway](https://web.www.healthdatagateway.org/search?search=&datasetSort=latest&tab=Datasets) and the connected [Metadata Catalogue](https://modelcatalogue.cs.ox.ac.uk/hdruk_live/) + +- The metadata used as input for this `R` package `browseMetadata` + +- It is "managed by Health Data Research UK in collaboration with the UK Health Data Research Alliance" + +- It is "a search-engine or ‘portal’ to help you find health datasets that exist in the UK" + +- "The datasets that are discoverable through the Gateway are from organisations in the NHS, research institutes and charities, which are part of the UK Health Data Research Alliance" + +A related resource from HDRUK is the 
[Phenotype Library](https://phenotypes.healthdatagateway.org), "a comprehensive, open access resource providing the research community with information, tools and phenotyping algorithms for UK electronic health records." See also the [Concept Library](https://conceptlibrary.saildatabank.com) developed by the SAIL databank team and collaborating organisations. + +### [British Heart Foundation Data Science Centre (BHF DSC) Dashboard](https://bhf-dsc-hds.shinyapps.io/cvd-covid-tre-dashboard) + +- It offers "an overview and interactive summaries of the datasets currently available through CVD-COVID-UK/COVID-IMPACT within the secure Trusted Research Environments (TREs) provided by NHS England for England, the National Data Safe Haven for Scotland and the SAIL databank for Wales" + +- This dashboard allows you to explore data dictionaries, data coverage and data completeness. + +### [Office for National Statistics (ONS) Secure Research Service (SRS) Metadata Catalogue](https://ons.metadata.works/) + +- Metadata for datasets within the ONS SRS. It is possible to filter for datasets related to 'Health' by clicking this tag on the first page. + +There are more tools and resources out there. If you know of a resource that offers accessible health metadata with good breadth and/or depth of coverage, please request we add it here! + +## Getting Started Guide for `browseMetadata` R Package + +### Installation Instructions + +To install the `browseMetadata` package, follow these steps in your R console: + +1. **Install Devtools Package** + + ``` r + install.packages("devtools") + ``` + +2. **Install `browseMetadata` from GitHub** + + ``` r + devtools::install_github("aim-rsf/browseMetadata") + ``` + +### Running an Example + +To familiarize yourself with `browseMetadata`, here's a quick guide: + +1. **Accessing Documentation** + - Type `?domain_mapping` in the R console to view the function's documentation. +2. 
**Running a Demo** + - Execute `domain_mapping()` to run the function in demo mode. + - The demo mode uses package data, so no need for your own input files. + - Follow the example provided in the documentation for guidance. +3. **Viewing Output** + - Check the Plots tab in R. The domain mappings will be displayed there, providing context for the categorizations. + +### Using Your Own Inputs + +When using your own data, be aware of these default domain categories: + +- NO MATCH / UNSURE +- METADATA +- ALF ID +- OTHER ID +- DEMOGRAPHICS + +### Understanding the Log File Output + +Each run of the function generates a log file: + +- **Contents**: The log file includes your decisions, the date and time stamp, Data Class, Data Asset, and the initials of the person categorizing. + +- **Structure**: For each Data Element in the DataClass, the log file assigns a 'Domain_code', indicating its domain categorization. + +- **Auto-Categorization**: Some entries are auto-categorized. Please review these for accuracy. Multiple domains can map onto each variable. + +- **Example**: Below is an example log file output (left) alongside the demo domain list used (right). + + ![](images/example-log-file.png) + +- **Usage**: This log file can be used for comparison across users and as an input in later analysis steps to determine variable representation in various domains. diff --git a/vignettes/images/example-log-file.png b/vignettes/images/example-log-file.png new file mode 100644 index 00000000..ac15856c Binary files /dev/null and b/vignettes/images/example-log-file.png differ