Commit
Fix annoying dynamic branching / arrow open_dataset bug. For some reason, branching over ndvi_years caused problems when filtering datasets opened with arrow. No idea why. Fixed by removing dynamic branching from the ndvi_transformed target.
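As a hedged sketch of what that change looks like in a targets pipeline: the target names ndvi_transformed and ndvi_years come from the commit message, while the ndvi_dataset input and the transform_ndvi() helper are hypothetical stand-ins for whatever the real pipeline uses.

# Sketch only: transform_ndvi() and ndvi_dataset are hypothetical.
library(targets)

# Before: dynamic branching over years. Filtering an arrow dataset inside
# each branch misbehaved, for reasons the commit message leaves open.
# tar_target(ndvi_transformed,
#            transform_ndvi(ndvi_dataset, year = ndvi_years),
#            pattern = map(ndvi_years))

# After: a single static target that loops over the years itself.
tar_target(ndvi_transformed,
           purrr::map_dfr(ndvi_years, \(y) transform_ndvi(ndvi_dataset, year = y)))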
Showing 9 changed files with 1,511 additions and 964 deletions.
@@ -0,0 +1,104 @@
#' Partition files for DuckDB database
#'
#' This function partitions parquet files through a DuckDB database in R.
#' It takes the sources of the files, the path to store the partitioned files,
#' the template for file naming, and the range of years and months.
#'
#' @author Nathan C. Layman
#'
#' @param sources A named, nested list of parquet files to read from.
#' @param path The directory where the partitioned files will be stored. Default is "data/explanatory_variables".
#' @param basename_template The template used to name the partitioned files. Default is "explanatory_variables_{.y}_{.m}.gz.parquet".
#' @param years The years for which the files need to be partitioned. Default is 2007:2010.
#' @param months The months for which the files need to be partitioned. Default is 1:12.
#'
#' @return A character vector with the filepath to each partitioned file.
#'
#' @note The function opens a connection to a DuckDB database and loads the files from each
#' source into a table, then natural-joins the tables and partitions the result by the
#' selected years and months. The partitioned files are saved in parquet format with the
#' gzip codec.
#'
#' @examples
#' file_partition_duckdb(sources = list("source_path1", "source_path2"),
#'                       path = "data/explanatory_variables",
#'                       basename_template = "explanatory_variables_{.y}_{.m}.gz.parquet",
#'                       years = 2007:2010, months = 1:12)
#'
#' @export
file_partition_duckdb <- function(sources, # A named, nested list of parquet files
                                  path = "data/explanatory_variables",
                                  basename_template = "explanatory_variables_{.y}_{.m}.gz.parquet",
                                  years = 2007:2010,
                                  months = 1:12) {

  # NCL: change to branch off of model date for the combination step.
  # This approach does work; we're only writing complete datasets.
  # 2005 doesn't have any outbreak history, so what do we input?
  file_partitions <- expand.grid(.y = years, .m = months)

  files <- purrr::pmap_vec(file_partitions, function(.y, .m) {

    # Create a connection to a DuckDB database
    con <- duckdb::dbConnect(duckdb::duckdb())

    # For each explanatory variable target, create a table filtered appropriately
    purrr::walk2(names(sources), sources, function(table_name, list_of_files) {

      # Prepare one SELECT per file
      parquet_list <- glue::glue("SELECT * FROM '{list_of_files}'")

      # Compare every file's schema against the first one
      file_schemas <- purrr::map(list_of_files, ~arrow::open_dataset(.x)$schema)
      unified_schema <- all(purrr::map_vec(file_schemas, ~.x == file_schemas[[1]]))

      # Filter on year/month where those columns exist, to reduce the memory
      # footprint as much as possible
      parquet_filter <- c()
      if (!is.null(file_schemas[[1]]$year)) parquet_filter <- c(parquet_filter, paste("year ==", .y))
      if (!is.null(file_schemas[[1]]$month)) parquet_filter <- c(parquet_filter, paste("month ==", .m))
      if (length(parquet_filter)) {
        parquet_filter <- paste("WHERE", paste(parquet_filter, collapse = " AND "))
      } else {
        parquet_filter <- ""
      }

      parquet_list <- glue::glue("{parquet_list} {parquet_filter}")

      # Check if all schemas are identical
      if (unified_schema) {

        # If all schemas are identical: union all files
        parquet_list <- paste(parquet_list, collapse = " UNION ALL ")

      } else {

        # If not: natural join all files
        parquet_list <- glue::glue("({parquet_list})")
        parquet_list <- glue::glue("{parquet_list} AS {tools::file_path_sans_ext(basename(list_of_files))}")
        parquet_list <- paste0("SELECT * FROM ", paste(parquet_list, collapse = " NATURAL JOIN "))
      }

      # Set up the query that adds the table to the database
      query <- glue::glue("CREATE OR REPLACE TABLE {table_name} AS {parquet_list}")

      # Execute the query
      add_table_result <- DBI::dbExecute(con, query)
      message(glue::glue("{table_name} table created with {add_table_result} rows"))
    })

    # Establish a file name for the combination of month and year
    filename <- file.path(path, glue::glue(basename_template))

    # Set up a natural inner join across all the tables and write the result to file
    query <- glue::glue("COPY (SELECT * FROM {paste(names(sources), collapse = ' NATURAL JOIN ')}) TO '{filename}' (FORMAT 'parquet', CODEC 'gzip', ROW_GROUP_SIZE 100_000)")

    # Execute the join
    result <- DBI::dbExecute(con, query)
    message(result)

    # Clean up the database connection
    duckdb::dbDisconnect(con)

    # Return the filename for this partition
    filename
  })

  files
}
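A usage sketch, assuming hypothetical source directories whose parquet files carry year and month columns where applicable:

# Hypothetical paths; each element of `sources` is a vector of parquet files.
sources <- list(
  ndvi = list.files("data/ndvi", pattern = "\\.parquet$", full.names = TRUE),
  weather = list.files("data/weather", pattern = "\\.parquet$", full.names = TRUE)
)

partition_files <- file_partition_duckdb(
  sources = sources,
  years = 2007:2010,
  months = 1:12
)

# The per-month partitions can be read back lazily as one arrow dataset.
ds <- arrow::open_dataset(partition_files)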
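For intuition, the DuckDB SQL the function assembles for a single (year, month) pair looks roughly like the following; the table and file names here are hypothetical, and the exact text depends on each source's schema.

-- Roughly the generated statements for .y = 2007, .m = 1 (hypothetical names).
CREATE OR REPLACE TABLE ndvi AS
  SELECT * FROM 'data/ndvi/ndvi_a.parquet' WHERE year == 2007 AND month == 1
  UNION ALL
  SELECT * FROM 'data/ndvi/ndvi_b.parquet' WHERE year == 2007 AND month == 1;

COPY (SELECT * FROM ndvi NATURAL JOIN weather)
TO 'data/explanatory_variables/explanatory_variables_2007_1.gz.parquet'
(FORMAT 'parquet', CODEC 'gzip', ROW_GROUP_SIZE 100_000);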