diff --git a/R/helpers.R b/R/helpers.R index 3f23fd7..e8342f2 100644 --- a/R/helpers.R +++ b/R/helpers.R @@ -137,10 +137,10 @@ select_iterations <- function(tune_results, metric, type = "mean") { # to numbers var_encode <- function(data, cols = dplyr::everything(), - dict = ccao::vars_dict) { + dictionary = ccao::vars_dict) { var <- "var_code" - dict_long <- dict %>% + dict_long <- dictionary %>% dplyr::filter( .data$var_type == "char" & .data$var_data_type == "categorical" diff --git a/dvc.lock b/dvc.lock index 18e4bab..62d58f4 100644 --- a/dvc.lock +++ b/dvc.lock @@ -2,6 +2,11 @@ schema: '2.0' stages: ingest: cmd: Rscript pipeline/00-ingest.R + deps: + - path: pipeline/00-ingest.R + hash: md5 + md5: 29292ee2bef109914c423c9259aa8879 + size: 22847 params: params.yaml: assessment: @@ -26,24 +31,24 @@ stages: outs: - path: input/assessment_data.parquet hash: md5 - md5: 605ee612ff45dca2edf5c508993a7f56 - size: 69522635 + md5: b49601e8a812659026c7358d84f5e16b + size: 85702121 - path: input/char_data.parquet hash: md5 - md5: ed7b8f4ed02eb491d0450920874a66c3 - size: 131476800 + md5: d1a30dd51db2985be57548c1498f2533 + size: 160972976 - path: input/condo_strata_data.parquet hash: md5 - md5: 0a7462f0afccb09bdd94c58148a3ca8d - size: 40842 + md5: 8fe86e0af29431ecb021f101f79789ee + size: 40481 - path: input/land_nbhd_rate_data.parquet hash: md5 - md5: e508daf5790982c303d6503fe1cb8e2b - size: 4413 + md5: f3ec9627322bd271bf2957b7388aaa34 + size: 3873 - path: input/training_data.parquet hash: md5 - md5: 51090aa4f5b5311b1441e62b81fd3827 - size: 68987740 + md5: 9b2510ac885e4fc77928661a012d8821 + size: 79812730 train: cmd: Rscript pipeline/01-train.R deps: @@ -872,8 +877,8 @@ stages: - loc_school_elementary_district_geoid - loc_school_secondary_district_geoid - loc_school_unified_district_geoid - run_note: "Test run for updated 2024 model pipeline. Remove CCAO collected - characteristics.\n" + run_note: "Test run for updated 2024 model pipeline. 
Remove CCAO collected\ + \ characteristics.\n" toggle: cv_enable: false shap_enable: false diff --git a/pipeline/00-ingest.R b/pipeline/00-ingest.R index 937279d..667a0ee 100644 --- a/pipeline/00-ingest.R +++ b/pipeline/00-ingest.R @@ -19,6 +19,9 @@ suppressPackageStartupMessages({ library(noctua) }) +# Adds arrow support to speed up ingest process +noctua_options(unload = TRUE) + # Establish Athena connection AWS_ATHENA_CONN_NOCTUA <- dbConnect( noctua::athena(), @@ -139,8 +142,8 @@ col_type_dict <- ccao::vars_dict %>% drop_na(var_name) # Mini-function to ensure that columns are the correct type -recode_column_type <- function(col, col_name, dict = col_type_dict) { - col_type <- dict %>% +recode_column_type <- function(col, col_name, dictionary = col_type_dict) { + col_type <- dictionary %>% filter(var_name == col_name) %>% pull(var_type) @@ -214,6 +217,30 @@ rescale <- function(x, min = 0, max = 1) { } +# Mini function to deal with arrays +# Some Athena columns are stored as arrays but are converted to string on +# ingest. 
In such cases, we either keep the contents of the cell (if 1 unit), +# collapse the array into a comma-separated string (if more than 1 unit), +# or replace with NA if the array is empty; the result is always character +process_array_columns <- function(data, selector) { + data %>% + mutate( + across( + {{ selector }}, + ~ vapply(.x, function(cell) { + if (length(cell) > 1) { + paste(cell, collapse = ", ") + } else if (length(cell) == 1) { + as.character(cell) # Convert the single element to character + } else { + NA_character_ # Typed NA so an all-empty column stays character + } + }, FUN.VALUE = character(1), USE.NAMES = FALSE) # type-stable, unnamed + ) + ) +} + + #- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - @@ -283,7 +310,7 @@ training_data_clean <- training_data_fil %>% # Recode factor variables using the definitions stored in ccao::vars_dict # This will remove any categories not stored in the dictionary and convert # them to NA (useful since there are a lot of misrecorded variables) - ccao::vars_recode(cols = starts_with("char_"), type = "code") %>% + ccao::vars_recode(cols = starts_with("char_"), code_type = "code") %>% # Coerce columns to the data types recorded in the dictionary. Necessary # because the SQL drivers will often coerce types on pull (boolean becomes # character) @@ -324,10 +351,13 @@ training_data_clean <- training_data_fil %>% ) %>% # Some Athena columns are stored as arrays but are converted to string on # ingest. 
In such cases, take the first element and clean the string + # Apply the helper function to process array columns + process_array_columns(starts_with("loc_tax_")) %>% + mutate( + loc_tax_municipality_name = + replace_na(loc_tax_municipality_name, "UNINCORPORATED") + ) %>% mutate( - across(starts_with("loc_tax_"), \(x) str_replace_all(x, "\\[|\\]", "")), - across(starts_with("loc_tax_"), \(x) str_trim(str_split_i(x, ",", 1))), - across(starts_with("loc_tax_"), \(x) na_if(x, "")), # Miscellanous column-level cleanup ccao_is_corner_lot = replace_na(ccao_is_corner_lot, FALSE), ccao_is_active_exe_homeowner = replace_na(ccao_is_active_exe_homeowner, 0L), @@ -377,16 +407,19 @@ training_data_clean <- training_data_fil %>% # used on. The cleaning steps are the same as above, with the exception of the # time variables assessment_data_clean <- assessment_data %>% - ccao::vars_recode(cols = starts_with("char_"), type = "code") %>% + ccao::vars_recode(cols = starts_with("char_"), code_type = "code") %>% + # Apply the helper function to process array columns + process_array_columns(starts_with("loc_tax_")) %>% + mutate( + loc_tax_municipality_name = + replace_na(loc_tax_municipality_name, "UNINCORPORATED") + ) %>% mutate(across( any_of(col_type_dict$var_name), ~ recode_column_type(.x, cur_column()) )) %>% # Same Athena string cleaning and feature cleanup as the training data mutate( - across(starts_with("loc_tax_"), \(x) str_replace_all(x, "\\[|\\]", "")), - across(starts_with("loc_tax_"), \(x) str_trim(str_split_i(x, ",", 1))), - across(starts_with("loc_tax_"), \(x) na_if(x, "")), ccao_is_active_exe_homeowner = replace_na(ccao_is_active_exe_homeowner, 0L), ccao_n_years_exe_homeowner = replace_na(ccao_n_years_exe_homeowner, 0L), across(where(is.character), \(x) na_if(x, "")), diff --git a/renv/profiles/dev/renv.lock b/renv/profiles/dev/renv.lock index d96fcea..567adbb 100644 --- a/renv/profiles/dev/renv.lock +++ b/renv/profiles/dev/renv.lock @@ -41,7 +41,7 @@ "Package": "R6", 
"Version": "2.5.1", "Source": "Repository", - "Repository": "RSPM", + "Repository": "CRAN", "Requirements": [ "R" ], @@ -58,6 +58,16 @@ ], "Hash": "6b868847b365672d6c1677b1608da9ed" }, + "askpass": { + "Package": "askpass", + "Version": "1.2.1", + "Source": "Repository", + "Repository": "CRAN", + "Requirements": [ + "sys" + ], + "Hash": "c39f4155b3ceb1a9a2799d700fbd4b6a" + }, "base64enc": { "Package": "base64enc", "Version": "0.1-3", @@ -175,6 +185,27 @@ ], "Hash": "859d96e65ef198fd43e82b9628d593ef" }, + "curl": { + "Package": "curl", + "Version": "6.0.1", + "Source": "Repository", + "Repository": "CRAN", + "Requirements": [ + "R" + ], + "Hash": "e8ba62486230951fcd2b881c5be23f96" + }, + "data.table": { + "Package": "data.table", + "Version": "1.16.4", + "Source": "Repository", + "Repository": "CRAN", + "Requirements": [ + "R", + "methods" + ], + "Hash": "38bbf05fc2503143db4c734a7e5cab66" + }, "digest": { "Package": "digest", "Version": "0.6.37", @@ -290,9 +321,24 @@ ], "Hash": "81d371a9cc60640e74e4ab6ac46dcedc" }, + "httr": { + "Package": "httr", + "Version": "1.4.7", + "Source": "Repository", + "Repository": "RSPM", + "Requirements": [ + "R", + "R6", + "curl", + "jsonlite", + "mime", + "openssl" + ], + "Hash": "ac107251d9d9fd72f0ca8049988f1d7f" + }, "igraph": { "Package": "igraph", - "Version": "2.1.1", + "Version": "2.1.2", "Source": "Repository", "Repository": "CRAN", "Requirements": [ @@ -311,7 +357,7 @@ "utils", "vctrs" ], - "Hash": "c03878b48737a0e2da3b772d7b2e22da" + "Hash": "9a93b743b2461ba06ba3b5df12011145" }, "jquerylib": { "Package": "jquerylib", @@ -424,8 +470,13 @@ "noctua": { "Package": "noctua", "Version": "2.6.2", - "Source": "Repository", - "Repository": "CRAN", + "Source": "GitHub", + "RemoteType": "github", + "RemoteHost": "api.github.com", + "RemoteUsername": "DyfanJones", + "RemoteRepo": "noctua", + "RemoteRef": "master", + "RemoteSha": "23a4cfbf537407c7a1547fc13ba771ba2eb098e0", "Requirements": [ "DBI", "R", @@ -436,7 +487,17 @@ "utils", 
"uuid" ], - "Hash": "c03d73125d695e80b35b4bb3eacf0358" + "Hash": "a48e1decdd027c44ea6b97b0fe0950cb" + }, + "openssl": { + "Package": "openssl", + "Version": "2.2.2", + "Source": "Repository", + "Repository": "CRAN", + "Requirements": [ + "askpass" + ], + "Hash": "d413e0fef796c9401a4419485f709ca1" }, "openxlsx": { "Package": "openxlsx", @@ -455,6 +516,178 @@ ], "Hash": "14304e44a0f90fa2d0f905472333c561" }, + "paws": { + "Package": "paws", + "Version": "0.7.0", + "Source": "Repository", + "Repository": "CRAN", + "Requirements": [ + "paws.analytics", + "paws.application.integration", + "paws.common", + "paws.compute", + "paws.cost.management", + "paws.customer.engagement", + "paws.database", + "paws.developer.tools", + "paws.end.user.computing", + "paws.machine.learning", + "paws.management", + "paws.networking", + "paws.security.identity", + "paws.storage" + ], + "Hash": "e86280169e0edf73aeb255dc3291a78c" + }, + "paws.analytics": { + "Package": "paws.analytics", + "Version": "0.7.0", + "Source": "Repository", + "Repository": "CRAN", + "Requirements": [ + "paws.common" + ], + "Hash": "0b550ad3a3196182b3972d87f67ee52f" + }, + "paws.application.integration": { + "Package": "paws.application.integration", + "Version": "0.7.0", + "Source": "Repository", + "Repository": "CRAN", + "Requirements": [ + "paws.common" + ], + "Hash": "56caa9cc142939976c23d618de875fad" + }, + "paws.common": { + "Package": "paws.common", + "Version": "0.7.7", + "Source": "Repository", + "Repository": "CRAN", + "Requirements": [ + "Rcpp", + "base64enc", + "curl", + "digest", + "httr", + "jsonlite", + "methods", + "stats", + "utils", + "xml2" + ], + "Hash": "d184fea5f7f48426720b84f5b488380d" + }, + "paws.compute": { + "Package": "paws.compute", + "Version": "0.7.0", + "Source": "Repository", + "Repository": "CRAN", + "Requirements": [ + "paws.common" + ], + "Hash": "ff7b5df4fe6a395bc007bee9fe66e35b" + }, + "paws.cost.management": { + "Package": "paws.cost.management", + "Version": "0.7.0", + 
"Source": "Repository", + "Repository": "CRAN", + "Requirements": [ + "paws.common" + ], + "Hash": "ec6dc962be2e52a3bc80705453f36498" + }, + "paws.customer.engagement": { + "Package": "paws.customer.engagement", + "Version": "0.7.0", + "Source": "Repository", + "Repository": "CRAN", + "Requirements": [ + "paws.common" + ], + "Hash": "c54f1f22133b1c871bdf55eff0696cd4" + }, + "paws.database": { + "Package": "paws.database", + "Version": "0.7.0", + "Source": "Repository", + "Repository": "CRAN", + "Requirements": [ + "paws.common" + ], + "Hash": "60369b1a0313bb8f851fce55cf2628f7" + }, + "paws.developer.tools": { + "Package": "paws.developer.tools", + "Version": "0.7.0", + "Source": "Repository", + "Repository": "CRAN", + "Requirements": [ + "paws.common" + ], + "Hash": "1ce7155cae73833be217e21521c32e3e" + }, + "paws.end.user.computing": { + "Package": "paws.end.user.computing", + "Version": "0.7.0", + "Source": "Repository", + "Repository": "CRAN", + "Requirements": [ + "paws.common" + ], + "Hash": "186cf97ccd324024f711198947276e7a" + }, + "paws.machine.learning": { + "Package": "paws.machine.learning", + "Version": "0.7.0", + "Source": "Repository", + "Repository": "CRAN", + "Requirements": [ + "paws.common" + ], + "Hash": "c7f9d15e133abc4a5110a06f9b278bbd" + }, + "paws.management": { + "Package": "paws.management", + "Version": "0.7.0", + "Source": "Repository", + "Repository": "CRAN", + "Requirements": [ + "paws.common" + ], + "Hash": "825160604065040b1927448b4ff74ee5" + }, + "paws.networking": { + "Package": "paws.networking", + "Version": "0.7.0", + "Source": "Repository", + "Repository": "CRAN", + "Requirements": [ + "paws.common" + ], + "Hash": "9f797402420b714f197129ab3ebc0623" + }, + "paws.security.identity": { + "Package": "paws.security.identity", + "Version": "0.7.0", + "Source": "Repository", + "Repository": "CRAN", + "Requirements": [ + "paws.common" + ], + "Hash": "dabf1af10c69094cac1a431f3dc2eb1c" + }, + "paws.storage": { + "Package": "paws.storage", + 
"Version": "0.7.0", + "Source": "Repository", + "Repository": "CRAN", + "Requirements": [ + "paws.common" + ], + "Hash": "867ac96c9b97e47c23efbf97e809aed4" + }, "pillar": { "Package": "pillar", "Version": "1.9.0", @@ -496,7 +729,7 @@ "Package": "progress", "Version": "1.2.3", "Source": "Repository", - "Repository": "RSPM", + "Repository": "CRAN", "Requirements": [ "R", "R6", @@ -610,6 +843,13 @@ ], "Hash": "39e1144fd75428983dc3f63aa53dfa91" }, + "sys": { + "Package": "sys", + "Version": "3.4.3", + "Source": "Repository", + "Repository": "CRAN", + "Hash": "de342ebfebdbf40477d0758d05426646" + }, "tibble": { "Package": "tibble", "Version": "3.2.1", @@ -670,17 +910,27 @@ "Package": "utf8", "Version": "1.2.4", "Source": "Repository", - "Repository": "RSPM", + "Repository": "CRAN", "Requirements": [ "R" ], "Hash": "62b65c52671e6665f803ff02954446e9" }, + "uuid": { + "Package": "uuid", + "Version": "1.2-1", + "Source": "Repository", + "Repository": "CRAN", + "Requirements": [ + "R" + ], + "Hash": "34e965e62a41fcafb1ca60e9b142085b" + }, "vctrs": { "Package": "vctrs", "Version": "0.6.5", "Source": "Repository", - "Repository": "RSPM", + "Repository": "CRAN", "Requirements": [ "R", "cli", @@ -694,7 +944,7 @@ "Package": "vroom", "Version": "1.6.5", "Source": "Repository", - "Repository": "RSPM", + "Repository": "CRAN", "Requirements": [ "R", "bit64", @@ -741,6 +991,19 @@ ], "Hash": "8687398773806cfff9401a2feca96298" }, + "xml2": { + "Package": "xml2", + "Version": "1.3.6", + "Source": "Repository", + "Repository": "CRAN", + "Requirements": [ + "R", + "cli", + "methods", + "rlang" + ], + "Hash": "1d0336142f4cd25d8d23cd3ba7a8fb61" + }, "yaml": { "Package": "yaml", "Version": "2.3.10", diff --git a/renv/profiles/reporting/renv.lock b/renv/profiles/reporting/renv.lock index cf254a2..ce5304a 100644 --- a/renv/profiles/reporting/renv.lock +++ b/renv/profiles/reporting/renv.lock @@ -65,7 +65,7 @@ }, "Matrix": { "Package": "Matrix", - "Version": "1.7-1", + "Version": "1.6-5", 
"Source": "Repository", "Repository": "CRAN", "Requirements": [ @@ -78,7 +78,7 @@ "stats", "utils" ], - "Hash": "5122bb14d8736372411f955e1b16bc8a" + "Hash": "8c7115cd3a0e048bda2a7cd110549f7a" }, "R6": { "Package": "R6", @@ -111,6 +111,20 @@ ], "Hash": "6b868847b365672d6c1677b1608da9ed" }, + "RcppArmadillo": { + "Package": "RcppArmadillo", + "Version": "14.2.2-1", + "Source": "Repository", + "Repository": "CRAN", + "Requirements": [ + "R", + "Rcpp", + "methods", + "stats", + "utils" + ], + "Hash": "9da7c242d94a8419d045f6b3a64b9765" + }, "askpass": { "Package": "askpass", "Version": "1.2.1", @@ -930,7 +944,7 @@ }, "lattice": { "Package": "lattice", - "Version": "0.22-6", + "Version": "0.22-5", "Source": "Repository", "Repository": "CRAN", "Requirements": [ @@ -941,7 +955,7 @@ "stats", "utils" ], - "Hash": "cc5ac1ba4c238c7ca9fa6a87ca11a7e2" + "Hash": "7c5e89f04e72d6611c77451f6331a091" }, "lazyeval": { "Package": "lazyeval", @@ -1098,7 +1112,7 @@ }, "nlme": { "Package": "nlme", - "Version": "3.1-166", + "Version": "3.1-165", "Source": "Repository", "Repository": "CRAN", "Requirements": [ @@ -1108,7 +1122,7 @@ "stats", "utils" ], - "Hash": "ccbb8846be320b627e6aa2b4616a2ded" + "Hash": "2769a88be217841b1f33ed469675c3cc" }, "numDeriv": { "Package": "numDeriv", @@ -1698,7 +1712,7 @@ }, "terra": { "Package": "terra", - "Version": "1.7-83", + "Version": "1.8-5", "Source": "Repository", "Repository": "CRAN", "Requirements": [ @@ -1706,7 +1720,7 @@ "Rcpp", "methods" ], - "Hash": "fbeffe988419d292225a57cf9c284802" + "Hash": "6e3f2c53b58161327fe5787fe1a63d02" }, "tibble": { "Package": "tibble",