Update 2025 input data #66
dvc.lock

```diff
@@ -2,6 +2,11 @@ schema: '2.0'
 stages:
   ingest:
     cmd: Rscript pipeline/00-ingest.R
+    deps:
+    - path: pipeline/00-ingest.R
+      hash: md5
+      md5: 29292ee2bef109914c423c9259aa8879
+      size: 22847
     params:
       params.yaml:
         assessment:
```
**Comment on lines +5 to +9**

Reviewer: [Question, non-blocking] I haven't seen us use this pattern before. What's the goal of making the script a dependency?

Author: It's used in both model repos now, the thinking being that the pipeline stage should be rerun if the script changes. I'm not super tied to this setup if you don't like it, and I agree it's a bit weird.

Reviewer: I think that makes sense, thanks!
```diff
@@ -26,24 +31,24 @@ stages:
     outs:
     - path: input/assessment_data.parquet
       hash: md5
-      md5: 605ee612ff45dca2edf5c508993a7f56
-      size: 69522635
+      md5: b49601e8a812659026c7358d84f5e16b
+      size: 85702121
     - path: input/char_data.parquet
       hash: md5
-      md5: ed7b8f4ed02eb491d0450920874a66c3
-      size: 131476800
+      md5: d1a30dd51db2985be57548c1498f2533
+      size: 160972976
     - path: input/condo_strata_data.parquet
       hash: md5
-      md5: 0a7462f0afccb09bdd94c58148a3ca8d
-      size: 40842
+      md5: 8fe86e0af29431ecb021f101f79789ee
+      size: 40481
     - path: input/land_nbhd_rate_data.parquet
       hash: md5
-      md5: e508daf5790982c303d6503fe1cb8e2b
-      size: 4413
+      md5: f3ec9627322bd271bf2957b7388aaa34
+      size: 3873
     - path: input/training_data.parquet
       hash: md5
-      md5: 51090aa4f5b5311b1441e62b81fd3827
-      size: 68987740
+      md5: 9b2510ac885e4fc77928661a012d8821
+      size: 79812730
   train:
     cmd: Rscript pipeline/01-train.R
     deps:
@@ -872,8 +877,8 @@ stages:
       - loc_school_elementary_district_geoid
      - loc_school_secondary_district_geoid
       - loc_school_unified_district_geoid
-      run_note: "Test run for updated 2024 model pipeline. Remove CCAO collected
-        characteristics.\n"
+      run_note: "Test run for updated 2024 model pipeline. Remove CCAO collected\
+        \ characteristics.\n"
       toggle:
         cv_enable: false
         shap_enable: false
```
pipeline/00-ingest.R
```diff
@@ -19,6 +19,9 @@ suppressPackageStartupMessages({
   library(noctua)
 })
 
+# Adds arrow support to speed up ingest process
+noctua_options(unload = TRUE)
+
 # Establish Athena connection
 AWS_ATHENA_CONN_NOCTUA <- dbConnect(
   noctua::athena(),
```

**Comment on lines +22 to +24**

Reviewer: We hadn't yet switched to using the …
```diff
@@ -139,8 +142,8 @@ col_type_dict <- ccao::vars_dict %>%
   drop_na(var_name)
 
 # Mini-function to ensure that columns are the correct type
-recode_column_type <- function(col, col_name, dict = col_type_dict) {
-  col_type <- dict %>%
+recode_column_type <- function(col, col_name, dictionary = col_type_dict) {
+  col_type <- dictionary %>%
     filter(var_name == col_name) %>%
     pull(var_type)
 
```
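The hunk above only renames the `dict` argument to `dictionary`. For readers skimming the diff, here is a self-contained sketch of the lookup pattern the helper uses, with a toy dictionary and a hypothetical coercion step (the real body in `pipeline/00-ingest.R` handles more types):

```r
library(dplyr)

# Toy stand-in for the ccao::vars_dict-derived dictionary (hypothetical rows)
col_type_dict <- tibble(
  var_name = c("char_yrblt", "ccao_is_corner_lot"),
  var_type = c("numeric", "logical")
)

recode_column_type <- function(col, col_name, dictionary = col_type_dict) {
  # Look up the target type recorded for this column name
  col_type <- dictionary %>%
    filter(var_name == col_name) %>%
    pull(var_type)

  # Hypothetical coercion step; the real helper covers more types
  switch(col_type,
    numeric = as.numeric(col),
    logical = as.logical(col),
    col
  )
}

recode_column_type(c("1950", "2001"), "char_yrblt")
#> [1] 1950 2001
```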
```diff
@@ -214,6 +217,30 @@ rescale <- function(x, min = 0, max = 1) {
 }
 
+
+# Mini function to deal with arrays
+# Some Athena columns are stored as arrays but are converted to string on
+# ingest. In such cases, we either keep the contents of the cell (if 1 unit),
+# collapse the array into a comma-separated string (if more than 1 unit),
+# or replace with NA if the array is empty
+process_array_columns <- function(data, selector) {
+  data %>%
+    mutate(
+      across(
+        !!enquo(selector),
+        ~ sapply(.x, function(cell) {
+          if (length(cell) > 1) {
+            paste(cell, collapse = ", ")
+          } else if (length(cell) == 1) {
+            as.character(cell) # Convert the single element to character
+          } else {
+            NA # Handle cases where the array is empty
+          }
+        })
+      )
+    )
+}
+
+
 
 
 
 #- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
```
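Since `process_array_columns()` is new in this PR, a quick illustration of its three behaviors on a toy tibble with a list-column (hypothetical data; assumes the helper above and dplyr are loaded):

```r
library(dplyr)

toy <- tibble(
  pin = c("A", "B", "C"),
  loc_tax_municipality_name = list(
    c("CHICAGO", "EVANSTON"), # multi-element array -> comma-separated string
    "CICERO",                 # single element -> plain character
    character(0)              # empty array -> NA
  )
)

toy %>%
  process_array_columns(starts_with("loc_tax_")) %>%
  pull(loc_tax_municipality_name)
#> [1] "CHICAGO, EVANSTON" "CICERO"            NA
```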
```diff
@@ -283,7 +310,7 @@ training_data_clean <- training_data_fil %>%
   # Recode factor variables using the definitions stored in ccao::vars_dict
   # This will remove any categories not stored in the dictionary and convert
   # them to NA (useful since there are a lot of misrecorded variables)
-  ccao::vars_recode(cols = starts_with("char_"), type = "code") %>%
+  ccao::vars_recode(cols = starts_with("char_"), code_type = "code") %>%
   # Coerce columns to the data types recorded in the dictionary. Necessary
   # because the SQL drivers will often coerce types on pull (boolean becomes
   # character)
@@ -324,10 +351,13 @@ training_data_clean <- training_data_fil %>%
   ) %>%
   # Some Athena columns are stored as arrays but are converted to string on
   # ingest. In such cases, take the first element and clean the string
+  # Apply the helper function to process array columns
+  process_array_columns(starts_with("loc_tax_")) %>%
+  mutate(
+    loc_tax_municipality_name =
+      replace_na(loc_tax_municipality_name, "UNINCORPORATED")
+  ) %>%
   mutate(
-    across(starts_with("loc_tax_"), \(x) str_replace_all(x, "\\[|\\]", "")),
-    across(starts_with("loc_tax_"), \(x) str_trim(str_split_i(x, ",", 1))),
-    across(starts_with("loc_tax_"), \(x) na_if(x, "")),
     # Miscellanous column-level cleanup
     ccao_is_corner_lot = replace_na(ccao_is_corner_lot, FALSE),
     ccao_is_active_exe_homeowner = replace_na(ccao_is_active_exe_homeowner, 0L),
@@ -377,16 +407,19 @@ training_data_clean <- training_data_fil %>%
 # used on. The cleaning steps are the same as above, with the exception of the
 # time variables
 assessment_data_clean <- assessment_data %>%
-  ccao::vars_recode(cols = starts_with("char_"), type = "code") %>%
+  ccao::vars_recode(cols = starts_with("char_"), code_type = "code") %>%
+  # Apply the helper function to process array columns
+  process_array_columns(starts_with("loc_tax_")) %>%
+  mutate(
+    loc_tax_municipality_name =
+      replace_na(loc_tax_municipality_name, "UNINCORPORATED")
+  ) %>%
   mutate(across(
     any_of(col_type_dict$var_name),
     ~ recode_column_type(.x, cur_column())
   )) %>%
   # Same Athena string cleaning and feature cleanup as the training data
   mutate(
-    across(starts_with("loc_tax_"), \(x) str_replace_all(x, "\\[|\\]", "")),
-    across(starts_with("loc_tax_"), \(x) str_trim(str_split_i(x, ",", 1))),
-    across(starts_with("loc_tax_"), \(x) na_if(x, "")),
     ccao_is_active_exe_homeowner = replace_na(ccao_is_active_exe_homeowner, 0L),
     ccao_n_years_exe_homeowner = replace_na(ccao_n_years_exe_homeowner, 0L),
     across(where(is.character), \(x) na_if(x, "")),
```
**Comment**

Author: This will be our 2024 data setup until we get 2024 sales and data sorted out. I'll `dvc push` once this is merged.