Update 2025 input data #66
dvc.lock

```diff
@@ -2,6 +2,11 @@ schema: '2.0'
 stages:
   ingest:
     cmd: Rscript pipeline/00-ingest.R
+    deps:
+    - path: pipeline/00-ingest.R
+      hash: md5
+      md5: 29292ee2bef109914c423c9259aa8879
+      size: 22847
     params:
       params.yaml:
         assessment:
```
**Comment on lines +5 to +9**

Reviewer: [Question, non-blocking] I haven't seen us use this pattern before. What's the goal of making the script a dependency?

Author: It's used in both model repos now, the thinking being that the pipeline stage should be rerun if the script changes. I'm not super tied to this setup if you don't like it, and I agree it's a bit weird.

Reviewer: I think that makes sense, thanks!
```diff
@@ -26,24 +31,24 @@ stages:
     outs:
     - path: input/assessment_data.parquet
       hash: md5
-      md5: 605ee612ff45dca2edf5c508993a7f56
-      size: 69522635
+      md5: b49601e8a812659026c7358d84f5e16b
+      size: 85702121
     - path: input/char_data.parquet
       hash: md5
-      md5: ed7b8f4ed02eb491d0450920874a66c3
-      size: 131476800
+      md5: d1a30dd51db2985be57548c1498f2533
+      size: 160972976
     - path: input/condo_strata_data.parquet
       hash: md5
-      md5: 0a7462f0afccb09bdd94c58148a3ca8d
-      size: 40842
+      md5: 8fe86e0af29431ecb021f101f79789ee
+      size: 40481
     - path: input/land_nbhd_rate_data.parquet
       hash: md5
-      md5: e508daf5790982c303d6503fe1cb8e2b
-      size: 4413
+      md5: f3ec9627322bd271bf2957b7388aaa34
+      size: 3873
     - path: input/training_data.parquet
       hash: md5
-      md5: 51090aa4f5b5311b1441e62b81fd3827
-      size: 68987740
+      md5: 9b2510ac885e4fc77928661a012d8821
+      size: 79812730
   train:
     cmd: Rscript pipeline/01-train.R
     deps:
@@ -872,8 +877,8 @@ stages:
       - loc_school_elementary_district_geoid
      - loc_school_secondary_district_geoid
       - loc_school_unified_district_geoid
-      run_note: "Test run for updated 2024 model pipeline. Remove CCAO collected
-        characteristics.\n"
+      run_note: "Test run for updated 2024 model pipeline. Remove CCAO collected\
+        \ characteristics.\n"
       toggle:
         cv_enable: false
         shap_enable: false
```
pipeline/00-ingest.R
```diff
@@ -19,6 +19,9 @@ suppressPackageStartupMessages({
   library(noctua)
 })
 
+# Adds arrow support to speed up ingest process
+noctua_options(unload = TRUE)
+
 # Establish Athena connection
 AWS_ATHENA_CONN_NOCTUA <- dbConnect(
   noctua::athena(),
```

**Comment on lines +22 to +24**

Reviewer: We hadn't yet switched to using the …
```diff
@@ -139,8 +142,8 @@ col_type_dict <- ccao::vars_dict %>%
   drop_na(var_name)
 
 # Mini-function to ensure that columns are the correct type
-recode_column_type <- function(col, col_name, dict = col_type_dict) {
-  col_type <- dict %>%
+recode_column_type <- function(col, col_name, dictionary = col_type_dict) {
+  col_type <- dictionary %>%
     filter(var_name == col_name) %>%
     pull(var_type)
 
```
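The hunk above only renames the `dict` argument to `dictionary`. For readers skimming the diff, here is a self-contained sketch of the lookup pattern the helper uses, with a toy dictionary and a hypothetical coercion step (the real body in `pipeline/00-ingest.R` handles more types):

```r
library(dplyr)

# Toy stand-in for the ccao::vars_dict-derived dictionary (hypothetical rows)
col_type_dict <- tibble(
  var_name = c("char_yrblt", "ccao_is_corner_lot"),
  var_type = c("numeric", "logical")
)

recode_column_type <- function(col, col_name, dictionary = col_type_dict) {
  # Look up the target type recorded for this column name
  col_type <- dictionary %>%
    filter(var_name == col_name) %>%
    pull(var_type)

  # Hypothetical coercion step; the real helper covers more types
  switch(col_type,
    numeric = as.numeric(col),
    logical = as.logical(col),
    col
  )
}

recode_column_type(c("1950", "2001"), "char_yrblt")
#> [1] 1950 2001
```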
```diff
@@ -214,6 +217,30 @@ rescale <- function(x, min = 0, max = 1) {
 }
 
+
+# Mini function to deal with arrays
+# Some Athena columns are stored as arrays but are converted to string on
+# ingest. In such cases, we either keep the contents of the cell (if 1 unit),
+# collapse the array into a comma-separated string (if more than 1 unit),
+# or replace with NA if the array is empty
+process_array_columns <- function(data, selector) {
+  data %>%
+    mutate(
+      across(
+        !!enquo(selector),
+        ~ sapply(.x, function(cell) {
+          if (length(cell) > 1) {
+            paste(cell, collapse = ", ")
+          } else if (length(cell) == 1) {
+            as.character(cell) # Convert the single element to character
+          } else {
+            NA # Handle cases where the array is empty
+          }
+        })
+      )
+    )
+}
+
+
 
 
 
 #- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
```
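Since `process_array_columns()` is new in this PR, a quick illustration of its three behaviors on a toy tibble with a list-column (hypothetical data; assumes the helper above and dplyr are loaded):

```r
library(dplyr)

toy <- tibble(
  pin = c("A", "B", "C"),
  loc_tax_municipality_name = list(
    c("CHICAGO", "EVANSTON"), # multi-element array -> comma-separated string
    "CICERO",                 # single element -> plain character
    character(0)              # empty array -> NA
  )
)

toy %>%
  process_array_columns(starts_with("loc_tax_")) %>%
  pull(loc_tax_municipality_name)
#> [1] "CHICAGO, EVANSTON" "CICERO"            NA
```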
```diff
@@ -283,7 +310,7 @@ training_data_clean <- training_data_fil %>%
   # Recode factor variables using the definitions stored in ccao::vars_dict
   # This will remove any categories not stored in the dictionary and convert
   # them to NA (useful since there are a lot of misrecorded variables)
-  ccao::vars_recode(cols = starts_with("char_"), type = "code") %>%
+  ccao::vars_recode(cols = starts_with("char_"), code_type = "code") %>%
   # Coerce columns to the data types recorded in the dictionary. Necessary
   # because the SQL drivers will often coerce types on pull (boolean becomes
   # character)
@@ -324,10 +351,13 @@ training_data_clean <- training_data_fil %>%
   ) %>%
   # Some Athena columns are stored as arrays but are converted to string on
   # ingest. In such cases, take the first element and clean the string
+  # Apply the helper function to process array columns
+  process_array_columns(starts_with("loc_tax_")) %>%
+  mutate(
+    loc_tax_municipality_name =
+      replace_na(loc_tax_municipality_name, "UNINCORPORATED")
+  ) %>%
   mutate(
-    across(starts_with("loc_tax_"), \(x) str_replace_all(x, "\\[|\\]", "")),
-    across(starts_with("loc_tax_"), \(x) str_trim(str_split_i(x, ",", 1))),
-    across(starts_with("loc_tax_"), \(x) na_if(x, "")),
     # Miscellanous column-level cleanup
     ccao_is_corner_lot = replace_na(ccao_is_corner_lot, FALSE),
     ccao_is_active_exe_homeowner = replace_na(ccao_is_active_exe_homeowner, 0L),
@@ -377,16 +407,19 @@ training_data_clean <- training_data_fil %>%
 # used on. The cleaning steps are the same as above, with the exception of the
 # time variables
 assessment_data_clean <- assessment_data %>%
-  ccao::vars_recode(cols = starts_with("char_"), type = "code") %>%
+  ccao::vars_recode(cols = starts_with("char_"), code_type = "code") %>%
+  # Apply the helper function to process array columns
+  process_array_columns(starts_with("loc_tax_")) %>%
+  mutate(
+    loc_tax_municipality_name =
+      replace_na(loc_tax_municipality_name, "UNINCORPORATED")
+  ) %>%
   mutate(across(
     any_of(col_type_dict$var_name),
     ~ recode_column_type(.x, cur_column())
   )) %>%
   # Same Athena string cleaning and feature cleanup as the training data
   mutate(
-    across(starts_with("loc_tax_"), \(x) str_replace_all(x, "\\[|\\]", "")),
-    across(starts_with("loc_tax_"), \(x) str_trim(str_split_i(x, ",", 1))),
-    across(starts_with("loc_tax_"), \(x) na_if(x, "")),
     ccao_is_active_exe_homeowner = replace_na(ccao_is_active_exe_homeowner, 0L),
     ccao_n_years_exe_homeowner = replace_na(ccao_n_years_exe_homeowner, 0L),
     across(where(is.character), \(x) na_if(x, "")),
```
**Comment**

Author: This will be our 2024 data setup until we get 2024 sales and data sorted out. I'll `dvc push` once this is merged.