-
Notifications
You must be signed in to change notification settings - Fork 4
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Include imputed strata in assessment card/pin #55
Changes from all commits
a603001
223aa02
fb520c6
80fd36f
b692b22
6edaaf3
3e1a54b
3e4ab9b
45accc7
84593d4
0837cd2
8ae8d84
aeaa387
d0d8135
0bf170c
75d8d4e
f22d226
92710bf
cfe990b
3eed3df
21ca2ca
77d6cee
2d9628f
b08bdc7
691fa3c
1c4dfe6
ee5666f
85ff9e6
d97a9cf
c2c4d70
efd3bf4
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -30,7 +30,7 @@ land_nbhd_rate <- read_parquet( | |
|
||
|
||
#- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - | ||
# 2. Predict Values ------------------------------------------------------------ | ||
# 2. Predict Values and Recover Strata ---------------------------------------- | ||
#- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - | ||
message("Predicting off-market values with trained model") | ||
|
||
|
@@ -40,20 +40,61 @@ lgbm_final_full_recipe <- readRDS(paths$output$workflow_recipe$local) | |
|
||
# Load the data for assessment. This is the universe of condo units | ||
# that need values. Use the trained lightgbm model to estimate a single | ||
# FMV per unit | ||
# FMV per unit. Bake the data first so we can extract transformed columns | ||
assessment_data_pred <- read_parquet(paths$input$assessment$local) %>% | ||
as_tibble() %>% | ||
as_tibble() | ||
|
||
assessment_data_baked <- assessment_data_pred %>% | ||
bake(lgbm_final_full_recipe, new_data = ., all_predictors()) | ||
|
||
assessment_data_pred <- assessment_data_pred %>% | ||
mutate( | ||
pred_card_initial_fmv = predict( | ||
., | ||
pred_card_initial_fmv = as.numeric(predict( | ||
lgbm_final_full_fit, | ||
new_data = bake( | ||
lgbm_final_full_recipe, | ||
new_data = ., | ||
all_predictors() | ||
) | ||
)$.pred | ||
new_data = assessment_data_baked | ||
)$.pred), | ||
# Strata variables are converted to 0-indexed integers during baking. | ||
# We save those converted values so we can unconvert them below | ||
temp_strata_1 = assessment_data_baked$meta_strata_1, | ||
temp_strata_2 = assessment_data_baked$meta_strata_2 | ||
) | ||
|
||
# The baked data encodes categorical values as base-0 integers. | ||
# However, here we want to recover the original (unencoded) values of our | ||
# strata variables wherever they've been imputed by the baking step. To do so, | ||
# we create a mapping of the encoded to unencoded values and use them to | ||
# recover both the original strata values and those imputed by | ||
# step_impute_knn (in R/recipes.R) | ||
strata_mapping_1 <- assessment_data_pred %>% | ||
filter(!is.na(meta_strata_1)) %>% | ||
distinct(temp_strata_1, meta_strata_1) %>% | ||
pull(meta_strata_1, name = temp_strata_1) | ||
strata_mapping_2 <- assessment_data_pred %>% | ||
filter(!is.na(meta_strata_2)) %>% | ||
distinct(temp_strata_2, meta_strata_2) %>% | ||
pull(meta_strata_2, name = temp_strata_2) | ||
|
||
# Recover the imputed strata values | ||
assessment_data_pred <- assessment_data_pred %>% | ||
mutate( | ||
# Binary variable to identify condos which have imputed strata | ||
flag_strata_is_imputed = is.na(meta_strata_1) | is.na(meta_strata_2), | ||
# Use mappings to replace meta_strata_1 and meta_strata_2 directly | ||
meta_strata_1 = ifelse( | ||
is.na(meta_strata_1), | ||
unname(strata_mapping_1[as.character(temp_strata_1)]), | ||
meta_strata_1 | ||
), | ||
meta_strata_2 = ifelse( | ||
is.na(meta_strata_2), | ||
unname(strata_mapping_2[as.character(temp_strata_2)]), | ||
meta_strata_2 | ||
) | ||
) %>% | ||
# Remove unnecessary columns | ||
select(-temp_strata_1, -temp_strata_2) | ||
|
||
|
||
|
||
|
||
|
@@ -154,14 +195,15 @@ assessment_data_merged %>% | |
select( | ||
meta_year, meta_pin, meta_class, meta_card_num, meta_lline_num, | ||
meta_modeling_group, ends_with("_num_sale"), pred_card_initial_fmv, | ||
all_of(params$model$predictor$all), township_code | ||
all_of(params$model$predictor$all), | ||
flag_strata_is_imputed, township_code | ||
) %>% | ||
mutate( | ||
ccao_n_years_exe_homeowner = as.integer(ccao_n_years_exe_homeowner) | ||
) %>% | ||
ccao::vars_recode( | ||
starts_with("char_"), | ||
type = "long", | ||
cols = starts_with("char_"), | ||
code_type = "long", | ||
as_factor = FALSE | ||
) %>% | ||
write_parquet(paths$output$assessment_card$local) | ||
|
@@ -203,7 +245,7 @@ sales_data_two_most_recent <- sales_data %>% | |
meta_pin, meta_year, | ||
meta_sale_price, meta_sale_date, meta_sale_document_num, | ||
sv_outlier_reason1, sv_outlier_reason2, sv_outlier_reason3, | ||
meta_sale_num_parcels, sv_added_later | ||
meta_sale_num_parcels | ||
) %>% | ||
# Include outliers, since these data are used for desk review and | ||
# not for modeling | ||
|
@@ -225,8 +267,7 @@ sales_data_two_most_recent <- sales_data %>% | |
meta_sale_outlier_reason1, | ||
meta_sale_outlier_reason2, | ||
meta_sale_outlier_reason3, | ||
meta_sale_num_parcels, | ||
sv_added_later | ||
meta_sale_num_parcels | ||
), | ||
names_glue = "{mr}_{gsub('meta_sale_', '', .value)}" | ||
) %>% | ||
|
@@ -270,7 +311,8 @@ assessment_data_pin <- assessment_data_merged %>% | |
meta_year, meta_pin, meta_pin10, meta_triad_code, meta_township_code, | ||
meta_nbhd_code, meta_tax_code, meta_class, meta_tieback_key_pin, | ||
meta_tieback_proration_rate, meta_cdu, meta_modeling_group, | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Is there any documentation that needs to be updated with new values in pin / card output files? Strata was never in the pin output file to begin with. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Nope! This should just get automatically added to the equivalent tables in Athena once it's crawled. |
||
meta_pin_num_landlines, char_yrblt, | ||
meta_pin_num_landlines, meta_strata_1, meta_strata_2, | ||
flag_strata_is_imputed, char_yrblt, | ||
|
||
# Keep overall building square footage | ||
char_total_bldg_sf = char_building_sf, | ||
|
@@ -389,7 +431,7 @@ message("Saving final PIN-level data") | |
assessment_data_pin_final %>% | ||
ccao::vars_recode( | ||
cols = starts_with("char_"), | ||
type = "short", | ||
code_type = "short", | ||
as_factor = FALSE | ||
) %>% | ||
select(-meta_pin10) %>% | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
issue (blocking): This is sort of correct. The imputation will be deterministic between model runs if a seed is set somewhere in this file. However, it won't be deterministic if you run the same prediction twice in a single session (unless you set the seed again).
I would add
set.seed(params$input$strata$seed)
somewhere at the top of this file to ensure that prediction is always using the same seed. Then run the stage twice (run once, restart, run again) and check that the results are the same. There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I added this to the file as described, and also tried setting a global seed. In both cases, repeated runs still produced different FMVs. You can use the uploaded testing file to look into this.