diff --git a/pipeline/02-assess.R b/pipeline/02-assess.R index 4dd8a83..f17b30d 100644 --- a/pipeline/02-assess.R +++ b/pipeline/02-assess.R @@ -41,19 +41,31 @@ lgbm_final_full_recipe <- readRDS(paths$output$workflow_recipe$local) # Load the data for assessment. This is the universe of condo units # that need values. Use the trained lightgbm model to estimate a single # FMV per unit -tic() assessment_data_pred <- read_parquet(paths$input$assessment$local) %>% as_tibble() %>% - # Bake the recipe once and store the result in a variable - { baked_data <- bake(lgbm_final_full_recipe, new_data = .); + # Bake the data first and extract meta strata columns + { baked_data <- bake(lgbm_final_full_recipe, new_data = ., all_predictors()); mutate( ., - pred_card_initial_fmv = predict(lgbm_final_full_fit, new_data = baked_data)$.pred, + pred_card_initial_fmv = as.numeric(predict(lgbm_final_full_fit, new_data = baked_data)$.pred), new_strata_1 = baked_data$meta_strata_1, new_strata_2 = baked_data$meta_strata_2 ) } +mapping_1 <- assessment_data_pred %>% + select(new_strata_1, meta_strata_1) %>% + distinct() %>% + filter(!is.na(meta_strata_1)) %>% + arrange(new_strata_1) + +mapping_2 <- assessment_data_pred %>% + select(new_strata_2, meta_strata_2) %>% + distinct() %>% + filter(!is.na(meta_strata_2)) %>% + arrange(new_strata_2) + + # The imputing stage does not change any existing values. However, for the lightgbm # model, values have to be encoded to a base of 0. Because of this, the output does # not match the input. Since these are 1:1 matches, we map the new column to the old. @@ -64,7 +76,7 @@ strata_mapping_2 <- setNames(mapping_2$meta_strata_2, mapping_2$new_strata_2) assessment_data_pred <- assessment_data_pred %>% mutate( # Binary variable to identify condos which have imputed strata - meta_strata_imputed = ifelse(is.na(meta_strata_1), 1, 0), + meta_strata_is_imputed = ifelse(is.na(meta_strata_1), 1, 0), # Use mappings to replace meta_strata_1 and meta_strata_2 directly meta_strata_1 = strata_mapping_1[as.character(new_strata_1)], meta_strata_2 = strata_mapping_2[as.character(new_strata_2)] @@ -72,8 +84,6 @@ assessment_data_pred <- assessment_data_pred %>% select(-c("new_strata_1", "new_strata_2")) toc() - - #- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - # 3. Post-Modeling Adjustments ------------------------------------------------- #- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - @@ -171,8 +181,8 @@ assessment_data_merged %>% select( meta_year, meta_pin, meta_class, meta_card_num, meta_lline_num, meta_modeling_group, ends_with("_num_sale"), pred_card_initial_fmv, - all_of(params$model$predictor$all), township_code, meta_strata_1, meta_strata_2, - meta_strata_is_imputed + all_of(params$model$predictor$all), + meta_strata_is_imputed, township_code ) %>% mutate( ccao_n_years_exe_homeowner = as.integer(ccao_n_years_exe_homeowner)