From e6c8805c78a4f9e60f1f31fc8259c42b5b5cbfc8 Mon Sep 17 00:00:00 2001 From: Michael Wagner Date: Tue, 17 Dec 2024 17:20:44 +0000 Subject: [PATCH 1/4] Add main changes --- reports/performance/_outliers.qmd | 61 +++++++++++++++++-------------- 1 file changed, 34 insertions(+), 27 deletions(-) diff --git a/reports/performance/_outliers.qmd b/reports/performance/_outliers.qmd index c5fbc8f..8e1fe7d 100644 --- a/reports/performance/_outliers.qmd +++ b/reports/performance/_outliers.qmd @@ -18,27 +18,29 @@ exclusively on these price-based factors that determine outlier status. ### Count by Type ```{r _outliers_type_breakdown} -# Make new column for a few cells of the outliers section +# Subset important outlier information training_data <- training_data %>% - mutate(price_outlier_reason = case_when( - # Also grabs outlier reason Non-livable computed in 00-ingest.R - str_detect(sv_outlier_reason1, regex("price|Non-livable", ignore_case = TRUE)) ~ sv_outlier_reason1, - str_detect(sv_outlier_reason2, regex("price", ignore_case = TRUE)) ~ sv_outlier_reason2, - str_detect(sv_outlier_reason3, regex("price", ignore_case = TRUE)) ~ sv_outlier_reason3, - TRUE ~ NA_character_ - )) + mutate( + outlier_reasons_to_graph = if_else( + sv_is_outlier, + paste0(sv_outlier_reason1, " (", sv_outlier_reason2, ")"), + NA_character_ + ) + ) %>% + # Yank NA string values to make output cleaner + mutate(outlier_reasons_to_graph = str_remove_all(outlier_reasons_to_graph, " \\(NA\\)")) # Determine the axis limit y_lim_axis_outlier_breakdown <- training_data %>% filter(sv_is_outlier) %>% - count(price_outlier_reason) %>% + count(outlier_reasons_to_graph) %>% summarise(max_value = max(n)) %>% pull(max_value) training_data %>% filter(sv_is_outlier) %>% - count(price_outlier_reason) %>% - ggplot(aes(x = reorder(price_outlier_reason, -n), y = n)) + + count(outlier_reasons_to_graph) %>% + ggplot(aes(x = reorder(outlier_reasons_to_graph, -n), y = n)) + geom_bar(stat = "identity") + geom_text(aes(label = 
comma(n)), vjust = -0.5) + ylim(0, 1.03 * y_lim_axis_outlier_breakdown) + @@ -57,19 +59,19 @@ training_data %>% ```{r _outliers_type_breakdown_table} training_data %>% filter(sv_is_outlier) %>% - group_by(meta_year, price_outlier_reason) %>% + group_by(meta_year, outlier_reasons_to_graph) %>% summarise(n = n()) %>% rename(Year = meta_year) %>% - pivot_wider(id_cols = Year, names_from = price_outlier_reason, values_from = n) %>% + pivot_wider(id_cols = Year, names_from = outlier_reasons_to_graph, values_from = n) %>% kable() %>% kable_styling("striped") training_data %>% filter(sv_is_outlier) %>% - group_by(meta_year, price_outlier_reason) %>% + group_by(meta_year, outlier_reasons_to_graph) %>% summarise(n = n(), .groups = "drop") %>% rename(Year = meta_year) %>% - pivot_wider(id_cols = Year, names_from = price_outlier_reason, values_from = n) %>% + pivot_wider(id_cols = Year, names_from = outlier_reasons_to_graph, values_from = n) %>% kable() %>% kable_styling("striped") ``` @@ -186,7 +188,7 @@ training_data %>% ```{r _outliers_dist_township, fig.height=8, fig.width=7} training_data %>% - filter(meta_triad_name == run_triad) %>% + filter(meta_triad_name == "North") %>% mutate( township_name = ccao::town_convert(meta_township_code), Category = ifelse(sv_is_outlier, "Outlier", "Not Outlier"), @@ -214,7 +216,7 @@ training_data %>% ```{r _outliers_dist_class, fig.height=8, fig.width=7} training_data %>% filter( - meta_triad_name == run_triad, + meta_triad_name == "North", !meta_class %in% c("218", "219") ) %>% mutate( @@ -262,6 +264,11 @@ outliers_ratio_comparison <- training_data %>% ) %>% distinct(meta_township_name, percent, above_below, triad) +axis_limit_outlier_ratio_comparison <- outliers_ratio_comparison %>% + arrange(desc(percent)) %>% + Matrix::head(1) %>% + pull(percent) + outliers_ratio_comparison %>% ggplot(aes(x = reorder(meta_township_name, percent), y = percent)) + labs( @@ -270,7 +277,7 @@ outliers_ratio_comparison %>% geom_bar(stat = "identity", 
aes(fill = above_below)) + coord_flip() + geom_text(aes(label = round(percent, 2)), size = 3.2, hjust = -0.2) + - scale_y_continuous(limits = c(0, 1.5)) + + scale_y_continuous(limits = c(0, 1.1 * axis_limit_outlier_ratio_comparison)) + theme_minimal() + theme( axis.title.y = element_blank(), @@ -292,7 +299,7 @@ outliers_ratio_comparison %>% # This object is joined to itself using different filters, which is why this # filtering is applied here rather than below. outliers_table_township_summary <- training_data %>% - filter(meta_class != "200" & meta_triad_name == run_triad) + filter(meta_class != "200" & meta_triad_name == "North") outliers_table_township_summary <- outliers_table_township_summary %>% filter(sv_is_outlier) %>% @@ -301,7 +308,7 @@ outliers_table_township_summary <- outliers_table_township_summary %>% `Med. Sale Price` = median(meta_sale_price, na.rm = TRUE), `Max. Sale Price` = max(meta_sale_price, na.rm = TRUE), Count = n(), - .by = c(price_outlier_reason, meta_township_name) + .by = c(outlier_reasons_to_graph, meta_township_name) ) %>% left_join( outliers_table_township_summary %>% @@ -315,7 +322,7 @@ outliers_table_township_summary <- outliers_table_township_summary %>% mutate(across(contains("Sale"), dollar)) %>% relocate(meta_township_name) %>% dplyr::rename( - "Outlier Type" = price_outlier_reason, + "Outlier Type" = outlier_reasons_to_graph, "Township Name" = meta_township_name ) %>% arrange(`Township Name`, desc(Count)) @@ -337,7 +344,7 @@ outliers_table_township_summary %>% ```{r _outliers_table_class_summary} outliers_table_class_summary <- training_data %>% - filter(meta_class != "200" & meta_triad_name == run_triad) + filter(meta_class != "200" & meta_triad_name == "North") outliers_table_class_summary <- outliers_table_class_summary %>% filter(sv_is_outlier) %>% @@ -346,7 +353,7 @@ outliers_table_class_summary <- outliers_table_class_summary %>% `Med. Sale Price` = median(meta_sale_price, na.rm = TRUE), `Max. 
Sale Price` = max(meta_sale_price, na.rm = TRUE), Count = n(), - .by = c(price_outlier_reason, meta_class) + .by = c(outlier_reasons_to_graph, meta_class) ) %>% left_join( outliers_table_class_summary %>% @@ -360,7 +367,7 @@ outliers_table_class_summary <- outliers_table_class_summary %>% mutate(across(contains("Sale"), dollar)) %>% relocate(meta_class) %>% dplyr::rename( - "Outlier Type" = price_outlier_reason, + "Outlier Type" = outlier_reasons_to_graph, "Class" = meta_class ) %>% arrange(`Class`, desc(Count)) @@ -431,7 +438,7 @@ outlier_decile_breakout <- function(data, dec) { arrange(meta_sale_price) %>% mutate(decile = ntile(meta_sale_price, 10)) %>% filter(decile == dec & sv_is_outlier) %>% - group_by(price_outlier_reason) %>% + group_by(outlier_reasons_to_graph) %>% summarise(count = n()) %>% ungroup() %>% slice_max(count, n = 1) %>% @@ -441,8 +448,8 @@ outlier_decile_breakout <- function(data, dec) { arrange(meta_sale_price) %>% mutate(decile = ntile(meta_sale_price, 10)) %>% filter(decile == dec & sv_is_outlier) %>% - summarise(count = n(), .by = price_outlier_reason) %>% - ggplot(aes(x = reorder(price_outlier_reason, -count), y = count)) + + summarise(count = n(), .by = outlier_reasons_to_graph) %>% + ggplot(aes(x = reorder(outlier_reasons_to_graph, -count), y = count)) + labs( y = "Number of Sales", x = "Outlier Types" From 432629155b6dee3d89d06ec6bea660d711f2453b Mon Sep 17 00:00:00 2001 From: Michael Wagner Date: Tue, 17 Dec 2024 17:29:30 +0000 Subject: [PATCH 2/4] Remove hard coded triad refs --- reports/performance/_outliers.qmd | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/reports/performance/_outliers.qmd b/reports/performance/_outliers.qmd index 8e1fe7d..9d720e9 100644 --- a/reports/performance/_outliers.qmd +++ b/reports/performance/_outliers.qmd @@ -188,7 +188,7 @@ training_data %>% ```{r _outliers_dist_township, fig.height=8, fig.width=7} training_data %>% - filter(meta_triad_name == "North") %>% + 
filter(meta_triad_name == run_triad") %>% mutate( township_name = ccao::town_convert(meta_township_code), Category = ifelse(sv_is_outlier, "Outlier", "Not Outlier"), @@ -216,7 +216,7 @@ training_data %>% ```{r _outliers_dist_class, fig.height=8, fig.width=7} training_data %>% filter( - meta_triad_name == "North", + meta_triad_name == run_triad", !meta_class %in% c("218", "219") ) %>% mutate( @@ -299,7 +299,7 @@ outliers_ratio_comparison %>% # This object is joined to itself using different filters, which is why this # filtering is applied here rather than below. outliers_table_township_summary <- training_data %>% - filter(meta_class != "200" & meta_triad_name == "North") + filter(meta_class != "200" & meta_triad_name == run_triad") outliers_table_township_summary <- outliers_table_township_summary %>% filter(sv_is_outlier) %>% @@ -344,7 +344,7 @@ outliers_table_township_summary %>% ```{r _outliers_table_class_summary} outliers_table_class_summary <- training_data %>% - filter(meta_class != "200" & meta_triad_name == "North") + filter(meta_class != "200" & meta_triad_name == run_triad") outliers_table_class_summary <- outliers_table_class_summary %>% filter(sv_is_outlier) %>% From 9bddac9270ff7a888d2a5e9770328219e0f7c92d Mon Sep 17 00:00:00 2001 From: Michael Wagner Date: Tue, 17 Dec 2024 17:34:03 +0000 Subject: [PATCH 3/4] Remove quotes --- reports/performance/_outliers.qmd | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/reports/performance/_outliers.qmd b/reports/performance/_outliers.qmd index 9d720e9..15c8183 100644 --- a/reports/performance/_outliers.qmd +++ b/reports/performance/_outliers.qmd @@ -188,7 +188,7 @@ training_data %>% ```{r _outliers_dist_township, fig.height=8, fig.width=7} training_data %>% - filter(meta_triad_name == run_triad") %>% + filter(meta_triad_name == run_triad) %>% mutate( township_name = ccao::town_convert(meta_township_code), Category = ifelse(sv_is_outlier, "Outlier", "Not Outlier"), @@ -216,7 +216,7 @@ 
training_data %>% ```{r _outliers_dist_class, fig.height=8, fig.width=7} training_data %>% filter( - meta_triad_name == run_triad", + meta_triad_name == run_triad, !meta_class %in% c("218", "219") ) %>% mutate( @@ -299,7 +299,7 @@ outliers_ratio_comparison %>% # This object is joined to itself using different filters, which is why this # filtering is applied here rather than below. outliers_table_township_summary <- training_data %>% - filter(meta_class != "200" & meta_triad_name == run_triad") + filter(meta_class != "200" & meta_triad_name == run_triad) outliers_table_township_summary <- outliers_table_township_summary %>% filter(sv_is_outlier) %>% @@ -344,7 +344,7 @@ outliers_table_township_summary %>% ```{r _outliers_table_class_summary} outliers_table_class_summary <- training_data %>% - filter(meta_class != "200" & meta_triad_name == run_triad") + filter(meta_class != "200" & meta_triad_name == run_triad) outliers_table_class_summary <- outliers_table_class_summary %>% filter(sv_is_outlier) %>% From a385af5149b522271a6f16fcadb9ffbaf1dd342d Mon Sep 17 00:00:00 2001 From: Michael Wagner Date: Tue, 17 Dec 2024 20:16:21 +0000 Subject: [PATCH 4/4] Incorporate feedback --- reports/performance/_outliers.qmd | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) diff --git a/reports/performance/_outliers.qmd b/reports/performance/_outliers.qmd index 15c8183..2332dd4 100644 --- a/reports/performance/_outliers.qmd +++ b/reports/performance/_outliers.qmd @@ -21,14 +21,13 @@ exclusively on these price-based factors that determine outlier status. 
# Subset important outlier information training_data <- training_data %>% mutate( - outlier_reasons_to_graph = if_else( - sv_is_outlier, - paste0(sv_outlier_reason1, " (", sv_outlier_reason2, ")"), - NA_character_ + outlier_reasons_to_graph = case_when( + sv_is_outlier & !is.na(sv_outlier_reason2) ~ + paste0(sv_outlier_reason1, " (", sv_outlier_reason2, ")"), + sv_is_outlier & is.na(sv_outlier_reason2) ~ sv_outlier_reason1, + TRUE ~ NA_character_ ) - ) %>% - # Yank NA string values to make output cleaner - mutate(outlier_reasons_to_graph = str_remove_all(outlier_reasons_to_graph, " \\(NA\\)")) + ) # Determine the axis limit y_lim_axis_outlier_breakdown <- training_data %>% @@ -265,8 +264,7 @@ outliers_ratio_comparison <- training_data %>% distinct(meta_township_name, percent, above_below, triad) axis_limit_outlier_ratio_comparison <- outliers_ratio_comparison %>% - arrange(desc(percent)) %>% - Matrix::head(1) %>% + slice_max(percent, n = 1, with_ties = FALSE) %>% pull(percent) outliers_ratio_comparison %>%