Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Update training/input data #41

Merged
merged 13 commits into from
Mar 12, 2024
16 changes: 8 additions & 8 deletions dvc.lock
Original file line number Diff line number Diff line change
Expand Up @@ -26,24 +26,24 @@ stages:
outs:
- path: input/assessment_data.parquet
hash: md5
md5: 0b9e5b923038497f3a607d6e87496516
size: 69704229
md5: 605ee612ff45dca2edf5c508993a7f56
size: 69522635
- path: input/char_data.parquet
hash: md5
md5: 1d6456f84fb0b40ad2ed5483f8fc4cbd
size: 131100439
md5: ed7b8f4ed02eb491d0450920874a66c3
size: 131476800
- path: input/condo_strata_data.parquet
hash: md5
md5: 0e3a26f7482aac6b194f0b95beb6031c
size: 40733
md5: 0a7462f0afccb09bdd94c58148a3ca8d
size: 40842
- path: input/land_nbhd_rate_data.parquet
hash: md5
md5: e508daf5790982c303d6503fe1cb8e2b
size: 4413
- path: input/training_data.parquet
hash: md5
md5: 8b9014a569e3adae773330ee48e14670
size: 65839056
md5: 51090aa4f5b5311b1441e62b81fd3827
size: 68987740
train:
cmd: Rscript pipeline/01-train.R
deps:
Expand Down
Binary file modified misc/desk_review_template.xlsx
Binary file not shown.
2 changes: 1 addition & 1 deletion params.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -395,4 +395,4 @@ ratio_study:
# upload
export:
triad_code: "1"
run_id: "2024-02-16-silly-billy"
run_id: "2024-03-11-pensive-manasi"
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is the new final run ID with SHAPs.

46 changes: 45 additions & 1 deletion pipeline/00-ingest.R
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,22 @@ training_data <- dbGetQuery(
)
tictoc::toc()

# Raw sales document number data used to identify some sales accidentally
# excluded from the original training runs. See
# https://github.com/ccao-data/data-architecture/pull/334 for more info
#
# The query returns one row per distinct combination of:
#   - year:       first four characters of saledt
#   - doc_no_old: instruno exactly as stored
#   - doc_no_new: instruno with every 'D' character removed; an empty result
#                 becomes NULL via NULLIF
# Rows are limited to years >= params$input$min_sale_year. Note that the
# comparison is made on the 4-character year string, not on a number.
# The tic()/toc() pair reports how long the pull took
tictoc::tic("Sales data pulled")
sales_data <- dbGetQuery(
conn = AWS_ATHENA_CONN_NOCTUA, glue("
SELECT DISTINCT
substr(saledt, 1, 4) AS year,
instruno AS doc_no_old,
NULLIF(REPLACE(instruno, 'D', ''), '') AS doc_no_new
FROM iasworld.sales
WHERE substr(saledt, 1, 4) >= '{params$input$min_sale_year}'
")
)
tictoc::toc()

# Pull all condo PIN input data for the assessment and prior year. We will only
# use the assessment year to run the model, but the prior year can be used for
# report generation
Expand Down Expand Up @@ -234,12 +250,25 @@ training_data_ms <- training_data %>%
filter(!as.logical(as.numeric(ind_pin_is_multilline))) %>%
select(-keep_unit_sale, -total_proration_rate)

# Kludge: tag each training sale with sv_added_later. The sales lookup is
# reduced to one row per cleaned document number, joined on document number
# and year, and a sale is flagged TRUE when its raw document number ends in
# "D". Sales with no match in the lookup are flagged FALSE. The raw document
# number is dropped once the flag is built
training_data_klg <- left_join(
  training_data_ms,
  distinct(sales_data, doc_no_new, .keep_all = TRUE),
  by = c("meta_sale_document_num" = "doc_no_new", "year")
) %>%
  mutate(
    sv_added_later = replace_na(endsWith(doc_no_old, "D"), FALSE)
  ) %>%
  select(-doc_no_old)

# Multi-sale outlier detection / sales validation kludge. The main sales
# validation logic cannot yet handle multi-sale properties, but they're a
# significant minority of the total sales sample. We can borrow some
# conservative thresholds from the main sales validation output to identify
# likely non-arms-length sales. ONLY APPLIES to multi-sale properties
training_data_fil <- training_data_ms %>%
training_data_fil <- training_data_klg %>%
mutate(
sv_outlier_type = case_when(
meta_sale_price < 50000 & meta_sale_num_parcels == 2 ~
Expand All @@ -253,6 +282,21 @@ training_data_fil <- training_data_ms %>%
(meta_sale_price > 1700000 & meta_sale_num_parcels == 2),
TRUE,
sv_is_outlier
),
# Kludge sale validation flags based on raw price for sales added later
# due to https://github.com/ccao-data/data-architecture/pull/334
sv_outlier_type = case_when(
meta_sale_price < 40000 & sv_added_later ~
"Low price (raw)",
meta_sale_price > 1500000 & sv_added_later ~
"High price (raw)",
TRUE ~ sv_outlier_type
),
sv_is_outlier = ifelse(
(meta_sale_price < 40000 & sv_added_later) |
(meta_sale_price > 1500000 & sv_added_later),
TRUE,
sv_is_outlier
)
)

Expand Down
5 changes: 3 additions & 2 deletions pipeline/02-assess.R
Original file line number Diff line number Diff line change
Expand Up @@ -212,7 +212,7 @@ sales_data_two_most_recent <- sales_data %>%
distinct(
meta_pin, meta_year,
meta_sale_price, meta_sale_date, meta_sale_document_num,
sv_outlier_type, meta_sale_num_parcels
sv_outlier_type, meta_sale_num_parcels, sv_added_later
) %>%
# Include outliers, since these data are used for desk review and
# not for modeling
Expand All @@ -233,7 +233,8 @@ sales_data_two_most_recent <- sales_data %>%
meta_sale_price,
meta_sale_document_num,
meta_sale_outlier_type,
meta_sale_num_parcels
meta_sale_num_parcels,
sv_added_later
),
names_glue = "{mr}_{gsub('meta_sale_', '', .value)}"
) %>%
Expand Down
78 changes: 61 additions & 17 deletions pipeline/07-export.R
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,16 @@ assessment_pin <- dbGetQuery(
")
)

# Pull prior final model's values for comparison
#
# Returns year, meta_pin, and the prior run's pred_pin_final_fmv_round
# (renamed to model_org_fmv), limited to the triad in
# params$export$triad_code.
# NOTE(review): the prior run ID is hard-coded in the query text below rather
# than read from params, so it must be edited by hand whenever the comparison
# baseline changes. Consider moving it into params.yaml
assessment_pin_old <- dbGetQuery(
conn = AWS_ATHENA_CONN_NOCTUA, glue("
SELECT year, meta_pin, pred_pin_final_fmv_round AS model_org_fmv
FROM model.assessment_pin
WHERE run_id = '2024-02-16-silly-billy'
AND meta_triad_code = '{params$export$triad_code}'
")
)

# Pull card-level data only for all PINs. Needed for upload, since values are
# tracked by card, even though they're presented by PIN
assessment_card <- dbGetQuery(
Expand Down Expand Up @@ -79,6 +89,10 @@ message("Preparing data for Desk Review export")
# Prep data with a few additional columns + put everything in the right
# order for DR sheets
assessment_pin_prepped <- assessment_pin %>%
left_join(
assessment_pin_old,
by = c("year", "meta_pin")
) %>%
mutate(
prior_near_land_rate = round(
prior_near_land / (char_land_sf * meta_tieback_proration_rate),
Expand All @@ -101,9 +115,16 @@ assessment_pin_prepped <- assessment_pin %>%
prior_near_tot <= params$pv$nonlivable_threshold,
0
),
across(
ends_with("added_later") & where(is.logical),
~ as.numeric(.x)
),
# Empty fields to be filled out via other means
char_type_resd = NA,
valuations_note = NA, # Empty notes field for Valuations to fill out
sale_ratio = NA # Initialize as NA so we can fill out with a formula later
valuations_note = NA,
sale_ratio = NA,
model_org_fmv_nom_chg = (pred_pin_final_fmv_round - model_org_fmv),
model_org_fmv_pct_chg = model_org_fmv_nom_chg / model_org_fmv
) %>%
select(
township_code, meta_pin, meta_class, meta_nbhd_code,
Expand All @@ -127,7 +148,9 @@ assessment_pin_prepped <- assessment_pin %>%
flag_common_area, flag_proration_sum_not_1, flag_pin_is_multiland,
flag_land_gte_95_percentile,
flag_land_value_capped, flag_prior_near_to_pred_unchanged,
flag_prior_near_yoy_inc_gt_50_pct, flag_prior_near_yoy_dec_gt_5_pct
flag_prior_near_yoy_inc_gt_50_pct, flag_prior_near_yoy_dec_gt_5_pct,
sale_recent_1_sv_added_later, sale_recent_2_sv_added_later,
model_org_fmv, model_org_fmv_nom_chg, model_org_fmv_pct_chg
) %>%
mutate(
across(starts_with("flag_"), as.numeric),
Expand Down Expand Up @@ -188,7 +211,7 @@ assessment_pin10_prepped <- assessment_pin_prepped %>%
#- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -

# Write raw data to sheets for parcel details
for (town in unique(assessment_pin_prepped$township_code)) {
for (town in "75") {
message("Now processing: ", town_convert(town))

## 4.1. PIN-Level ------------------------------------------------------------
Expand Down Expand Up @@ -237,7 +260,7 @@ for (town in unique(assessment_pin_prepped$township_code)) {
num_head <- 6
pin_row_range <- (num_head + 1):(nrow(assessment_pin_filtered) + num_head)
pin_row_range_w_header <- c(num_head, pin_row_range)
pin_col_range <- 1:52
pin_col_range <- 1:57

assessment_pin_w_row_ids <- assessment_pin_filtered %>%
tibble::rowid_to_column("row_id") %>%
Expand Down Expand Up @@ -308,7 +331,7 @@ for (town in unique(assessment_pin_prepped$township_code)) {
wb, pin_sheet_name,
style = style_price,
rows = pin_row_range,
cols = c(9:11, 15:18, 23, 28, 33, 51, 52), gridExpand = TRUE
cols = c(9:11, 15:18, 23, 28, 33, 53, 54, 56, 57), gridExpand = TRUE
)
addStyle(
wb, pin_sheet_name,
Expand All @@ -323,7 +346,7 @@ for (town in unique(assessment_pin_prepped$township_code)) {
addStyle(
wb, pin_sheet_name,
style = style_pct,
rows = pin_row_range, cols = c(8, 14, 22, 24), gridExpand = TRUE
rows = pin_row_range, cols = c(8, 14, 22, 24, 55), gridExpand = TRUE
)
addStyle(
wb, pin_sheet_name,
Expand All @@ -338,13 +361,16 @@ for (town in unique(assessment_pin_prepped$township_code)) {
addFilter(wb, pin_sheet_name, 6, pin_col_range)

# Format the % change columns (24 and 55) with a range of colors from low to high
conditionalFormatting(
wb, pin_sheet_name,
cols = c(24),
rows = pin_row_range,
style = c("#F8696B", "#FFFFFF", "#00B0F0"),
rule = c(-1, 0, 1),
type = "colourScale"
walk(
c(24, 55),
~ conditionalFormatting(
wb, pin_sheet_name,
cols = .x,
rows = pin_row_range,
style = c("#F8696B", "#FFFFFF", "#00B0F0"),
rule = c(-1, 0, 1),
type = "colourScale"
)
)
# Format sales such that they are orange for adjusted multi-PIN sales
conditionalFormatting(
Expand Down Expand Up @@ -385,6 +411,24 @@ for (town in unique(assessment_pin_prepped$township_code)) {
type = "expression"
)

# Highlight sales that were later added to the model. Each target column is
# paired with the expression rule that drives it: column 27 is filled when
# $AY equals 1 and column 32 when $AZ equals 1, both over the PIN data rows
walk2(
  c(27, 32),
  c("$AY7=1", "$AZ7=1"),
  ~ conditionalFormatting(
    wb, pin_sheet_name,
    cols = .x,
    rows = pin_row_range,
    style = createStyle(bgFill = "#CF91FF"),
    rule = .y,
    type = "expression"
  )
)

# Write PIN-level data to workbook
writeData(
wb, pin_sheet_name, assessment_pin_filtered,
Expand Down Expand Up @@ -430,18 +474,18 @@ for (town in unique(assessment_pin_prepped$township_code)) {
writeFormula(
wb, pin_sheet_name,
assessment_pin_avs$total_av,
startCol = 51,
startCol = 56,
startRow = 7
)
writeFormula(
wb, pin_sheet_name,
assessment_pin_avs$av_difference,
startCol = 52,
startCol = 57,
startRow = 7
)
setColWidths(
wb, pin_sheet_name,
c(51, 52),
c(56, 57),
widths = 1,
hidden = c(TRUE, TRUE), ignoreMergedCells = FALSE
)
Expand Down
4 changes: 2 additions & 2 deletions renv.lock
Original file line number Diff line number Diff line change
Expand Up @@ -283,7 +283,7 @@
"RemoteUsername": "ccao-data",
"RemoteRepo": "ccao",
"RemoteRef": "master",
"RemoteSha": "6bdb2f2cab811c3450d91726a784804bd8bca971",
"RemoteSha": "fe992f9bb6e75c97a0c8fa742106096adbabb642",
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is just updating the CCAO package to the latest version. It's tangential to this PR.

"Remotes": "ccao-data/assessr",
"Requirements": [
"R",
Expand All @@ -293,7 +293,7 @@
"rlang",
"tidyr"
],
"Hash": "9e1f71bb457d8d8514ca553610fa0bfc"
"Hash": "4f649d828fea30e11c2b65acb3ec778a"
},
"class": {
"Package": "class",
Expand Down
Loading