From 83f321256b3770863354154eff002d3fcc8b3c1d Mon Sep 17 00:00:00 2001 From: Sweaty Handshake Date: Tue, 28 Jan 2025 19:59:19 +0000 Subject: [PATCH 01/17] First commit --- params.yaml | 18 ++++ pipeline/00-ingest.R | 248 +++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 266 insertions(+) diff --git a/params.yaml b/params.yaml index 4a62468..3937dda 100644 --- a/params.yaml +++ b/params.yaml @@ -226,6 +226,24 @@ model: - "shp_parcel_num_vertices" - "meta_strata_1" - "meta_strata_2" + - "time_sale_roll_mean_nbhd_sf_t0_w1" + - "time_sale_roll_mean_nbhd_sf_t0_w2" + - "time_sale_roll_mean_nbhd_sf_t0_w3" + - "time_sale_roll_mean_nbhd_sf_t1_w1" + - "time_sale_roll_mean_nbhd_sf_t1_w2" + - "time_sale_roll_mean_nbhd_sf_t1_w3" + - "time_sale_roll_mean_nbhd_sf_t2_w1" + - "time_sale_roll_mean_nbhd_sf_t2_w2" + - "time_sale_roll_mean_nbhd_sf_t2_w3" + - "time_sale_roll_mean_nbhd_condo_t0_w1" + - "time_sale_roll_mean_nbhd_condo_t0_w2" + - "time_sale_roll_mean_nbhd_condo_t0_w3" + - "time_sale_roll_mean_nbhd_condo_t1_w1" + - "time_sale_roll_mean_nbhd_condo_t1_w2" + - "time_sale_roll_mean_nbhd_condo_t1_w3" + - "time_sale_roll_mean_nbhd_condo_t2_w1" + - "time_sale_roll_mean_nbhd_condo_t2_w2" + - "time_sale_roll_mean_nbhd_condo_t2_w3" # List of predictors included in predictor.all which are categoricals. It is # CRITICAL that any categorical variables are included in this list, else diff --git a/pipeline/00-ingest.R b/pipeline/00-ingest.R index da80c41..5f3c17e 100644 --- a/pipeline/00-ingest.R +++ b/pipeline/00-ingest.R @@ -123,6 +123,44 @@ land_nbhd_rate_data <- dbGetQuery( ) tictoc::toc() +# Pull the single family sales data. It will only be used to construct rolling +# price averages by neighborhood. +tictoc::tic("Single-family sales data pulled") +sf_sales_data <- dbGetQuery( + conn = AWS_ATHENA_CONN_NOCTUA, glue(" + SELECT + sale.doc_no AS meta_sale_document_num, + sale.sale_price AS meta_sale_price, + sale.sale_date AS meta_sale_date, + sale.sv_is_outlier, + res.meta_township_code, + res.meta_nbhd_code + FROM model.vw_card_res_input res + INNER JOIN default.vw_pin_sale sale + ON sale.pin = res.meta_pin + AND sale.year = res.year + WHERE res.year + BETWEEN '{params$input$min_sale_year}' + AND '{params$input$max_sale_year}' + --AND CAST({params$input$max_sale_year} AS int) + AND sale.deed_type IN ('01', '02', '05') + AND NOT sale.is_multisale + AND NOT sale.sale_filter_same_sale_within_365 + AND NOT sale.sale_filter_less_than_10k + AND NOT sale.sale_filter_deed_type + ") +) %>% + # Only exclude explicit outliers from training. Sales with missing validation + # outcomes will be considered non-outliers + mutate( + sv_is_outlier = replace_na(sv_is_outlier, FALSE), + ind_pin_is_multicard = FALSE + ) %>% + # Keep multicard sales since we are only using them to construct sale price + # trends, but we still need the sales sample to be unique by document number + distinct(meta_sale_document_num, .keep_all = TRUE) +tictoc::toc() + # Close connection to Athena dbDisconnect(AWS_ATHENA_CONN_NOCTUA) rm(AWS_ATHENA_CONN_NOCTUA) @@ -401,6 +439,204 @@ training_data_clean <- training_data_fil %>% as_tibble() +# Stack single-family and condo sales data to construct rolling means for both +all_sales_data <- sf_sales_data %>% + mutate(regression_group = "sf") %>% + bind_rows( + training_data_clean %>% + select(meta_sale_document_num, meta_sale_price, meta_sale_date, sv_is_outlier, meta_township_code, meta_nbhd_code) %>% + mutate( + ind_pin_is_multicard = FALSE, + regression_group = "condo" + ) + ) %>% + mutate( + meta_sale_price_sf = ifelse(regression_group == 'sf', meta_sale_price, NA), + meta_sale_price_condo = ifelse(regression_group == 'condo', meta_sale_price, NA), + ) %>% + arrange(meta_sale_date) + +all_sales_data_dt <- all_sales_data[ + !sv_is_outlier & !ind_pin_is_multicard, + `:=`( + lag_nbhd_sf_t0_price = data.table::shift(meta_sale_price_sf, 1, type = "lag"), + lag_nbhd_sf_t1_shift = (1:.N - findInterval(meta_sale_date %m-% months(3), meta_sale_date)) * 2 - 1, + lag_nbhd_sf_t2_shift = (1:.N - findInterval(meta_sale_date %m-% months(12), meta_sale_date)) * 2 - 1, + lag_nbhd_condo_t0_price = data.table::shift(meta_sale_price_condo, 1, type = "lag"), + lag_nbhd_condo_t1_shift = (1:.N - findInterval(meta_sale_date %m-% months(3), meta_sale_date)) * 2 - 1, + lag_nbhd_condo_t2_shift = (1:.N - findInterval(meta_sale_date %m-% months(12), meta_sale_date)) * 2 - 1 + ), + by = .(meta_nbhd_code) +][ + !sv_is_outlier & !ind_pin_is_multicard, + `:=`( + lag_nbhd_sf_t1_price = meta_sale_price_sf[replace(seq(.N) - lag_nbhd_sf_t1_shift, seq(.N) <= lag_nbhd_sf_t1_shift, NA)], + lag_nbhd_sf_t2_price = meta_sale_price_sf[replace(seq(.N) - lag_nbhd_sf_t2_shift, seq(.N) <= lag_nbhd_sf_t2_shift, NA)], + lag_nbhd_condo_t1_price = meta_sale_price_condo[replace(seq(.N) - lag_nbhd_condo_t1_shift, seq(.N) <= lag_nbhd_condo_t1_shift, NA)], + lag_nbhd_condo_t2_price = meta_sale_price_condo[replace(seq(.N) - lag_nbhd_condo_t2_shift, seq(.N) <= lag_nbhd_condo_t2_shift, NA)] + ), + by = .(meta_nbhd_code) +][ + !sv_is_outlier & !ind_pin_is_multicard, + `:=`( + time_sale_roll_mean_nbhd_sf_t0_w1 = data.table::frollmean( + lag_nbhd_sf_t0_price, + 1:.N - findInterval(meta_sale_date %m-% months(3), meta_sale_date), + align = "right", + adaptive = TRUE, + na.rm = TRUE, + hasNA = TRUE + ), + time_sale_roll_mean_nbhd_sf_t0_w2 = data.table::frollmean( + lag_nbhd_sf_t0_price, + 1:.N - findInterval(meta_sale_date %m-% months(6), meta_sale_date), + align = "right", + adaptive = TRUE, + na.rm = TRUE, + hasNA = TRUE + ), + time_sale_roll_mean_nbhd_sf_t0_w3 = data.table::frollmean( + lag_nbhd_sf_t0_price, + 1:.N - findInterval(meta_sale_date %m-% months(12), meta_sale_date), + align = "right", + adaptive = TRUE, + na.rm = TRUE, + hasNA = TRUE + ), + time_sale_roll_mean_nbhd_sf_t1_w1 = data.table::frollmean( + lag_nbhd_sf_t1_price, + 1:.N - findInterval(meta_sale_date %m-% months(3), meta_sale_date), + align = "right", + adaptive = TRUE, + na.rm = TRUE, + hasNA = TRUE + ), + time_sale_roll_mean_nbhd_sf_t1_w2 = data.table::frollmean( + lag_nbhd_sf_t1_price, + 1:.N - findInterval(meta_sale_date %m-% months(6), meta_sale_date), + align = "right", + adaptive = TRUE, + na.rm = TRUE, + hasNA = TRUE + ), + time_sale_roll_mean_nbhd_sf_t1_w3 = data.table::frollmean( + lag_nbhd_sf_t1_price, + 1:.N - findInterval(meta_sale_date %m-% months(12), meta_sale_date), + align = "right", + adaptive = TRUE, + na.rm = TRUE, + hasNA = TRUE + ), + time_sale_roll_mean_nbhd_sf_t2_w1 = data.table::frollmean( + lag_nbhd_sf_t2_price, + 1:.N - findInterval(meta_sale_date %m-% months(3), meta_sale_date), + align = "right", + adaptive = TRUE, + na.rm = TRUE, + hasNA = TRUE + ), + time_sale_roll_mean_nbhd_sf_t2_w2 = data.table::frollmean( + lag_nbhd_sf_t2_price, + 1:.N - findInterval(meta_sale_date %m-% months(6), meta_sale_date), + align = "right", + adaptive = TRUE, + na.rm = TRUE, + hasNA = TRUE + ), + time_sale_roll_mean_nbhd_sf_t2_w3 = data.table::frollmean( + lag_nbhd_sf_t2_price, + 1:.N - findInterval(meta_sale_date %m-% months(12), meta_sale_date), + align = "right", + adaptive = TRUE, + na.rm = TRUE, + hasNA = TRUE + ), + time_sale_roll_mean_nbhd_condo_t0_w1 = data.table::frollmean( + lag_nbhd_condo_t0_price, + 1:.N - findInterval(meta_sale_date %m-% months(3), meta_sale_date), + align = "right", + adaptive = TRUE, + na.rm = TRUE, + hasNA = TRUE + ), + time_sale_roll_mean_nbhd_condo_t0_w2 = data.table::frollmean( + lag_nbhd_condo_t0_price, + 1:.N - findInterval(meta_sale_date %m-% months(6), meta_sale_date), + align = "right", + adaptive = TRUE, + na.rm = TRUE, + hasNA = TRUE + ), + time_sale_roll_mean_nbhd_condo_t0_w3 = data.table::frollmean( + lag_nbhd_condo_t0_price, + 1:.N - findInterval(meta_sale_date %m-% months(12), meta_sale_date), + align = "right", + adaptive = TRUE, + na.rm = TRUE, + hasNA = TRUE + ), + time_sale_roll_mean_nbhd_condo_t1_w1 = data.table::frollmean( + lag_nbhd_condo_t1_price, + 1:.N - findInterval(meta_sale_date %m-% months(3), meta_sale_date), + align = "right", + adaptive = TRUE, + na.rm = TRUE, + hasNA = TRUE + ), + time_sale_roll_mean_nbhd_condo_t1_w2 = data.table::frollmean( + lag_nbhd_condo_t1_price, + 1:.N - findInterval(meta_sale_date %m-% months(6), meta_sale_date), + align = "right", + adaptive = TRUE, + na.rm = TRUE, + hasNA = TRUE + ), + time_sale_roll_mean_nbhd_condo_t1_w3 = data.table::frollmean( + lag_nbhd_condo_t1_price, + 1:.N - findInterval(meta_sale_date %m-% months(12), meta_sale_date), + align = "right", + adaptive = TRUE, + na.rm = TRUE, + hasNA = TRUE + ), + time_sale_roll_mean_nbhd_condo_t2_w1 = data.table::frollmean( + lag_nbhd_condo_t2_price, + 1:.N - findInterval(meta_sale_date %m-% months(3), meta_sale_date), + align = "right", + adaptive = TRUE, + na.rm = TRUE, + hasNA = TRUE + ), + time_sale_roll_mean_nbhd_condo_t2_w2 = data.table::frollmean( + lag_nbhd_condo_t2_price, + 1:.N - findInterval(meta_sale_date %m-% months(6), meta_sale_date), + align = "right", + adaptive = TRUE, + na.rm = TRUE, + hasNA = TRUE + ), + time_sale_roll_mean_nbhd_condo_t2_w3 = data.table::frollmean( + lag_nbhd_condo_t2_price, + 1:.N - findInterval(meta_sale_date %m-% months(12), meta_sale_date), + align = "right", + adaptive = TRUE, + na.rm = TRUE, + hasNA = TRUE + ) + ), +] + +all_sales_data_dt <- all_sales_data_dt %>% + mutate(across(.cols = everything(), ~ifelse(is.nan(.x), NA, .x))) + +# Join rolling sales means for condo and single-family sales onto training data +training_data_clean <- training_data_clean %>% + left_join( + all_sales_data_dt %>% + select(meta_sale_document_num, starts_with("time")), + by = "meta_sale_document_num" + ) + ## 4.2. Assessment Data -------------------------------------------------------- # Clean the assessment data. This is the target data that the trained model is @@ -455,6 +691,18 @@ assessment_data_clean <- assessment_data %>% as_tibble() +assessment_data_clean <- assessment_data_clean %>% + left_join( + all_sales_data_dt %>% + group_by(meta_nbhd_code) %>% + arrange(desc(meta_sale_date)) %>% + fill(starts_with("time"), .direction = "up") %>% + slice_head(n = 1) %>% + ungroup() %>% + select(meta_nbhd_code, starts_with("time")), + by = "meta_nbhd_code" + ) + ## 4.3. Land Rates ------------------------------------------------------------- message("Saving land rates") From ef9f870a0969bc2bc2f76e8acdd3fbeccb356170 Mon Sep 17 00:00:00 2001 From: Sweaty Handshake Date: Tue, 28 Jan 2025 20:07:52 +0000 Subject: [PATCH 02/17] Linting --- pipeline/00-ingest.R | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pipeline/00-ingest.R b/pipeline/00-ingest.R index 5f3c17e..7e9cff3 100644 --- a/pipeline/00-ingest.R +++ b/pipeline/00-ingest.R @@ -451,8 +451,8 @@ all_sales_data <- sf_sales_data %>% ) ) %>% mutate( - meta_sale_price_sf = ifelse(regression_group == 'sf', meta_sale_price, NA), - meta_sale_price_condo = ifelse(regression_group == 'condo', meta_sale_price, NA), + meta_sale_price_sf = ifelse(regression_group == "sf", meta_sale_price, NA), + meta_sale_price_condo = ifelse(regression_group == "condo", meta_sale_price, NA), ) %>% arrange(meta_sale_date) @@ -627,7 +627,7 @@ all_sales_data_dt <- all_sales_data[ ] all_sales_data_dt <- all_sales_data_dt %>% - mutate(across(.cols = everything(), ~ifelse(is.nan(.x), NA, .x))) + mutate(across(.cols = everything(), ~ ifelse(is.nan(.x), NA, .x))) # Join rolling sales means for condo and single-family sales onto training data training_data_clean <- training_data_clean %>% From 54c9dcb1209f4c315620414812fb2d77deaa2adb Mon Sep 17 00:00:00 2001 From: Sweaty Handshake Date: Tue, 28 Jan 2025 20:27:24 +0000 Subject: [PATCH 03/17] Linting --- pipeline/00-ingest.R | 74 +++++++++++++++++++++++++++----------------- 1 file changed, 46 insertions(+), 28 deletions(-) diff --git a/pipeline/00-ingest.R b/pipeline/00-ingest.R index 7e9cff3..ccef58e 100644 --- a/pipeline/00-ingest.R +++ b/pipeline/00-ingest.R @@ -444,7 +444,10 @@ all_sales_data <- sf_sales_data %>% mutate(regression_group = "sf") %>% bind_rows( training_data_clean %>% - select(meta_sale_document_num, meta_sale_price, meta_sale_date, sv_is_outlier, meta_township_code, meta_nbhd_code) %>% + select( + meta_sale_document_num, meta_sale_price, meta_sale_date, sv_is_outlier, + meta_township_code, meta_nbhd_code + ) %>% mutate( ind_pin_is_multicard = FALSE, regression_group = "condo" @@ -460,20 +463,35 @@ all_sales_data_dt <- all_sales_data[ !sv_is_outlier & !ind_pin_is_multicard, `:=`( lag_nbhd_sf_t0_price = data.table::shift(meta_sale_price_sf, 1, type = "lag"), - lag_nbhd_sf_t1_shift = (1:.N - findInterval(meta_sale_date %m-% months(3), meta_sale_date)) * 2 - 1, - lag_nbhd_sf_t2_shift = (1:.N - findInterval(meta_sale_date %m-% months(12), meta_sale_date)) * 2 - 1, - lag_nbhd_condo_t0_price = data.table::shift(meta_sale_price_condo, 1, type = "lag"), - lag_nbhd_condo_t1_shift = (1:.N - findInterval(meta_sale_date %m-% months(3), meta_sale_date)) * 2 - 1, - lag_nbhd_condo_t2_shift = (1:.N - findInterval(meta_sale_date %m-% months(12), meta_sale_date)) * 2 - 1 + lag_nbhd_sf_t1_shift = (seq_len(.N) - findInterval(meta_sale_date %m-% months(3), + meta_sale_date)) * 2 - 1, + lag_nbhd_sf_t2_shift = (seq_len(.N) - findInterval(meta_sale_date %m-% months(12), + meta_sale_date)) * 2 - 1, + lag_nbhd_condo_t0_price = data.table::shift(meta_sale_price_condo, 1, + type = "lag"), + lag_nbhd_condo_t1_shift = (seq_len(.N) - findInterval( + meta_sale_date %m-% months(3), meta_sale_date + )) * 2 - 1, + lag_nbhd_condo_t2_shift = (seq_len(.N) - findInterval( + meta_sale_date %m-% months(12), meta_sale_date + )) * 2 - 1 ), by = .(meta_nbhd_code) ][ !sv_is_outlier & !ind_pin_is_multicard, `:=`( - lag_nbhd_sf_t1_price = meta_sale_price_sf[replace(seq(.N) - lag_nbhd_sf_t1_shift, seq(.N) <= lag_nbhd_sf_t1_shift, NA)], - lag_nbhd_sf_t2_price = meta_sale_price_sf[replace(seq(.N) - lag_nbhd_sf_t2_shift, seq(.N) <= lag_nbhd_sf_t2_shift, NA)], - lag_nbhd_condo_t1_price = meta_sale_price_condo[replace(seq(.N) - lag_nbhd_condo_t1_shift, seq(.N) <= lag_nbhd_condo_t1_shift, NA)], - lag_nbhd_condo_t2_price = meta_sale_price_condo[replace(seq(.N) - lag_nbhd_condo_t2_shift, seq(.N) <= lag_nbhd_condo_t2_shift, NA)] + lag_nbhd_sf_t1_price = meta_sale_price_sf[replace( + seq(.N) - lag_nbhd_sf_t1_shift, seq(.N) <= lag_nbhd_sf_t1_shift, NA + )], + lag_nbhd_sf_t2_price = meta_sale_price_sf[replace( + seq(.N) - lag_nbhd_sf_t2_shift, seq(.N) <= lag_nbhd_sf_t2_shift, NA + )], + lag_nbhd_condo_t1_price = meta_sale_price_condo[replace( + seq(.N) - lag_nbhd_condo_t1_shift, seq(.N) <= lag_nbhd_condo_t1_shift, NA + )], + lag_nbhd_condo_t2_price = meta_sale_price_condo[replace( + seq(.N) - lag_nbhd_condo_t2_shift, seq(.N) <= lag_nbhd_condo_t2_shift, NA + )] ), by = .(meta_nbhd_code) ][ @@ -481,7 +499,7 @@ all_sales_data_dt <- all_sales_data[ `:=`( time_sale_roll_mean_nbhd_sf_t0_w1 = data.table::frollmean( lag_nbhd_sf_t0_price, - 1:.N - findInterval(meta_sale_date %m-% months(3), meta_sale_date), + seq_len(.N) - findInterval(meta_sale_date %m-% months(3), meta_sale_date), align = "right", adaptive = TRUE, na.rm = TRUE, @@ -489,7 +507,7 @@ all_sales_data_dt <- all_sales_data[ ), time_sale_roll_mean_nbhd_sf_t0_w2 = data.table::frollmean( lag_nbhd_sf_t0_price, - 1:.N - findInterval(meta_sale_date %m-% months(6), meta_sale_date), + seq_len(.N) - findInterval(meta_sale_date %m-% months(6), meta_sale_date), align = "right", adaptive = TRUE, na.rm = TRUE, @@ -497,7 +515,7 @@ all_sales_data_dt <- all_sales_data[ ), time_sale_roll_mean_nbhd_sf_t0_w3 = data.table::frollmean( lag_nbhd_sf_t0_price, - 1:.N - findInterval(meta_sale_date %m-% months(12), meta_sale_date), + seq_len(.N) - findInterval(meta_sale_date %m-% months(12), meta_sale_date), align = "right", adaptive = TRUE, na.rm = TRUE, @@ -505,7 +523,7 @@ all_sales_data_dt <- all_sales_data[ ), time_sale_roll_mean_nbhd_sf_t1_w1 = data.table::frollmean( lag_nbhd_sf_t1_price, - 1:.N - findInterval(meta_sale_date %m-% months(3), meta_sale_date), + seq_len(.N) - findInterval(meta_sale_date %m-% months(3), meta_sale_date), align = "right", adaptive = TRUE, na.rm = TRUE, @@ -513,7 +531,7 @@ all_sales_data_dt <- all_sales_data[ ), time_sale_roll_mean_nbhd_sf_t1_w2 = data.table::frollmean( lag_nbhd_sf_t1_price, - 1:.N - findInterval(meta_sale_date %m-% months(6), meta_sale_date), + seq_len(.N) - findInterval(meta_sale_date %m-% months(6), meta_sale_date), align = "right", adaptive = TRUE, na.rm = TRUE, @@ -521,7 +539,7 @@ all_sales_data_dt <- all_sales_data[ ), time_sale_roll_mean_nbhd_sf_t1_w3 = data.table::frollmean( lag_nbhd_sf_t1_price, - 1:.N - findInterval(meta_sale_date %m-% months(12), meta_sale_date), + seq_len(.N) - findInterval(meta_sale_date %m-% months(12), meta_sale_date), align = "right", adaptive = TRUE, na.rm = TRUE, @@ -529,7 +547,7 @@ all_sales_data_dt <- all_sales_data[ ), time_sale_roll_mean_nbhd_sf_t2_w1 = data.table::frollmean( lag_nbhd_sf_t2_price, - 1:.N - findInterval(meta_sale_date %m-% months(3), meta_sale_date), + seq_len(.N) - findInterval(meta_sale_date %m-% months(3), meta_sale_date), align = "right", adaptive = TRUE, na.rm = TRUE, @@ -537,7 +555,7 @@ all_sales_data_dt <- all_sales_data[ ), time_sale_roll_mean_nbhd_sf_t2_w2 = data.table::frollmean( lag_nbhd_sf_t2_price, - 1:.N - findInterval(meta_sale_date %m-% months(6), meta_sale_date), + seq_len(.N) - findInterval(meta_sale_date %m-% months(6), meta_sale_date), align = "right", adaptive = TRUE, na.rm = TRUE, @@ -545,7 +563,7 @@ all_sales_data_dt <- all_sales_data[ ), time_sale_roll_mean_nbhd_sf_t2_w3 = data.table::frollmean( lag_nbhd_sf_t2_price, - 1:.N - findInterval(meta_sale_date %m-% months(12), meta_sale_date), + seq_len(.N) - findInterval(meta_sale_date %m-% months(12), meta_sale_date), align = "right", adaptive = TRUE, na.rm = TRUE, @@ -553,7 +571,7 @@ all_sales_data_dt <- all_sales_data[ ), time_sale_roll_mean_nbhd_condo_t0_w1 = data.table::frollmean( lag_nbhd_condo_t0_price, - 1:.N - findInterval(meta_sale_date %m-% months(3), meta_sale_date), + seq_len(.N) - findInterval(meta_sale_date %m-% months(3), meta_sale_date), align = "right", adaptive = TRUE, na.rm = TRUE, @@ -561,7 +579,7 @@ all_sales_data_dt <- all_sales_data[ ), time_sale_roll_mean_nbhd_condo_t0_w2 = data.table::frollmean( lag_nbhd_condo_t0_price, - 1:.N - findInterval(meta_sale_date %m-% months(6), meta_sale_date), + seq_len(.N) - findInterval(meta_sale_date %m-% months(6), meta_sale_date), align = "right", adaptive = TRUE, na.rm = TRUE, @@ -569,7 +587,7 @@ all_sales_data_dt <- all_sales_data[ ), time_sale_roll_mean_nbhd_condo_t0_w3 = data.table::frollmean( lag_nbhd_condo_t0_price, - 1:.N - findInterval(meta_sale_date %m-% months(12), meta_sale_date), + seq_len(.N) - findInterval(meta_sale_date %m-% months(12), meta_sale_date), align = "right", adaptive = TRUE, na.rm = TRUE, @@ -577,7 +595,7 @@ all_sales_data_dt <- all_sales_data[ ), time_sale_roll_mean_nbhd_condo_t1_w1 = data.table::frollmean( lag_nbhd_condo_t1_price, - 1:.N - findInterval(meta_sale_date %m-% months(3), meta_sale_date), + seq_len(.N) - findInterval(meta_sale_date %m-% months(3), meta_sale_date), align = "right", adaptive = TRUE, na.rm = TRUE, @@ -585,7 +603,7 @@ all_sales_data_dt <- all_sales_data[ ), time_sale_roll_mean_nbhd_condo_t1_w2 = data.table::frollmean( lag_nbhd_condo_t1_price, - 1:.N - findInterval(meta_sale_date %m-% months(6), meta_sale_date), + seq_len(.N) - findInterval(meta_sale_date %m-% months(6), meta_sale_date), align = "right", adaptive = TRUE, na.rm = TRUE, @@ -593,7 +611,7 @@ all_sales_data_dt <- all_sales_data[ ), time_sale_roll_mean_nbhd_condo_t1_w3 = data.table::frollmean( lag_nbhd_condo_t1_price, - 1:.N - findInterval(meta_sale_date %m-% months(12), meta_sale_date), + seq_len(.N) - findInterval(meta_sale_date %m-% months(12), meta_sale_date), align = "right", adaptive = TRUE, na.rm = TRUE, @@ -601,7 +619,7 @@ all_sales_data_dt <- all_sales_data[ ), time_sale_roll_mean_nbhd_condo_t2_w1 = data.table::frollmean( lag_nbhd_condo_t2_price, - 1:.N - findInterval(meta_sale_date %m-% months(3), meta_sale_date), + seq_len(.N) - findInterval(meta_sale_date %m-% months(3), meta_sale_date), align = "right", adaptive = TRUE, na.rm = TRUE, @@ -609,7 +627,7 @@ all_sales_data_dt <- all_sales_data[ ), time_sale_roll_mean_nbhd_condo_t2_w2 = data.table::frollmean( lag_nbhd_condo_t2_price, - 1:.N - findInterval(meta_sale_date %m-% months(6), meta_sale_date), + seq_len(.N) - findInterval(meta_sale_date %m-% months(6), meta_sale_date), align = "right", adaptive = TRUE, na.rm = TRUE, @@ -617,7 +635,7 @@ all_sales_data_dt <- all_sales_data[ ), time_sale_roll_mean_nbhd_condo_t2_w3 = data.table::frollmean( lag_nbhd_condo_t2_price, - 1:.N - findInterval(meta_sale_date %m-% months(12), meta_sale_date), + seq_len(.N) - findInterval(meta_sale_date %m-% months(12), meta_sale_date), align = "right", adaptive = TRUE, na.rm = TRUE, From 25f78467bf76f3179c2e18cdb599779e79e70d60 Mon Sep 17 00:00:00 2001 From: Sweaty Handshake Date: Tue, 28 Jan 2025 20:29:43 +0000 Subject: [PATCH 04/17] Re-knit readme --- README.md | 23 +++++++++++++++++++++++ docs/data-dict.csv | 23 +++++++++++++++++++++++ 2 files changed, 46 insertions(+) diff --git a/README.md b/README.md index b0c92d2..4e9767f 100644 --- a/README.md +++ b/README.md @@ -114,10 +114,15 @@ ones used in the most recent assessment model. | Total Condominium Building Non-Livable Parcels | char_building_non_units | Count of non-livable 14-digit PINs | Characteristic | numeric | X | | Condominium Building Is Mixed Use | char_bldg_is_mixed_use | The 10-digit PIN (building) contains a 14-digit PIN that is neither class 299 nor 399 | Characteristic | logical | X | | Total Condominium Building Square Footage | char_building_sf | Square footage of the *building* (PIN10) containing this unit | Characteristic | numeric | X | +| Building Square Footage | char_building_sf | Square footage of the *building* (PIN10) containing this unit | Characteristic | numeric | X | | Condominium Unit Square Footage | char_unit_sf | Square footage of the condominium unit associated with this PIN | Characteristic | numeric | X | +| Unit Square Footage | char_unit_sf | Square footage of the condominium unit associated with this PIN | Characteristic | numeric | X | | Condominium Unit Bedrooms | char_bedrooms | Number of bedrooms in the building | Characteristic | numeric | X | +| Bedrooms | char_bedrooms | Number of bedrooms in the building | Characteristic | numeric | X | | Condominium Unit Half Baths | char_half_baths | Number of half baths | Characteristic | numeric | X | +| Half Baths | char_half_baths | Number of half baths | Characteristic | numeric | X | | Condominium Unit Full Baths | char_full_baths | Number of full bathrooms | Characteristic | numeric | X | +| Full Baths | char_full_baths | Number of full bathrooms | Characteristic | numeric | X | | Condominium % Ownership | meta_tieback_proration_rate | Proration rate applied to the PIN | Meta | numeric | X | | Condominium Building Strata 1 | meta_strata_1 | Condominium Building Strata - 10 Levels | Meta | character | X | | Condominium Building Strata 2 | meta_strata_2 | Condominium Building Strata - 100 Levels | Meta | character | X | @@ -135,6 +140,24 @@ ones used in the most recent assessment model. | Average Daily Traffic Count on Nearest Collector Road | prox_nearest_road_collector_daily_traffic | Daily traffic of nearest collector road | Proximity | numeric | X | | Nearest New Construction (Feet) | prox_nearest_new_construction_dist_ft | Nearest new construction distance (feet) | Proximity | numeric | X | | Nearest Major Stadium (Feet) | prox_nearest_stadium_dist_ft | Nearest stadium distance (feet) | Proximity | numeric | X | +| NA | time_sale_roll_mean_nbhd_sf_t0_w1 | | NA | NA | X | +| NA | time_sale_roll_mean_nbhd_sf_t0_w2 | | NA | NA | X | +| NA | time_sale_roll_mean_nbhd_sf_t0_w3 | | NA | NA | X | +| NA | time_sale_roll_mean_nbhd_sf_t1_w1 | | NA | NA | X | +| NA | time_sale_roll_mean_nbhd_sf_t1_w2 | | NA | NA | X | +| NA | time_sale_roll_mean_nbhd_sf_t1_w3 | | NA | NA | X | +| NA | time_sale_roll_mean_nbhd_sf_t2_w1 | | NA | NA | X | +| NA | time_sale_roll_mean_nbhd_sf_t2_w2 | | NA | NA | X | +| NA | time_sale_roll_mean_nbhd_sf_t2_w3 | | NA | NA | X | +| NA | time_sale_roll_mean_nbhd_condo_t0_w1 | | NA | NA | X | +| NA | time_sale_roll_mean_nbhd_condo_t0_w2 | | NA | NA | X | +| NA | time_sale_roll_mean_nbhd_condo_t0_w3 | | NA | NA | X | +| NA | time_sale_roll_mean_nbhd_condo_t1_w1 | | NA | NA | X | +| NA | time_sale_roll_mean_nbhd_condo_t1_w2 | | NA | NA | X | +| NA | time_sale_roll_mean_nbhd_condo_t1_w3 | | NA | NA | X | +| NA | time_sale_roll_mean_nbhd_condo_t2_w1 | | NA | NA | X | +| NA | time_sale_roll_mean_nbhd_condo_t2_w2 | | NA | NA | X | +| NA | time_sale_roll_mean_nbhd_condo_t2_w3 | | NA | NA | X | | Percent Population Age, Under 19 Years Old | acs5_percent_age_children | Percent of the people 17 years or younger | ACS5 | numeric | | | Percent Population Age, Over 65 Years Old | acs5_percent_age_senior | Percent of the people 65 years or older | ACS5 | numeric | | | Median Population Age | acs5_median_age_total | Median age for whole population | ACS5 | numeric | | diff --git a/docs/data-dict.csv b/docs/data-dict.csv index fd8cbac..d902a23 100644 --- a/docs/data-dict.csv +++ b/docs/data-dict.csv @@ -4,10 +4,15 @@ Total Condominium Building Livable Parcels,char_building_units,Count of livable Total Condominium Building Non-Livable Parcels,char_building_non_units,Count of non-livable 14-digit PINs,Characteristic,numeric,TRUE Condominium Building Is Mixed Use,char_bldg_is_mixed_use,The 10-digit PIN (building) contains a 14-digit PIN that is neither class 299 nor 399,Characteristic,logical,TRUE Total Condominium Building Square Footage,char_building_sf,Square footage of the _building_ (PIN10) containing this unit,Characteristic,numeric,TRUE +Building Square Footage,char_building_sf,Square footage of the _building_ (PIN10) containing this unit,Characteristic,numeric,TRUE Condominium Unit Square Footage,char_unit_sf,Square footage of the condominium unit associated with this PIN,Characteristic,numeric,TRUE +Unit Square Footage,char_unit_sf,Square footage of the condominium unit associated with this PIN,Characteristic,numeric,TRUE Condominium Unit Bedrooms,char_bedrooms,Number of bedrooms in the building,Characteristic,numeric,TRUE +Bedrooms,char_bedrooms,Number of bedrooms in the building,Characteristic,numeric,TRUE Condominium Unit Half Baths,char_half_baths,Number of half baths,Characteristic,numeric,TRUE +Half Baths,char_half_baths,Number of half baths,Characteristic,numeric,TRUE Condominium Unit Full Baths,char_full_baths,Number of full bathrooms,Characteristic,numeric,TRUE +Full Baths,char_full_baths,Number of full bathrooms,Characteristic,numeric,TRUE Condominium % Ownership,meta_tieback_proration_rate,Proration rate applied to the PIN,Meta,numeric,TRUE Condominium Building Strata 1,meta_strata_1,Condominium Building Strata - 10 Levels,Meta,character,TRUE Condominium Building Strata 2,meta_strata_2,Condominium Building Strata - 100 Levels,Meta,character,TRUE @@ -25,6 +30,24 @@ Average Daily Traffic Count on Nearest Arterial Road,prox_nearest_road_arterial_ Average Daily Traffic Count on Nearest Collector Road,prox_nearest_road_collector_daily_traffic,Daily traffic of nearest collector road,Proximity,numeric,TRUE Nearest New Construction (Feet),prox_nearest_new_construction_dist_ft,Nearest new construction distance (feet),Proximity,numeric,TRUE Nearest Major Stadium (Feet),prox_nearest_stadium_dist_ft,Nearest stadium distance (feet),Proximity,numeric,TRUE +NA,time_sale_roll_mean_nbhd_sf_t0_w1,,NA,NA,TRUE +NA,time_sale_roll_mean_nbhd_sf_t0_w2,,NA,NA,TRUE +NA,time_sale_roll_mean_nbhd_sf_t0_w3,,NA,NA,TRUE +NA,time_sale_roll_mean_nbhd_sf_t1_w1,,NA,NA,TRUE +NA,time_sale_roll_mean_nbhd_sf_t1_w2,,NA,NA,TRUE +NA,time_sale_roll_mean_nbhd_sf_t1_w3,,NA,NA,TRUE +NA,time_sale_roll_mean_nbhd_sf_t2_w1,,NA,NA,TRUE +NA,time_sale_roll_mean_nbhd_sf_t2_w2,,NA,NA,TRUE +NA,time_sale_roll_mean_nbhd_sf_t2_w3,,NA,NA,TRUE +NA,time_sale_roll_mean_nbhd_condo_t0_w1,,NA,NA,TRUE +NA,time_sale_roll_mean_nbhd_condo_t0_w2,,NA,NA,TRUE +NA,time_sale_roll_mean_nbhd_condo_t0_w3,,NA,NA,TRUE +NA,time_sale_roll_mean_nbhd_condo_t1_w1,,NA,NA,TRUE +NA,time_sale_roll_mean_nbhd_condo_t1_w2,,NA,NA,TRUE +NA,time_sale_roll_mean_nbhd_condo_t1_w3,,NA,NA,TRUE +NA,time_sale_roll_mean_nbhd_condo_t2_w1,,NA,NA,TRUE +NA,time_sale_roll_mean_nbhd_condo_t2_w2,,NA,NA,TRUE +NA,time_sale_roll_mean_nbhd_condo_t2_w3,,NA,NA,TRUE "Percent Population Age, Under 19 Years Old",acs5_percent_age_children,Percent of the people 17 years or younger,ACS5,numeric,FALSE "Percent Population Age, Over 65 Years Old",acs5_percent_age_senior,Percent of the people 65 years or older,ACS5,numeric,FALSE Median Population Age,acs5_median_age_total,Median age for whole population,ACS5,numeric,FALSE From 8d8fef0e3019ea982b97f53a24bbd83754a92440 Mon Sep 17 00:00:00 2001 From: Sweaty Handshake Date: Tue, 28 Jan 2025 20:32:56 +0000 Subject: [PATCH 05/17] Linting --- pipeline/00-ingest.R | 29 +++++++++++++++++------------ 1 file changed, 17 insertions(+), 12 deletions(-) diff --git a/pipeline/00-ingest.R b/pipeline/00-ingest.R index ccef58e..a9ea0bc 100644 --- a/pipeline/00-ingest.R +++ b/pipeline/00-ingest.R @@ -447,7 +447,7 @@ all_sales_data <- sf_sales_data %>% select( meta_sale_document_num, meta_sale_price, meta_sale_date, sv_is_outlier, meta_township_code, meta_nbhd_code - ) %>% + ) %>% mutate( ind_pin_is_multicard = FALSE, regression_group = "condo" @@ -463,18 +463,23 @@ all_sales_data_dt <- all_sales_data[ !sv_is_outlier & !ind_pin_is_multicard, `:=`( lag_nbhd_sf_t0_price = data.table::shift(meta_sale_price_sf, 1, type = "lag"), - lag_nbhd_sf_t1_shift = (seq_len(.N) - findInterval(meta_sale_date %m-% months(3), - meta_sale_date)) * 2 - 1, - lag_nbhd_sf_t2_shift = (seq_len(.N) - findInterval(meta_sale_date %m-% months(12), - meta_sale_date)) * 2 - 1, + lag_nbhd_sf_t1_shift = (seq_len(.N) - findInterval( + meta_sale_date %m-% months(3), + meta_sale_date + )) * 2 - 1, + lag_nbhd_sf_t2_shift = (seq_len(.N) - findInterval( + meta_sale_date %m-% months(12), + meta_sale_date + )) * 2 - 1, lag_nbhd_condo_t0_price = data.table::shift(meta_sale_price_condo, 1, - type = "lag"), + type = "lag" + ), lag_nbhd_condo_t1_shift = (seq_len(.N) - findInterval( meta_sale_date %m-% months(3), meta_sale_date - )) * 2 - 1, + )) * 2 - 1, lag_nbhd_condo_t2_shift = (seq_len(.N) - findInterval( meta_sale_date %m-% months(12), meta_sale_date - )) * 2 - 1 + )) * 2 - 1 ), by = .(meta_nbhd_code) ][ @@ -482,16 +487,16 @@ all_sales_data_dt <- all_sales_data[ `:=`( lag_nbhd_sf_t1_price = meta_sale_price_sf[replace( seq(.N) - lag_nbhd_sf_t1_shift, seq(.N) <= lag_nbhd_sf_t1_shift, NA - )], + )], lag_nbhd_sf_t2_price = meta_sale_price_sf[replace( seq(.N) - lag_nbhd_sf_t2_shift, seq(.N) <= lag_nbhd_sf_t2_shift, NA - )], + )], lag_nbhd_condo_t1_price = meta_sale_price_condo[replace( seq(.N) - lag_nbhd_condo_t1_shift, seq(.N) <= lag_nbhd_condo_t1_shift, NA - )], + )], lag_nbhd_condo_t2_price = meta_sale_price_condo[replace( seq(.N) - lag_nbhd_condo_t2_shift, seq(.N) <= lag_nbhd_condo_t2_shift, NA - )] + )] ), by = .(meta_nbhd_code) ][ From a279cccfb560b27fc87ebdc86fa27d5d2b56ce30 Mon Sep 17 00:00:00 2001 From: Sweaty Handshake Date: Tue, 28 Jan 2025 20:38:47 +0000 Subject: [PATCH 06/17] Linting --- pipeline/00-ingest.R | 33 +++++++++++++++++++++++++-------- 1 file changed, 25 insertions(+), 8 deletions(-) diff --git a/pipeline/00-ingest.R b/pipeline/00-ingest.R index a9ea0bc..f3e0912 100644 --- a/pipeline/00-ingest.R +++ b/pipeline/00-ingest.R @@ -455,14 +455,19 @@ all_sales_data <- sf_sales_data %>% ) %>% mutate( meta_sale_price_sf = ifelse(regression_group == "sf", meta_sale_price, NA), - meta_sale_price_condo = ifelse(regression_group == "condo", meta_sale_price, NA), + meta_sale_price_condo = ifelse( + regression_group == "condo", meta_sale_price, NA + ), ) %>% arrange(meta_sale_date) all_sales_data_dt <- all_sales_data[ !sv_is_outlier & !ind_pin_is_multicard, `:=`( - lag_nbhd_sf_t0_price = data.table::shift(meta_sale_price_sf, 1, type = "lag"), + lag_nbhd_sf_t0_price = data.table::shift( + meta_sale_price_sf, 1, + type = "lag" + ), lag_nbhd_sf_t1_shift = (seq_len(.N) - findInterval( meta_sale_date %m-% months(3), meta_sale_date @@ -520,7 +525,9 @@ all_sales_data_dt <- all_sales_data[ ), time_sale_roll_mean_nbhd_sf_t0_w3 = data.table::frollmean( lag_nbhd_sf_t0_price, - seq_len(.N) - findInterval(meta_sale_date %m-% months(12), meta_sale_date), + seq_len(.N) - findInterval( + meta_sale_date %m-% months(12), meta_sale_date + ), align = "right", adaptive = TRUE, na.rm = TRUE, @@ -544,7 +551,9 @@ all_sales_data_dt <- all_sales_data[ ), time_sale_roll_mean_nbhd_sf_t1_w3 = data.table::frollmean( lag_nbhd_sf_t1_price, - seq_len(.N) - findInterval(meta_sale_date %m-% months(12), meta_sale_date), + seq_len(.N) - findInterval( + meta_sale_date %m-% months(12), meta_sale_date + ), align = "right", adaptive = TRUE, na.rm = TRUE, @@ -568,7 +577,9 @@ all_sales_data_dt <- all_sales_data[ ), time_sale_roll_mean_nbhd_sf_t2_w3 = data.table::frollmean( lag_nbhd_sf_t2_price, - seq_len(.N) - findInterval(meta_sale_date %m-% months(12), meta_sale_date), + seq_len(.N) - findInterval( + meta_sale_date %m-% months(12), meta_sale_date + ), align = "right", adaptive = TRUE, na.rm = TRUE, @@ -592,7 +603,9 @@ all_sales_data_dt <- all_sales_data[ ), time_sale_roll_mean_nbhd_condo_t0_w3 = data.table::frollmean( lag_nbhd_condo_t0_price, - seq_len(.N) - findInterval(meta_sale_date %m-% months(12), meta_sale_date), + seq_len(.N) - findInterval( + meta_sale_date %m-% months(12), meta_sale_date + ), align = "right", adaptive = TRUE, na.rm = TRUE, @@ -616,7 +629,9 @@ all_sales_data_dt <- all_sales_data[ ), time_sale_roll_mean_nbhd_condo_t1_w3 = data.table::frollmean( lag_nbhd_condo_t1_price, - seq_len(.N) - findInterval(meta_sale_date %m-% months(12), meta_sale_date), + seq_len(.N) - findInterval( + meta_sale_date %m-% months(12), meta_sale_date + ), align = "right", adaptive = TRUE, na.rm = TRUE, @@ -640,7 +655,9 @@ all_sales_data_dt <- all_sales_data[ ), time_sale_roll_mean_nbhd_condo_t2_w3 = data.table::frollmean( lag_nbhd_condo_t2_price, - seq_len(.N) - findInterval(meta_sale_date %m-% months(12), meta_sale_date), + seq_len(.N) - findInterval( + meta_sale_date %m-% months(12), meta_sale_date + ), align = "right", adaptive = TRUE, na.rm = TRUE, From bdebe0aa4bc591da232e9ba580cf4e9a42b81301 Mon Sep 17 00:00:00 2001 From: Sweaty Handshake Date: Tue, 28 Jan 2025 20:45:36 +0000 Subject: [PATCH 07/17] Commenting --- pipeline/00-ingest.R | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/pipeline/00-ingest.R b/pipeline/00-ingest.R index f3e0912..098d3db 100644 --- a/pipeline/00-ingest.R +++ b/pipeline/00-ingest.R @@ -123,8 +123,8 @@ land_nbhd_rate_data <- dbGetQuery( ) tictoc::toc() -# Pull the single family sales data. It will only be used to construct rolling -# price averages by neighborhood. +# Pull single family sales data to construct rolling price averages by +# neighborhood. tictoc::tic("Single-family sales data pulled") sf_sales_data <- dbGetQuery( conn = AWS_ATHENA_CONN_NOCTUA, glue(" @@ -156,7 +156,7 @@ sf_sales_data <- dbGetQuery( sv_is_outlier = replace_na(sv_is_outlier, FALSE), ind_pin_is_multicard = FALSE ) %>% - # Keep multicard sales since we are only using them to construct sale price + # We keep multicard sales since we are only using them to construct sale price # trends, but we still need the sales sample to be unique by document number distinct(meta_sale_document_num, .keep_all = TRUE) tictoc::toc() @@ -730,7 +730,8 @@ assessment_data_clean <- assessment_data %>% relocate(starts_with("char_"), .after = starts_with("ind_")) %>% as_tibble() - +# Join rolling sales means for condo and single-family sales onto assessment +# data assessment_data_clean <- assessment_data_clean %>% left_join( all_sales_data_dt %>% From abc5b528fafec7641b7206a0b5d38766bd5351a2 Mon Sep 17 00:00:00 2001 From: Sweaty Handshake Date: Tue, 28 Jan 2025 21:04:17 +0000 Subject: [PATCH 08/17] Update dvc --- dvc.lock | 20 ++--- dvc.yaml | 245 +++++++++++++++++++++++++++---------------------------- 2 files changed, 132 insertions(+), 133 deletions(-) diff --git a/dvc.lock b/dvc.lock index 78d2332..5ac2d88 100644 --- a/dvc.lock +++ b/dvc.lock @@ -5,8 +5,8 @@ stages: deps: - path: pipeline/00-ingest.R hash: md5 - md5: 816b28ff1c68d17a9082d0dc839a85c0 - size: 22844 + md5: 23cc4b47117df6743662074b59c73974 + size: 32149 params: params.yaml: assessment: @@ -31,24 +31,24 @@ stages: outs: - path: input/assessment_data.parquet hash: md5 - md5: 9a13f7248f1d80079be339ed1d995088 - size: 86228842 + md5: 1e6172720428b0b27c223322dc1286e4 + size: 88749701 - path: input/char_data.parquet hash: md5 - md5: 23b25c36873492d884125a3c8ee2dfbb - size: 160028159 + md5: 85c92e5918961520b4e082a13208ce07 + size: 158484898 - path: input/condo_strata_data.parquet hash: md5 - md5: b5a85462a7f4de94916b228be45ccd75 + md5: 6506d1ba3229ff42883c78d6db9adf6d size: 40543 - path: input/land_nbhd_rate_data.parquet hash: md5 - md5: f3ec9627322bd271bf2957b7388aaa34 + md5: 4b3c72021c15daf8309d0029987da9f2 size: 3873 - path: input/training_data.parquet hash: md5 - md5: e818848026f6dc6e3d6af9b8d6b34641 - size: 79923460 + md5: 75901c01f816512e182cf2af3f2f7787 + size: 102870818 train: cmd: Rscript pipeline/01-train.R deps: diff --git a/dvc.yaml b/dvc.yaml index 8c6dfcf..b6eaefa 100644 --- a/dvc.yaml +++ b/dvc.yaml @@ -4,52 +4,51 @@ stages: desc: > Ingest training and assessment data from Athena + generate condo strata deps: - - pipeline/00-ingest.R + - pipeline/00-ingest.R params: - - assessment - - input + - assessment + - input outs: - - input/assessment_data.parquet - - input/char_data.parquet - - input/condo_strata_data.parquet - - input/land_nbhd_rate_data.parquet - - input/training_data.parquet + - input/assessment_data.parquet + - input/char_data.parquet + - input/condo_strata_data.parquet + - input/land_nbhd_rate_data.parquet + - input/training_data.parquet frozen: true - train: cmd: Rscript pipeline/01-train.R desc: > Train a LightGBM model with cross-validation. Generate model objects, data recipes, and predictions on the test set (most recent 10% of sales) deps: - - pipeline/01-train.R - - input/training_data.parquet + - pipeline/01-train.R + - input/training_data.parquet params: - - cv - - model.engine - - model.hyperparameter - - model.objective - - model.parameter - - model.predictor - - model.seed - - model.verbose - - ratio_study - - toggle.cv_enable + - cv + - model.engine + - model.hyperparameter + - model.objective + - model.parameter + - model.predictor + - model.seed + - model.verbose + - ratio_study + - toggle.cv_enable outs: - - output/intermediate/timing/model_timing_train.parquet: - cache: false - - output/parameter_final/model_parameter_final.parquet: - cache: false - - output/parameter_range/model_parameter_range.parquet: - cache: false - - output/parameter_search/model_parameter_search.parquet: - cache: false - - output/test_card/model_test_card.parquet: - cache: false - - output/workflow/fit/model_workflow_fit.zip: - cache: false - - output/workflow/recipe/model_workflow_recipe.rds: - cache: false + - output/intermediate/timing/model_timing_train.parquet: + cache: false + - output/parameter_final/model_parameter_final.parquet: + cache: false + - output/parameter_range/model_parameter_range.parquet: + cache: false + - output/parameter_search/model_parameter_search.parquet: + cache: false + - output/test_card/model_test_card.parquet: + cache: false + - output/workflow/fit/model_workflow_fit.zip: + cache: false + - output/workflow/recipe/model_workflow_recipe.rds: + cache: false assess: cmd: Rscript pipeline/02-assess.R @@ -58,25 +57,25 @@ stages: County. Also generate flags, calculate land values, and make any post-modeling changes deps: - - pipeline/02-assess.R - - input/assessment_data.parquet - - input/condo_strata_data.parquet - - input/land_nbhd_rate_data.parquet - - input/training_data.parquet - - output/workflow/fit/model_workflow_fit.zip - - output/workflow/recipe/model_workflow_recipe.rds + - pipeline/02-assess.R + - input/assessment_data.parquet + - input/condo_strata_data.parquet + - input/land_nbhd_rate_data.parquet + - input/training_data.parquet + - output/workflow/fit/model_workflow_fit.zip + - output/workflow/recipe/model_workflow_recipe.rds params: - - assessment - - model.predictor.all - - pv - - ratio_study + - assessment + - model.predictor.all + - pv + - ratio_study outs: - - output/assessment_card/model_assessment_card.parquet: - cache: false - - output/assessment_pin/model_assessment_pin.parquet: - cache: false - - output/intermediate/timing/model_timing_assess.parquet: - cache: false + - output/assessment_card/model_assessment_card.parquet: + cache: false + - output/assessment_pin/model_assessment_pin.parquet: + cache: false + - output/intermediate/timing/model_timing_assess.parquet: + cache: false evaluate: cmd: Rscript pipeline/03-evaluate.R @@ -86,23 +85,23 @@ stages: 2. An assessor-specific ratio study comparing estimated assessments to the previous year's sales deps: - - pipeline/03-evaluate.R - - output/assessment_pin/model_assessment_pin.parquet - - output/test_card/model_test_card.parquet + - pipeline/03-evaluate.R + - output/assessment_pin/model_assessment_pin.parquet + - output/test_card/model_test_card.parquet params: - - assessment - - ratio_study + - assessment + - ratio_study outs: - - output/performance/model_performance_test.parquet: - cache: false - - output/performance_quantile/model_performance_quantile_test.parquet: - cache: false - - output/performance/model_performance_assessment.parquet: - cache: false - - output/performance_quantile/model_performance_quantile_assessment.parquet: - cache: false - - output/intermediate/timing/model_timing_evaluate.parquet: - cache: false + - output/performance/model_performance_test.parquet: + cache: false + - output/performance_quantile/model_performance_quantile_test.parquet: + cache: false + - output/performance/model_performance_assessment.parquet: + cache: false + - output/performance_quantile/model_performance_quantile_assessment.parquet: + cache: false + - output/intermediate/timing/model_timing_evaluate.parquet: + cache: false interpret: cmd: Rscript pipeline/04-interpret.R @@ -110,20 +109,20 @@ stages: Generate SHAP values for each card and feature as well as feature importance metrics for each feature deps: - - pipeline/04-interpret.R - - input/assessment_data.parquet - - output/workflow/fit/model_workflow_fit.zip - - output/workflow/recipe/model_workflow_recipe.rds + - pipeline/04-interpret.R + - input/assessment_data.parquet + - output/workflow/fit/model_workflow_fit.zip + - output/workflow/recipe/model_workflow_recipe.rds params: - - toggle.shap_enable - - model.predictor.all + - toggle.shap_enable + - model.predictor.all outs: - - output/shap/model_shap.parquet: - cache: false - - output/feature_importance/model_feature_importance.parquet: - cache: false - - output/intermediate/timing/model_timing_interpret.parquet: - cache: false + - output/shap/model_shap.parquet: + cache: false + - output/feature_importance/model_feature_importance.parquet: + cache: false + - output/intermediate/timing/model_timing_interpret.parquet: + cache: false finalize: cmd: Rscript pipeline/05-finalize.R @@ -131,28 +130,28 @@ stages: Save run timings and run metadata to disk and render a performance report using Quarto. deps: - - pipeline/05-finalize.R - - output/intermediate/timing/model_timing_train.parquet - - output/intermediate/timing/model_timing_assess.parquet - - output/intermediate/timing/model_timing_evaluate.parquet - - output/intermediate/timing/model_timing_interpret.parquet + - pipeline/05-finalize.R + - output/intermediate/timing/model_timing_train.parquet + - output/intermediate/timing/model_timing_assess.parquet + - output/intermediate/timing/model_timing_evaluate.parquet + - output/intermediate/timing/model_timing_interpret.parquet params: - - run_note - - toggle - - input - - cv - - model - - pv - - ratio_study + - run_note + - toggle + - input + - cv + - model + - pv + - ratio_study outs: - - output/intermediate/timing/model_timing_finalize.parquet: - cache: false - - output/timing/model_timing.parquet: - cache: false - - output/metadata/model_metadata.parquet: - cache: false - - reports/performance/performance.html: - cache: false + - output/intermediate/timing/model_timing_finalize.parquet: + cache: false + - output/timing/model_timing.parquet: + cache: false + - output/metadata/model_metadata.parquet: + cache: false + - reports/performance/performance.html: + cache: false upload: cmd: Rscript pipeline/06-upload.R @@ -162,24 +161,24 @@ stages: outputs prior to upload and attach a unique run ID. This step requires access to the CCAO Data AWS account, and so is assumed to be internal-only deps: - - pipeline/06-upload.R - - output/parameter_final/model_parameter_final.parquet - - output/parameter_range/model_parameter_range.parquet - - output/parameter_search/model_parameter_search.parquet - - output/workflow/fit/model_workflow_fit.zip - - output/workflow/recipe/model_workflow_recipe.rds - - output/test_card/model_test_card.parquet - - output/assessment_card/model_assessment_card.parquet - - output/assessment_pin/model_assessment_pin.parquet - - output/performance/model_performance_test.parquet - - output/performance_quantile/model_performance_quantile_test.parquet - - output/performance/model_performance_assessment.parquet - - output/performance_quantile/model_performance_quantile_assessment.parquet - - output/shap/model_shap.parquet - - output/feature_importance/model_feature_importance.parquet - - output/metadata/model_metadata.parquet - - output/timing/model_timing.parquet - - reports/performance/performance.html + - pipeline/06-upload.R + - output/parameter_final/model_parameter_final.parquet + - output/parameter_range/model_parameter_range.parquet + - output/parameter_search/model_parameter_search.parquet + - output/workflow/fit/model_workflow_fit.zip + - output/workflow/recipe/model_workflow_recipe.rds + - output/test_card/model_test_card.parquet + - output/assessment_card/model_assessment_card.parquet + - output/assessment_pin/model_assessment_pin.parquet + - output/performance/model_performance_test.parquet + - output/performance_quantile/model_performance_quantile_test.parquet + - output/performance/model_performance_assessment.parquet + - output/performance_quantile/model_performance_quantile_assessment.parquet + - output/shap/model_shap.parquet + - output/feature_importance/model_feature_importance.parquet + - output/metadata/model_metadata.parquet + - output/timing/model_timing.parquet + - reports/performance/performance.html export: cmd: Rscript pipeline/07-export.R @@ -188,11 +187,11 @@ stages: run. NOT automatically run since it is typically only run once. Manually run once a model is selected deps: - - pipeline/07-export.R + - pipeline/07-export.R params: - - assessment.year - - input.min_sale_year - - input.max_sale_year - - ratio_study - - export + - assessment.year + - input.min_sale_year + - input.max_sale_year + - ratio_study + - export frozen: true From c41e13e8a970c48e4d7516a3bb3270cddbf190ea Mon Sep 17 00:00:00 2001 From: Sweaty Handshake Date: Wed, 29 Jan 2025 17:37:41 +0000 Subject: [PATCH 09/17] Fill NAs in training set --- DESCRIPTION | 3 ++- pipeline/00-ingest.R | 16 ++++++++++++---- 2 files changed, 14 insertions(+), 5 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index 04a0340..9161ef9 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -31,7 +31,8 @@ Depends: tune, workflows, yaml, - yardstick + yardstick, + zoo Remotes: ccao-data/assessr, ccao-data/ccao, diff --git a/pipeline/00-ingest.R b/pipeline/00-ingest.R index 098d3db..6bc3756 100644 --- a/pipeline/00-ingest.R +++ b/pipeline/00-ingest.R @@ -666,13 +666,21 @@ all_sales_data_dt <- all_sales_data[ ), ] +# Remove NaNs all_sales_data_dt <- all_sales_data_dt %>% - mutate(across(.cols = everything(), ~ ifelse(is.nan(.x), NA, .x))) + mutate(across(.cols = starts_with("time"), ~ ifelse(is.nan(.x), NA, .x))) -# Join rolling sales means for condo and single-family sales onto training data +# Join rolling sales means for condo and single-family sales onto training data. +# First make sure NAs are filled using linear approximation, then backwards fill +# for first sale in each neighborhood. training_data_clean <- training_data_clean %>% left_join( all_sales_data_dt %>% + group_by(meta_nbhd_code) %>% + arrange(meta_sale_date) %>% + mutate(across(starts_with("time"), ~ na.approx(.x, na.rm = FALSE))) %>% + fill(starts_with("time"), .direction = "up") %>% + ungroup() %>% select(meta_sale_document_num, starts_with("time")), by = "meta_sale_document_num" ) @@ -736,8 +744,8 @@ assessment_data_clean <- assessment_data_clean %>% left_join( all_sales_data_dt %>% group_by(meta_nbhd_code) %>% - arrange(desc(meta_sale_date)) %>% - fill(starts_with("time"), .direction = "up") %>% + arrange(meta_sale_date) %>% + fill(starts_with("time"), .direction = "down") %>% slice_head(n = 1) %>% ungroup() %>% select(meta_nbhd_code, starts_with("time")), From 729f171431e76c9355802fbb45a7a03e208a89c1 Mon Sep 17 00:00:00 2001 From: Sweaty Handshake Date: Wed, 29 Jan 2025 17:44:44 +0000 Subject: [PATCH 10/17] Fix assess fill --- pipeline/00-ingest.R | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pipeline/00-ingest.R b/pipeline/00-ingest.R index 6bc3756..92270d1 100644 --- a/pipeline/00-ingest.R +++ b/pipeline/00-ingest.R @@ -744,8 +744,8 @@ assessment_data_clean <- assessment_data_clean %>% left_join( all_sales_data_dt %>% group_by(meta_nbhd_code) %>% - arrange(meta_sale_date) %>% - fill(starts_with("time"), .direction = "down") %>% + arrange(desc(meta_sale_date)) %>% + fill(starts_with("time"), .direction = "up") %>% slice_head(n = 1) %>% ungroup() %>% select(meta_nbhd_code, starts_with("time")), From 8b10c39f3061fea4208e853190fd1343d2907b25 Mon Sep 17 00:00:00 2001 From: Sweaty Handshake Date: Wed, 29 Jan 2025 17:49:56 +0000 Subject: [PATCH 11/17] Commenting --- pipeline/00-ingest.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pipeline/00-ingest.R b/pipeline/00-ingest.R index 92270d1..55fa204 100644 --- a/pipeline/00-ingest.R +++ b/pipeline/00-ingest.R @@ -739,7 +739,7 @@ assessment_data_clean <- assessment_data %>% as_tibble() # Join rolling sales means for condo and single-family sales onto assessment -# data +# data. Use the most recent rolling mean per neighborhood. assessment_data_clean <- assessment_data_clean %>% left_join( all_sales_data_dt %>% From 66b6f42be98a9f5a0860df737f33131812f1c705 Mon Sep 17 00:00:00 2001 From: Sweaty Handshake Date: Wed, 29 Jan 2025 18:20:01 +0000 Subject: [PATCH 12/17] Use time-weighted for interpolation --- pipeline/00-ingest.R | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pipeline/00-ingest.R b/pipeline/00-ingest.R index 55fa204..7fb9e85 100644 --- a/pipeline/00-ingest.R +++ b/pipeline/00-ingest.R @@ -671,14 +671,14 @@ all_sales_data_dt <- all_sales_data_dt %>% mutate(across(.cols = starts_with("time"), ~ ifelse(is.nan(.x), NA, .x))) # Join rolling sales means for condo and single-family sales onto training data. -# First make sure NAs are filled using linear approximation, then backwards fill -# for first sale in each neighborhood. +# First make sure NAs are time-weighted linearly interpolated, then backwards +# fill for first sale in each neighborhood. training_data_clean <- training_data_clean %>% left_join( all_sales_data_dt %>% group_by(meta_nbhd_code) %>% arrange(meta_sale_date) %>% - mutate(across(starts_with("time"), ~ na.approx(.x, na.rm = FALSE))) %>% + mutate(across(starts_with("time"), ~ na.approx(.x, x = meta_sale_date, na.rm = FALSE))) %>% fill(starts_with("time"), .direction = "up") %>% ungroup() %>% select(meta_sale_document_num, starts_with("time")), From d27a7fe060874b28547d6251a0c9d3e6447cd29f Mon Sep 17 00:00:00 2001 From: Sweaty Handshake Date: Wed, 29 Jan 2025 18:20:19 +0000 Subject: [PATCH 13/17] Lint --- pipeline/00-ingest.R | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/pipeline/00-ingest.R b/pipeline/00-ingest.R index 7fb9e85..5ad7c6c 100644 --- a/pipeline/00-ingest.R +++ b/pipeline/00-ingest.R @@ -678,7 +678,9 @@ training_data_clean <- training_data_clean %>% all_sales_data_dt %>% group_by(meta_nbhd_code) %>% arrange(meta_sale_date) %>% - mutate(across(starts_with("time"), ~ na.approx(.x, x = meta_sale_date, na.rm = FALSE))) %>% + mutate(across(starts_with("time"), ~ na.approx( + .x, x = meta_sale_date, na.rm = FALSE + ))) %>% fill(starts_with("time"), .direction = "up") %>% ungroup() %>% select(meta_sale_document_num, starts_with("time")), From a324ae4ec2a0fa918564828dd6802670ae69fc0a Mon Sep 17 00:00:00 2001 From: Sweaty Handshake Date: Wed, 29 Jan 2025 19:53:31 +0000 Subject: [PATCH 14/17] Remove NA filling --- DESCRIPTION | 3 +-- pipeline/00-ingest.R | 10 +--------- 2 files changed, 2 insertions(+), 11 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index 9161ef9..04a0340 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -31,8 +31,7 @@ Depends: tune, workflows, yaml, - yardstick, - zoo + yardstick Remotes: ccao-data/assessr, ccao-data/ccao, diff --git a/pipeline/00-ingest.R b/pipeline/00-ingest.R index 5ad7c6c..56dba5b 100644 --- a/pipeline/00-ingest.R +++ b/pipeline/00-ingest.R @@ -664,6 +664,7 @@ all_sales_data_dt <- all_sales_data[ hasNA = TRUE ) ), + by = .(meta_nbhd_code) ] # Remove NaNs @@ -671,18 +672,9 @@ all_sales_data_dt <- all_sales_data_dt %>% mutate(across(.cols = starts_with("time"), ~ ifelse(is.nan(.x), NA, .x))) # Join rolling sales means for condo and single-family sales onto training data. -# First make sure NAs are time-weighted linearly interpolated, then backwards -# fill for first sale in each neighborhood. training_data_clean <- training_data_clean %>% left_join( all_sales_data_dt %>% - group_by(meta_nbhd_code) %>% - arrange(meta_sale_date) %>% - mutate(across(starts_with("time"), ~ na.approx( - .x, x = meta_sale_date, na.rm = FALSE - ))) %>% - fill(starts_with("time"), .direction = "up") %>% - ungroup() %>% select(meta_sale_document_num, starts_with("time")), by = "meta_sale_document_num" ) From 480b45a8b1bdaecc502c851a6b4a094864e90d4f Mon Sep 17 00:00:00 2001 From: Sweaty Handshake Date: Wed, 29 Jan 2025 20:30:49 +0000 Subject: [PATCH 15/17] Update dvc --- dvc.lock | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/dvc.lock b/dvc.lock index 5bc1678..006d593 100644 --- a/dvc.lock +++ b/dvc.lock @@ -5,8 +5,8 @@ stages: deps: - path: pipeline/00-ingest.R hash: md5 - md5: 23cc4b47117df6743662074b59c73974 - size: 32149 + md5: 5e5a7064ad082589edb27a08497ecc68 + size: 33538 params: params.yaml: assessment: @@ -31,24 +31,24 @@ stages: outs: - path: input/assessment_data.parquet hash: md5 - md5: 1e6172720428b0b27c223322dc1286e4 - size: 88749701 + md5: 4e45e642eb57513bf2e9e8e5aa7062df + size: 78834247 - path: input/char_data.parquet hash: md5 - md5: 85c92e5918961520b4e082a13208ce07 - size: 158484898 + md5: 8d16027948929f5944f5db86b5b43b7f + size: 148499762 - path: input/condo_strata_data.parquet hash: md5 - md5: 6506d1ba3229ff42883c78d6db9adf6d - size: 40543 + md5: ded3ecde590af57e6b98a8935fae0215 + size: 40493 - path: input/land_nbhd_rate_data.parquet hash: md5 - md5: 4b3c72021c15daf8309d0029987da9f2 - size: 3873 + md5: 5fe80edfabdfac91efe888a25ee4051c + size: 6019 - path: input/training_data.parquet hash: md5 - md5: 75901c01f816512e182cf2af3f2f7787 - size: 102870818 + md5: 936eea2a250d1222936eb9f0f81c9d84 + size: 100482987 train: cmd: Rscript pipeline/01-train.R deps: From 0a0eec2e88fd8251e7a23dd7e594834b7cde76c0 Mon Sep 17 00:00:00 2001 From: Sweaty Handshake Date: Thu, 30 Jan 2025 16:11:27 +0000 Subject: [PATCH 16/17] Improve commenting, data table handling --- R/setup.R | 1 + pipeline/00-ingest.R | 33 ++++++++++++++++----------------- 2 files changed, 17 insertions(+), 17 deletions(-) diff --git a/R/setup.R b/R/setup.R index f955b06..e434345 100644 --- a/R/setup.R +++ b/R/setup.R @@ -19,6 +19,7 @@ suppressPackageStartupMessages({ # Resolve package namespace conflicts, preferring the library::function pair # shown over other functions with the same name from different libraries conflicts_prefer( + data.table::`:=`, dplyr::filter, dplyr::first, dplyr::lag, diff --git a/pipeline/00-ingest.R b/pipeline/00-ingest.R index 32c23dd..1d1ec5a 100644 --- a/pipeline/00-ingest.R +++ b/pipeline/00-ingest.R @@ -307,15 +307,15 @@ sf_sales_data <- dbGetQuery( AND NOT sale.sale_filter_deed_type ") ) %>% + # We keep multicard sales since we are only using them to construct sale price + # trends, but we still need the sales sample to be unique by document number + distinct(meta_sale_document_num, .keep_all = TRUE) %>% # Only exclude explicit outliers from training. Sales with missing validation # outcomes will be considered non-outliers mutate( sv_is_outlier = replace_na(sv_is_outlier, FALSE), ind_pin_is_multicard = FALSE - ) %>% - # We keep multicard sales since we are only using them to construct sale price - # trends, but we still need the sales sample to be unique by document number - distinct(meta_sale_document_num, .keep_all = TRUE) + ) tictoc::toc() # Close connection to Athena @@ -505,7 +505,7 @@ all_sales_data <- sf_sales_data %>% ) %>% arrange(meta_sale_date) -all_sales_data_dt <- all_sales_data[ +all_sales_data_rolling <- all_sales_data[ !sv_is_outlier & !ind_pin_is_multicard, `:=`( lag_nbhd_sf_t0_price = data.table::shift( @@ -709,16 +709,15 @@ all_sales_data_dt <- all_sales_data[ ) ), by = .(meta_nbhd_code) -] - -# Remove NaNs -all_sales_data_dt <- all_sales_data_dt %>% +] %>% + as_tibble() %>% + # Replace NaNs mutate(across(.cols = starts_with("time"), ~ ifelse(is.nan(.x), NA, .x))) # Join rolling sales means for condo and single-family sales onto training data. training_data_clean <- training_data_clean %>% left_join( - all_sales_data_dt %>% + all_sales_data_rolling %>% select(meta_sale_document_num, starts_with("time")), by = "meta_sale_document_num" ) @@ -774,15 +773,15 @@ assessment_data_clean <- assessment_data %>% ) %>% relocate(starts_with("ind_"), .after = starts_with("meta_")) %>% relocate(starts_with("char_"), .after = starts_with("ind_")) %>% - as_tibble() - -# Join rolling sales means for condo and single-family sales onto assessment -# data. Use the most recent rolling mean per neighborhood. -assessment_data_clean <- assessment_data_clean %>% + as_tibble() %>% + # Join rolling sales means for condo and single-family sales onto assessment + # data. Use the most recent rolling mean per neighborhood. There is no need to + # remove outliers from the right side of the join since they are empty and + # cannot influence forward-filling left_join( - all_sales_data_dt %>% + all_sales_data_rolling %>% group_by(meta_nbhd_code) %>% - arrange(desc(meta_sale_date)) %>% + arrange(desc(meta_sale_date), .by_group = TRUE) %>% fill(starts_with("time"), .direction = "up") %>% slice_head(n = 1) %>% ungroup() %>% From 2448188e164e3349e8a41d572ef02b0c95369ff9 Mon Sep 17 00:00:00 2001 From: Sweaty Handshake Date: Thu, 30 Jan 2025 16:27:53 +0000 Subject: [PATCH 17/17] Update data --- dvc.lock | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/dvc.lock b/dvc.lock index 006d593..4a30f78 100644 --- a/dvc.lock +++ b/dvc.lock @@ -5,8 +5,8 @@ stages: deps: - path: pipeline/00-ingest.R hash: md5 - md5: 5e5a7064ad082589edb27a08497ecc68 - size: 33538 + md5: b91e2ec22113406aae490edb36fbb7dd + size: 33642 params: params.yaml: assessment: @@ -31,12 +31,12 @@ stages: outs: - path: input/assessment_data.parquet hash: md5 - md5: 4e45e642eb57513bf2e9e8e5aa7062df - size: 78834247 + md5: a8e25e4fe7e62b85b63d5175d82603e8 + size: 79249396 - path: input/char_data.parquet hash: md5 - md5: 8d16027948929f5944f5db86b5b43b7f - size: 148499762 + md5: c413791724db9725659ef536c429602f + size: 149205649 - path: input/condo_strata_data.parquet hash: md5 md5: ded3ecde590af57e6b98a8935fae0215 @@ -47,8 +47,8 @@ stages: size: 6019 - path: input/training_data.parquet hash: md5 - md5: 936eea2a250d1222936eb9f0f81c9d84 - size: 100482987 + md5: 17ff61dc7cd2ecd272d46f9777b80080 + size: 100488708 train: cmd: Rscript pipeline/01-train.R deps: