From b7e0cc40148aad5fde772c819287620b78bcfcc5 Mon Sep 17 00:00:00 2001
From: Dan Snow <31494343+dfsnow@users.noreply.github.com>
Date: Thu, 15 Feb 2024 10:46:25 -0600
Subject: [PATCH] Update training data sales sample and outlier flags (#30)

* Re-add proration rate/pct ownership as predictor
* Drop underperforming features based on mean absolute SHAP value
* Re-add building mixed use indicator
* Add grouping variables to arrange()
* Update input data with fixed indicators
* Re-add meta_modeling_group as a predictor
* Drop nonlivable sales from training data
* Value nonlivable spaces with model + cap
* Value nonlivables as a function of per ass and total livable value
* Pass correct object name to aggregation
* Pass correct value name to aggregation
* Add min bound to condo preds
* Drop extra comma in pmax()
* Revert parameter changes
* Revert assessment stage changes
* Exclude nonlivable spaces from training, ratio study
* Improve commenting
* Revert rounding settings to match 2023
* Update condo value apportionment methodology

This commit changes the valuation methodology for non-livable units
(parking, storage, etc.). It pegs their value to their proration rate
multiplied by the total value of a building's livable units. This
obviates the need for a fixed non-livable unit value. An illustrative
sketch of this calculation is appended after the diff.

* Fix spacing
* Boot multi-sales from training and ratio study
* Revert removing multi-PIN sales
* Add num parcels as a predictor
* Improve multi-sale inclusion heuristic
* Mark non-livable sales as outliers
* Use aggregate pct ownership for training data
* Update input data with multi-PIN heuristics
* Keep only the sale value attributable to livable unit
* Update input data with adjusted multi-PIN sales
* Add sales val kludge for multi-PIN sales
* Revert comment
* Update params based on 2024-02-11-gracious-manasi
* Clarify comments and conditionals

---------

Co-authored-by: Sweaty Handshake
---
 dvc.lock             | 16 +++++------
 params.yaml          | 43 +++++++++++++-----------------
 pipeline/00-ingest.R | 63 ++++++++++++++++++++++++++++++++++++++++----
 3 files changed, 84 insertions(+), 38 deletions(-)

diff --git a/dvc.lock b/dvc.lock
index 513cdd6..1744472 100644
--- a/dvc.lock
+++ b/dvc.lock
@@ -26,24 +26,24 @@ stages:
     outs:
     - path: input/assessment_data.parquet
       hash: md5
-      md5: ef6aa583f2e378e799fcc60a40744e32
-      size: 69004288
+      md5: 0b9e5b923038497f3a607d6e87496516
+      size: 69704229
     - path: input/char_data.parquet
       hash: md5
-      md5: 092ab08fa7918734eee564476674476f
-      size: 130626680
+      md5: 1d6456f84fb0b40ad2ed5483f8fc4cbd
+      size: 131100439
     - path: input/condo_strata_data.parquet
       hash: md5
-      md5: 2a47b43a7ee9a91c1bb57f65336bbf35
-      size: 40715
+      md5: 0e3a26f7482aac6b194f0b95beb6031c
+      size: 40733
     - path: input/land_nbhd_rate_data.parquet
       hash: md5
       md5: e508daf5790982c303d6503fe1cb8e2b
       size: 4413
     - path: input/training_data.parquet
       hash: md5
-      md5: 6e5bf1036e73193637b33df03bf53365
-      size: 62157666
+      md5: 8b9014a569e3adae773330ee48e14670
+      size: 65839056
   train:
     cmd: Rscript pipeline/01-train.R
     deps:
diff --git a/params.yaml b/params.yaml
index d09f46e..01f14ee 100644
--- a/params.yaml
+++ b/params.yaml
@@ -146,6 +146,7 @@ model:
     all:
       - "meta_township_code"
      - "meta_nbhd_code"
+      - "meta_tieback_proration_rate"
      - "char_yrblt"
      - "char_land_sf"
      - "char_building_units"
@@ -159,10 +160,7 @@ model:
      - "loc_longitude"
      - "loc_latitude"
      - "loc_census_tract_geoid"
-      - "loc_env_flood_fema_sfha"
      - "loc_env_flood_fs_factor"
-      - "loc_env_flood_fs_risk_direction"
-      - "loc_env_airport_noise_dnl"
      - "loc_school_elementary_district_geoid"
      - "loc_school_secondary_district_geoid"
      - "loc_access_cmap_walk_nta_score"
@@ -172,8 +170,6 @@ model:
      - "prox_num_bus_stop_in_half_mile"
      - "prox_num_foreclosure_per_1000_pin_past_5_years"
      - "prox_num_school_in_half_mile"
-      - "prox_num_school_with_rating_in_half_mile"
-      - "prox_avg_school_rating_in_half_mile"
      - "prox_airport_dnl_total"
      - "prox_nearest_bike_trail_dist_ft"
      - "prox_nearest_cemetery_dist_ft"
@@ -194,7 +190,6 @@ model:
      - "acs5_percent_age_children"
      - "acs5_percent_age_senior"
      - "acs5_median_age_total"
-      - "acs5_percent_mobility_no_move"
      - "acs5_percent_mobility_moved_from_other_state"
      - "acs5_percent_household_family_married"
      - "acs5_percent_household_nonfamily_alone"
@@ -212,8 +207,6 @@ model:
      - "acs5_percent_household_total_occupied_w_sel_cond"
      - "acs5_percent_mobility_moved_in_county"
      - "other_tax_bill_rate"
-      - "other_school_district_elementary_avg_rating"
-      - "other_school_district_secondary_avg_rating"
      - "ccao_is_active_exe_homeowner"
      - "ccao_is_corner_lot"
      - "ccao_n_years_exe_homeowner"
@@ -294,27 +287,27 @@ model:
     # for setting this parameter is to discover a good fixed value using CV,
     # then manually set that value for non-CV runs (which don't use
     # early stopping by default)
-    num_iterations: 2500
-    learning_rate: 0.015
+    num_iterations: 2275
+    learning_rate: 0.011

     # Maximum number of bins for discretizing continuous features. Lower uses
     # less memory and speeds up training
-    max_bin: 512
+    max_bin: 225

     # See docs for details on each of the remaining parameters:
     # https://lightgbm.readthedocs.io/en/latest/Parameters.html
-    num_leaves: 159
-    add_to_linked_depth: 1
-    feature_fraction: 0.688
-    min_gain_to_split: 5.58
+    num_leaves: 200
+    add_to_linked_depth: 2
+    feature_fraction: 0.661
+    min_gain_to_split: 1.58
     min_data_in_leaf: 44
-    max_cat_threshold: 228
-    min_data_per_group: 160
-    cat_smooth: 54.52
-    cat_l2: 0.11
-    lambda_l1: 0.016
-    lambda_l2: 2.413
-    neighbors: 5
+    max_cat_threshold: 87
+    min_data_per_group: 200
+    cat_smooth: 140.85
+    cat_l2: 0.017
+    lambda_l1: 0.697
+    lambda_l2: 0.002
+    neighbors: 15

   # Range of possible hyperparameter values for tuning to explore
   range:
@@ -355,9 +348,9 @@ pv:

   # Rounding settings to apply to initial predictions. Rounding is done to
   # indicate to property owners that model values are estimates, not exact
-  round_break: [1000, 10000, 100000]
-  round_to_nearest: [1, 500, 5000, 10000]
-  round_type: "floor"
+  round_break: [1000, 10000]
+  round_to_nearest: [1, 10, 100]
+  round_type: "ceiling"


# Ratio Study ------------------------------------------------------------------
diff --git a/pipeline/00-ingest.R b/pipeline/00-ingest.R
index 7e602bf..867d8d2 100644
--- a/pipeline/00-ingest.R
+++ b/pipeline/00-ingest.R
@@ -43,6 +43,7 @@ training_data <- dbGetQuery(
     sale.deed_type AS meta_sale_deed_type,
     sale.seller_name AS meta_sale_seller_name,
     sale.buyer_name AS meta_sale_buyer_name,
+    sale.num_parcels_sale AS meta_sale_num_parcels,
     sale.sv_is_outlier,
     sale.sv_outlier_type,
     condo.*
@@ -205,21 +206,59 @@ message("Adding time features and cleaning")
 # parking spot, but only the sale for the unit, not the parking. Drop other
 # multi-unit sale types since we don't have a way to disaggregate each
 # unit's value
-training_data <- training_data %>%
+training_data_ms <- training_data %>%
   group_by(meta_sale_document_num) %>%
-  arrange(meta_tieback_proration_rate) %>%
+  arrange(meta_sale_document_num, meta_tieback_proration_rate) %>%
   mutate(
+    # Attach sale to the condo UNIT if one of the PINs in the sale is a garage
+    # and the unit % of ownership is greater than 3x the garage % of ownership.
+    # The sum() call here ensures that one (and only one) PIN of the multi-sale
+    # is a garage unit
     keep_unit_sale =
-      meta_tieback_proration_rate >= (lag(meta_tieback_proration_rate) * 3)
+      meta_tieback_proration_rate >= (lag(meta_tieback_proration_rate) * 3) &
+        sum(meta_cdu == "GR", na.rm = TRUE) == 1, # nolint
+    # If there are multiple PINs associated with a sale, take only the
+    # proportion of the sale value that is attributable to the main unit (based
+    # on percentage of ownership)
+    total_proration_rate = sum(meta_tieback_proration_rate, na.rm = TRUE),
+    meta_sale_price = as.numeric(meta_sale_price),
+    meta_sale_price = ifelse(
+      n() == 2 & keep_unit_sale,
+      meta_sale_price * (meta_tieback_proration_rate / total_proration_rate),
+      meta_sale_price
+    ),
+    meta_sale_price = round(meta_sale_price, 0)
   ) %>%
   filter(n() == 1 | (n() == 2 & keep_unit_sale)) %>%
   ungroup() %>%
   filter(!as.logical(as.numeric(ind_pin_is_multilline))) %>%
-  select(-keep_unit_sale)
+  select(-keep_unit_sale, -total_proration_rate)
+
+# Multi-sale outlier detection / sales validation kludge. The main sales
+# validation logic cannot yet handle multi-sale properties, but they're a
+# significant minority of the total sales sample. We can borrow some
+# conservative thresholds from the main sales validation output to identify
+# likely non-arms-length sales. ONLY APPLIES to multi-sale properties
+training_data_fil <- training_data_ms %>%
+  mutate(
+    sv_outlier_type = case_when(
+      meta_sale_price < 50000 & meta_sale_num_parcels == 2 ~
+        "Low price (multi)",
+      meta_sale_price > 1700000 & meta_sale_num_parcels == 2 ~
+        "High price (multi)",
+      TRUE ~ sv_outlier_type
+    ),
+    sv_is_outlier = ifelse(
+      (meta_sale_price < 50000 & meta_sale_num_parcels == 2) |
+        (meta_sale_price > 1700000 & meta_sale_num_parcels == 2),
+      TRUE,
+      sv_is_outlier
+    )
+  )

 # Clean up the training data. Goal is to get it into a publishable format.
 # Final featurization, missingness, etc. is handled via Tidymodels recipes
-training_data_clean <- training_data %>%
+training_data_clean <- training_data_fil %>%
   # Recode factor variables using the definitions stored in ccao::vars_dict
   # This will remove any categories not stored in the dictionary and convert
   # them to NA (useful since there are a lot of misrecorded variables)
@@ -231,6 +270,20 @@ training_data_clean <- training_data %>%
     any_of(col_type_dict$var_name),
     ~ recode_column_type(.x, cur_column())
   )) %>%
+  mutate(
+    # Treat sales for non-livable spaces as outliers. They are included for
+    # reference only
+    sv_is_outlier = ifelse(
+      meta_modeling_group == "NONLIVABLE",
+      TRUE,
+      sv_is_outlier
+    ),
+    sv_outlier_type = ifelse(
+      meta_modeling_group == "NONLIVABLE",
+      "Non-livable area",
+      sv_outlier_type
+    )
+  ) %>%
   # Only exclude explicit outliers from training. Sales with missing validation
   # outcomes will be considered non-outliers
   mutate(
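
The apportionment methodology described in the squash commit message pegs each
non-livable unit's value to its proration rate times the total value of the
building's livable units, but the calculation itself happens in a later
pipeline stage that is not part of this diff. The minimal dplyr sketch below
illustrates only the arithmetic; the column names (meta_pin10, pred_fmv,
bldg_livable_fmv) and the toy values are assumptions for illustration, not the
pipeline's actual schema:

library(dplyr)

# Toy building: two livable condo units and one parking (non-livable) PIN
assessment_data <- tibble::tribble(
  ~meta_pin10,  ~meta_modeling_group, ~meta_tieback_proration_rate, ~pred_fmv,
  "1234567890", "CONDO",              0.020,                        250000,
  "1234567890", "CONDO",              0.018,                        230000,
  "1234567890", "NONLIVABLE",         0.005,                        NA
)

apportioned <- assessment_data %>%
  group_by(meta_pin10) %>%
  mutate(
    # Total modeled value of the building's livable (CONDO) units
    bldg_livable_fmv = sum(pred_fmv[meta_modeling_group == "CONDO"], na.rm = TRUE),
    # Non-livable PINs get proration rate x total livable value; livable PINs
    # keep their modeled value
    pred_fmv = ifelse(
      meta_modeling_group == "NONLIVABLE",
      meta_tieback_proration_rate * bldg_livable_fmv,
      pred_fmv
    )
  ) %>%
  ungroup()

Under these toy numbers the parking PIN would be valued at
0.005 * (250000 + 230000) = 2400, with no fixed non-livable unit value needed.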
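
The pv changes in params.yaml swap the rounding scheme to round_break:
[1000, 10000], round_to_nearest: [1, 10, 100], and round_type: "ceiling"
("Revert rounding settings to match 2023"). One plausible reading of how those
three parameters combine, as a standalone sketch (the helper round_fmv() below
is hypothetical, not the pipeline's actual rounding function):

# Round predicted values up (ceiling) to an increment that depends on size:
# values below 1,000 to the nearest 1, values from 1,000 to 9,999 to the
# nearest 10, and values of 10,000 or more to the nearest 100
round_fmv <- function(x,
                      breaks = c(1000, 10000),
                      nearest = c(1, 10, 100),
                      type = "ceiling") {
  # findInterval() picks which rounding increment applies to each value
  inc <- nearest[findInterval(x, breaks) + 1]
  round_fun <- switch(type, ceiling = ceiling, floor = floor, round)
  round_fun(x / inc) * inc
}

round_fmv(c(850, 4321, 123456))
# Expected output: 850 4330 123500

Rounding to a coarse increment signals to property owners that the estimates
are approximate, which is the stated purpose of these settings in params.yaml.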
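
Finally, a toy two-PIN sale (values assumed for illustration) can help
reviewers trace the multi-PIN heuristic added to 00-ingest.R above:

library(dplyr)

# Hypothetical $300,000 deed covering a condo unit and its deeded parking PIN
toy_sale <- tibble::tibble(
  meta_sale_document_num      = "D0001",
  meta_cdu                    = c("GR", NA),     # "GR" marks the garage PIN
  meta_tieback_proration_rate = c(0.005, 0.025), # garage 0.5%, unit 2.5%
  meta_sale_price             = c(300000, 300000)
)

# Tracing the patch's grouped mutate()/filter() by hand:
# * arrange() orders the garage (0.005) before the unit (0.025)
# * for the unit row, 0.025 >= lag(0.005) * 3 and exactly one PIN has CDU "GR",
#   so keep_unit_sale is TRUE; the garage row's lag() is NA, so it is dropped
#   by filter(n() == 1 | (n() == 2 & keep_unit_sale))
# * the kept unit's sale price is prorated by ownership share:
#   300000 * (0.025 / 0.030) = 250000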