From b7e0cc40148aad5fde772c819287620b78bcfcc5 Mon Sep 17 00:00:00 2001
From: Dan Snow <31494343+dfsnow@users.noreply.github.com>
Date: Thu, 15 Feb 2024 10:46:25 -0600
Subject: [PATCH] Update training data sales sample and outlier flags (#30)

* Re-add proration rate/pct ownership as predictor
* Drop underperforming features based on mean absolute SHAP value
* Re-add building mixed use indicator
* Add grouping variables to arrange()
* Update input data with fixed indicators
* Re-add meta_modeling_group as a predictor
* Drop nonlivable sales from training data
* Value nonlivable spaces with model + cap
* Value nonlivables as a function of per ass and total livable value
* Pass correct object name to aggregation
* Pass correct value name to aggregation
* Add min bound to condo preds
* Drop extra comma in pmax()
* Revert parameter changes
* Revert assessment stage changes
* Exclude nonlivable spaces from training, ratio study
* Improve commenting
* Revert rounding settings to match 2023
* Update condo value apportionment methodology

This commit changes the valuation methodology for non-livable units
(parking, storage, etc.). It pegs their value to their proration rate
multiplied by the total value of a building's livable units. This
obviates the need for a fixed non-livable unit value. An illustrative
sketch of this calculation is appended after the diff.

* Fix spacing
* Boot multi-sales from training and ratio study
* Revert removing multi-PIN sales
* Add num parcels as a predictor
* Improve multi-sale inclusion heuristic
* Mark non-livable sales as outliers
* Use aggregate pct ownership for training data
* Update input data with multi-PIN heuristics
* Keep only the sale value attributable to livable unit
* Update input data with adjusted multi-PIN sales
* Add sales val kludge for multi-PIN sales
* Revert comment
* Update params based on 2024-02-11-gracious-manasi
* Clarify comments and conditionals

---------

Co-authored-by: Sweaty Handshake
---
 dvc.lock             | 16 +++++------
 params.yaml          | 43 +++++++++++++-----------------
 pipeline/00-ingest.R | 63 ++++++++++++++++++++++++++++++++++++++++----
 3 files changed, 84 insertions(+), 38 deletions(-)

diff --git a/dvc.lock b/dvc.lock
index 513cdd6..1744472 100644
--- a/dvc.lock
+++ b/dvc.lock
@@ -26,24 +26,24 @@ stages:
     outs:
     - path: input/assessment_data.parquet
       hash: md5
-      md5: ef6aa583f2e378e799fcc60a40744e32
-      size: 69004288
+      md5: 0b9e5b923038497f3a607d6e87496516
+      size: 69704229
     - path: input/char_data.parquet
       hash: md5
-      md5: 092ab08fa7918734eee564476674476f
-      size: 130626680
+      md5: 1d6456f84fb0b40ad2ed5483f8fc4cbd
+      size: 131100439
     - path: input/condo_strata_data.parquet
       hash: md5
-      md5: 2a47b43a7ee9a91c1bb57f65336bbf35
-      size: 40715
+      md5: 0e3a26f7482aac6b194f0b95beb6031c
+      size: 40733
     - path: input/land_nbhd_rate_data.parquet
       hash: md5
       md5: e508daf5790982c303d6503fe1cb8e2b
       size: 4413
     - path: input/training_data.parquet
       hash: md5
-      md5: 6e5bf1036e73193637b33df03bf53365
-      size: 62157666
+      md5: 8b9014a569e3adae773330ee48e14670
+      size: 65839056
   train:
     cmd: Rscript pipeline/01-train.R
     deps:
diff --git a/params.yaml b/params.yaml
index d09f46e..01f14ee 100644
--- a/params.yaml
+++ b/params.yaml
@@ -146,6 +146,7 @@ model:
     all:
       - "meta_township_code"
      - "meta_nbhd_code"
+      - "meta_tieback_proration_rate"
      - "char_yrblt"
      - "char_land_sf"
      - "char_building_units"
@@ -159,10 +160,7 @@ model:
      - "loc_longitude"
      - "loc_latitude"
      - "loc_census_tract_geoid"
-      - "loc_env_flood_fema_sfha"
      - "loc_env_flood_fs_factor"
-      - "loc_env_flood_fs_risk_direction"
-      - "loc_env_airport_noise_dnl"
      - "loc_school_elementary_district_geoid"
      - "loc_school_secondary_district_geoid"
      - "loc_access_cmap_walk_nta_score"
@@ -172,8 +170,6 @@ model:
      - "prox_num_bus_stop_in_half_mile"
      - "prox_num_foreclosure_per_1000_pin_past_5_years"
      - "prox_num_school_in_half_mile"
-      - "prox_num_school_with_rating_in_half_mile"
-      - "prox_avg_school_rating_in_half_mile"
      - "prox_airport_dnl_total"
      - "prox_nearest_bike_trail_dist_ft"
      - "prox_nearest_cemetery_dist_ft"
@@ -194,7 +190,6 @@ model:
      - "acs5_percent_age_children"
      - "acs5_percent_age_senior"
      - "acs5_median_age_total"
-      - "acs5_percent_mobility_no_move"
      - "acs5_percent_mobility_moved_from_other_state"
      - "acs5_percent_household_family_married"
      - "acs5_percent_household_nonfamily_alone"
@@ -212,8 +207,6 @@ model:
      - "acs5_percent_household_total_occupied_w_sel_cond"
      - "acs5_percent_mobility_moved_in_county"
      - "other_tax_bill_rate"
-      - "other_school_district_elementary_avg_rating"
-      - "other_school_district_secondary_avg_rating"
      - "ccao_is_active_exe_homeowner"
      - "ccao_is_corner_lot"
      - "ccao_n_years_exe_homeowner"
@@ -294,27 +287,27 @@ model:
     # for setting this parameter is to discover a good fixed value using CV,
     # then manually set that value for non-CV runs (which don't use
     # early stopping by default)
-    num_iterations: 2500
-    learning_rate: 0.015
+    num_iterations: 2275
+    learning_rate: 0.011

     # Maximum number of bins for discretizing continuous features. Lower uses
     # less memory and speeds up training
-    max_bin: 512
+    max_bin: 225

     # See docs for details on each of the remaining parameters:
     # https://lightgbm.readthedocs.io/en/latest/Parameters.html
-    num_leaves: 159
-    add_to_linked_depth: 1
-    feature_fraction: 0.688
-    min_gain_to_split: 5.58
+    num_leaves: 200
+    add_to_linked_depth: 2
+    feature_fraction: 0.661
+    min_gain_to_split: 1.58
     min_data_in_leaf: 44
-    max_cat_threshold: 228
-    min_data_per_group: 160
-    cat_smooth: 54.52
-    cat_l2: 0.11
-    lambda_l1: 0.016
-    lambda_l2: 2.413
-    neighbors: 5
+    max_cat_threshold: 87
+    min_data_per_group: 200
+    cat_smooth: 140.85
+    cat_l2: 0.017
+    lambda_l1: 0.697
+    lambda_l2: 0.002
+    neighbors: 15

   # Range of possible hyperparameter values for tuning to explore
   range:
@@ -355,9 +348,9 @@ pv:

   # Rounding settings to apply to initial predictions. Rounding is done to
   # indicate to property owners that model values are estimates, not exact
-  round_break: [1000, 10000, 100000]
-  round_to_nearest: [1, 500, 5000, 10000]
-  round_type: "floor"
+  round_break: [1000, 10000]
+  round_to_nearest: [1, 10, 100]
+  round_type: "ceiling"


# Ratio Study ------------------------------------------------------------------
diff --git a/pipeline/00-ingest.R b/pipeline/00-ingest.R
index 7e602bf..867d8d2 100644
--- a/pipeline/00-ingest.R
+++ b/pipeline/00-ingest.R
@@ -43,6 +43,7 @@ training_data <- dbGetQuery(
     sale.deed_type AS meta_sale_deed_type,
     sale.seller_name AS meta_sale_seller_name,
     sale.buyer_name AS meta_sale_buyer_name,
+    sale.num_parcels_sale AS meta_sale_num_parcels,
     sale.sv_is_outlier,
     sale.sv_outlier_type,
     condo.*
@@ -205,21 +206,59 @@ message("Adding time features and cleaning")
 # parking spot, but only the sale for the unit, not the parking. Drop other
 # multi-unit sale types since we don't have a way to disaggregate each
 # unit's value
-training_data <- training_data %>%
+training_data_ms <- training_data %>%
   group_by(meta_sale_document_num) %>%
-  arrange(meta_tieback_proration_rate) %>%
+  arrange(meta_sale_document_num, meta_tieback_proration_rate) %>%
   mutate(
+    # Attach sale to the condo UNIT if one of the PINs in the sale is a garage
+    # and the unit % of ownership is greater than 3x the garage % of ownership.
+    # The sum() call here ensures that one (and only one) PIN of the multi-sale
+    # is a garage unit
     keep_unit_sale =
-      meta_tieback_proration_rate >= (lag(meta_tieback_proration_rate) * 3)
+      meta_tieback_proration_rate >= (lag(meta_tieback_proration_rate) * 3) &
+        sum(meta_cdu == "GR", na.rm = TRUE) == 1, # nolint
+    # If there are multiple PINs associated with a sale, take only the
+    # proportion of the sale value that is attributable to the main unit (based
+    # on percentage of ownership)
+    total_proration_rate = sum(meta_tieback_proration_rate, na.rm = TRUE),
+    meta_sale_price = as.numeric(meta_sale_price),
+    meta_sale_price = ifelse(
+      n() == 2 & keep_unit_sale,
+      meta_sale_price * (meta_tieback_proration_rate / total_proration_rate),
+      meta_sale_price
+    ),
+    meta_sale_price = round(meta_sale_price, 0)
   ) %>%
   filter(n() == 1 | (n() == 2 & keep_unit_sale)) %>%
   ungroup() %>%
   filter(!as.logical(as.numeric(ind_pin_is_multilline))) %>%
-  select(-keep_unit_sale)
+  select(-keep_unit_sale, -total_proration_rate)
+
+# Multi-sale outlier detection / sales validation kludge. The main sales
+# validation logic cannot yet handle multi-sale properties, but they're a
+# significant minority of the total sales sample. We can borrow some
+# conservative thresholds from the main sales validation output to identify
+# likely non-arms-length sales. ONLY APPLIES to multi-sale properties
+training_data_fil <- training_data_ms %>%
+  mutate(
+    sv_outlier_type = case_when(
+      meta_sale_price < 50000 & meta_sale_num_parcels == 2 ~
+        "Low price (multi)",
+      meta_sale_price > 1700000 & meta_sale_num_parcels == 2 ~
+        "High price (multi)",
+      TRUE ~ sv_outlier_type
+    ),
+    sv_is_outlier = ifelse(
+      (meta_sale_price < 50000 & meta_sale_num_parcels == 2) |
+        (meta_sale_price > 1700000 & meta_sale_num_parcels == 2),
+      TRUE,
+      sv_is_outlier
+    )
+  )

 # Clean up the training data. Goal is to get it into a publishable format.
 # Final featurization, missingness, etc. is handled via Tidymodels recipes
-training_data_clean <- training_data %>%
+training_data_clean <- training_data_fil %>%
   # Recode factor variables using the definitions stored in ccao::vars_dict
   # This will remove any categories not stored in the dictionary and convert
   # them to NA (useful since there are a lot of misrecorded variables)
@@ -231,6 +270,20 @@ training_data_clean <- training_data %>%
     any_of(col_type_dict$var_name),
     ~ recode_column_type(.x, cur_column())
   )) %>%
+  mutate(
+    # Treat sales for non-livable spaces as outliers. They are included for
+    # reference only
+    sv_is_outlier = ifelse(
+      meta_modeling_group == "NONLIVABLE",
+      TRUE,
+      sv_is_outlier
+    ),
+    sv_outlier_type = ifelse(
+      meta_modeling_group == "NONLIVABLE",
+      "Non-livable area",
+      sv_outlier_type
+    )
+  ) %>%
   # Only exclude explicit outliers from training. Sales with missing validation
   # outcomes will be considered non-outliers
   mutate(
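
The apportionment methodology described in the squash commit message pegs each
non-livable unit's value to its proration rate times the total value of the
building's livable units, but the calculation itself happens in a later
pipeline stage that is not part of this diff. The minimal dplyr sketch below
illustrates only the arithmetic; the column names (meta_pin10, pred_fmv,
bldg_livable_fmv) and the toy values are assumptions for illustration, not the
pipeline's actual schema:

library(dplyr)

# Toy building: two livable condo units and one parking (non-livable) PIN
assessment_data <- tibble::tribble(
  ~meta_pin10,  ~meta_modeling_group, ~meta_tieback_proration_rate, ~pred_fmv,
  "1234567890", "CONDO",              0.020,                        250000,
  "1234567890", "CONDO",              0.018,                        230000,
  "1234567890", "NONLIVABLE",         0.005,                        NA
)

apportioned <- assessment_data %>%
  group_by(meta_pin10) %>%
  mutate(
    # Total modeled value of the building's livable (CONDO) units
    bldg_livable_fmv = sum(pred_fmv[meta_modeling_group == "CONDO"], na.rm = TRUE),
    # Non-livable PINs get proration rate x total livable value; livable PINs
    # keep their modeled value
    pred_fmv = ifelse(
      meta_modeling_group == "NONLIVABLE",
      meta_tieback_proration_rate * bldg_livable_fmv,
      pred_fmv
    )
  ) %>%
  ungroup()

Under these toy numbers the parking PIN would be valued at
0.005 * (250000 + 230000) = 2400, with no fixed non-livable unit value needed.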
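
The pv changes in params.yaml swap the rounding scheme to round_break:
[1000, 10000], round_to_nearest: [1, 10, 100], and round_type: "ceiling"
("Revert rounding settings to match 2023"). One plausible reading of how those
three parameters combine, as a standalone sketch (the helper round_fmv() below
is hypothetical, not the pipeline's actual rounding function):

# Round predicted values up (ceiling) to an increment that depends on size:
# values below 1,000 to the nearest 1, values from 1,000 to 9,999 to the
# nearest 10, and values of 10,000 or more to the nearest 100
round_fmv <- function(x,
                      breaks = c(1000, 10000),
                      nearest = c(1, 10, 100),
                      type = "ceiling") {
  # findInterval() picks which rounding increment applies to each value
  inc <- nearest[findInterval(x, breaks) + 1]
  round_fun <- switch(type, ceiling = ceiling, floor = floor, round)
  round_fun(x / inc) * inc
}

round_fmv(c(850, 4321, 123456))
# Expected output: 850 4330 123500

Rounding to a coarse increment signals to property owners that the estimates
are approximate, which is the stated purpose of these settings in params.yaml.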
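
Finally, a toy two-PIN sale (values assumed for illustration) can help
reviewers trace the multi-PIN heuristic added to 00-ingest.R above:

library(dplyr)

# Hypothetical $300,000 deed covering a condo unit and its deeded parking PIN
toy_sale <- tibble::tibble(
  meta_sale_document_num      = "D0001",
  meta_cdu                    = c("GR", NA),     # "GR" marks the garage PIN
  meta_tieback_proration_rate = c(0.005, 0.025), # garage 0.5%, unit 2.5%
  meta_sale_price             = c(300000, 300000)
)

# Tracing the patch's grouped mutate()/filter() by hand:
# * arrange() orders the garage (0.005) before the unit (0.025)
# * for the unit row, 0.025 >= lag(0.005) * 3 and exactly one PIN has CDU "GR",
#   so keep_unit_sale is TRUE; the garage row's lag() is NA, so it is dropped
#   by filter(n() == 1 | (n() == 2 & keep_unit_sale))
# * the kept unit's sale price is prorated by ownership share:
#   300000 * (0.025 / 0.030) = 250000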