diff --git a/.gitignore b/.gitignore
index 3a4727e..560567e 100644
--- a/.gitignore
+++ b/.gitignore
@@ -19,6 +19,7 @@ cache/
 *.rds
 *.zip
 *.csv
+!docs/data-dict.csv
 *.xlsx
 !condo_nonlivable_demo.xlsx
 *.xlsm
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 3d9242f..8bbd70f 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -27,3 +27,10 @@ repos:
         entry: Cannot commit .Rhistory, .RData, .Rds or .rds.
         language: fail
         files: '\.(Rhistory|RData|Rds|rds)$'
+      - id: check-data-dict
+        name: Data dictionary must be up to date with params file
+        entry: Rscript R/hooks/check-data-dict.R
+        files: (^|/)((params\.yaml)|(data-dict\.csv))$
+        language: r
+        additional_dependencies:
+          - yaml
diff --git a/R/hooks/check-data-dict.R b/R/hooks/check-data-dict.R
new file mode 100644
index 0000000..80e8aee
--- /dev/null
+++ b/R/hooks/check-data-dict.R
@@ -0,0 +1,54 @@
+#!/usr/bin/env Rscript
+# Script to check that the data dictionary file is up to date with the
+# latest feature set.
+#
+# Compares the predictors listed under model -> predictor -> all in the params
+# file against the variable_name column of the data dictionary CSV, and stops
+# with an error if a name is present in only one of the two. Both paths are
+# resolved relative to the working directory.
+library(yaml)
+
+params_filename <- "params.yaml"
+data_dict_filename <- "docs/data-dict.csv"
+
+params <- read_yaml(params_filename)
+data_dict <- read.csv(data_dict_filename)
+
+# Fail loudly when either side of the comparison is missing. Otherwise a NULL
+# would be treated as an empty set and produce a misleading "reknit" message
+# (or no error at all if both sides happened to be empty)
+predictors <- params$model$predictor$all
+if (is.null(predictors)) {
+  stop("Key model.predictor.all not found in ", params_filename)
+}
+if (!"variable_name" %in% names(data_dict)) {
+  stop("Column variable_name not found in ", data_dict_filename)
+}
+dict_vars <- data_dict[["variable_name"]]
+
+# Names that appear in exactly one of the two files
+symmetric_diff <- c(
+  setdiff(dict_vars, predictors),
+  setdiff(predictors, dict_vars)
+)
+symmetric_diff_len <- length(symmetric_diff)
+
+if (symmetric_diff_len > 0) {
+  # The condition is a single value, so plain if/else is used instead of the
+  # vectorized ifelse()
+  err_msg_prefix <- if (symmetric_diff_len == 1) "Param is" else "Params are"
+  err_msg <- paste0(
+    err_msg_prefix,
+    " not present in both ",
+    params_filename,
+    " and ",
+    data_dict_filename,
+    ": ",
+    paste(symmetric_diff, collapse = ", "),
+    ". ",
+    "Did you forget to reknit README.Rmd after updating ",
+    params_filename,
+    "?"
+  )
+  stop(err_msg)
+}
diff --git a/README.Rmd b/README.Rmd
index d8408e6..d491fac 100644
--- a/README.Rmd
+++ b/README.Rmd
@@ -60,13 +60,14 @@ We leverage these qualities to produce what we call ***strata***, a feature uniq
 
 ### Features Used
 
-Because our individual condo unit characteristics are sparse and incomplete, we primarily must rely on aggregate geospatial features, economic features, [strata](#condo-strata), and time of sale to determine condo assessed values. The features in the table below are the ones used in the 2024 assessment model.
+Because our individual condo unit characteristics are sparse and incomplete, we primarily must rely on aggregate geospatial features, economic features, [strata](#condo-strata), and time of sale to determine condo assessed values. The features in the table below are the ones used in the most recent assessment model.
 
 ```{r features_used, message=FALSE, echo=FALSE}
 library(dplyr)
 library(glue)
 library(jsonlite)
 library(purrr)
+library(readr)
 library(tidyr)
 library(yaml)
 
@@ -154,43 +155,62 @@ res_preds <- res_params$model$predictor$all
 
 condo_unique_preds <- setdiff(condo_preds$value, res_preds)
 
-condo_preds %>%
+condo_preds_fmt <- condo_preds %>%
   mutate(description = param_notes) %>%
   left_join(
     ccao::vars_dict,
     by = c("value" = "var_name_model")
   ) %>%
   distinct(
-    `Feature Name` = var_name_pretty,
-    Category = var_type,
-    Type = var_data_type,
-    Notes = description,
-    value
+    feature_name = var_name_pretty,
+    variable_name = value,
+    description,
+    category = var_type,
+    type = var_data_type
   ) %>%
   mutate(
-    Category = recode(
-      Category,
+    category = recode(
+      category,
       char = "Characteristic", acs5 = "ACS5", loc = "Location",
       prox = "Proximity", ind = "Indicator", time = "Time",
-      meta = "Meta", other = "Other", ccao = "Other"
+      meta = "Meta", other = "Other", ccao = "Other", shp = "Parcel Shape"
     ),
-    `Feature Name` = recode(
-      `Feature Name`,
+    feature_name = recode(
+      feature_name,
       "Tieback Proration Rate" = "Condominium % Ownership",
       "Year Built" = "Condominium Building Year Built"
+    ),
+    unique_to_condo_model = ifelse(
+      variable_name %in% condo_unique_preds |
+        feature_name %in%
+          c("Condominium Building Year Built", "Condominium % Ownership"),
+      TRUE, FALSE
     )
   ) %>%
-  mutate(`Unique to Condo Model` = ifelse(
-    value %in% condo_unique_preds |
-      `Feature Name` %in%
-        c("Condominium Building Year Built", "Condominium % Ownership"),
-    "X", ""
-  )) %>%
-  arrange(desc(`Unique to Condo Model`), Category) %>%
-  select(-value) %>%
+  arrange(desc(unique_to_condo_model), category)
+
+condo_preds_fmt %>%
+  write_csv("docs/data-dict.csv")
+
+condo_preds_fmt %>%
+  mutate(unique_to_condo_model = ifelse(unique_to_condo_model, "X", "")) %>%
+  rename(
+    "Feature Name" = "feature_name",
+    "Variable Name" = "variable_name",
+    "Description" = "description",
+    "Category" = "category",
+    "Type" = "type",
+    "Unique to Condo Model" = "unique_to_condo_model"
+  ) %>%
   knitr::kable(format = "markdown")
 ```
 
+We maintain a few useful resources for working with these features:
+
+- Once you've [pulled the input data](#getting-data), you can inner join the data to the CSV version of the data dictionary ([`docs/data-dict.csv`](./docs/data-dict.csv)) to filter for only the features that we use in the model.
+- You can browse our [data catalog](https://ccao-data.github.io/data-architecture/#!/overview) to see more details about these features, in particular the [condo model input view](https://ccao-data.github.io/data-architecture/#!/model/model.ccao_data_athena.model.vw_pin_condo_input) which is the source of our training data.
+- You can use the [`ccao` R package](https://ccao-data.github.io/ccao/) or its [Python equivalent](https://ccao-data.github.io/ccao/python/) to programmatically convert variable names to their human-readable versions ([`ccao::vars_rename()`](https://ccao-data.github.io/ccao/reference/vars_rename.html)) or convert numerically-encoded variables to human-readable values ([`ccao::vars_recode()`](https://ccao-data.github.io/ccao/reference/vars_recode.html)). The [`ccao::vars_dict` object](https://ccao-data.github.io/ccao/reference/vars_dict.html) is also useful for inspecting the raw crosswalk that powers the rename and recode functions. + ### Valuation For the most part, condos are valued the same way as single- and multi-family residential property. We [train a model](https://github.com/ccao-data/model-res-avm#how-it-works) using individual condo unit sales, predict the value of all units, and then apply any [post-modeling adjustment](https://github.com/ccao-data/model-res-avm#post-modeling). diff --git a/README.md b/README.md index 00ca073..b0c92d2 100644 --- a/README.md +++ b/README.md @@ -105,91 +105,117 @@ Because our individual condo unit characteristics are sparse and incomplete, we primarily must rely on aggregate geospatial features, economic features, [strata](#condo-strata), and time of sale to determine condo assessed values. The features in the table below are the -ones used in the 2024 assessment model. 
- -| Feature Name | Category | Type | Notes | Unique to Condo Model | -|:------------------------------------------------------------------------|:---------------|:----------|:------------------------------------------------------------------------------------------------------------------------------------------------------|:----------------------| -| Condominium Building Year Built | Characteristic | numeric | Year the property was constructed | X | -| Total Condominium Building Livable Parcels | Characteristic | numeric | Count of livable 14-digit PINs (AKA condo units) | X | -| Total Condominium Building Non-Livable Parcels | Characteristic | numeric | Count of non-livable 14-digit PINs | X | -| Condominium Building Is Mixed Use | Characteristic | logical | The 10-digit PIN (building) contains a 14-digit PIN that is neither class 299 nor 399 | X | -| Total Condominium Building Square Footage | Characteristic | numeric | Square footage of the *building* (PIN10) containing this unit | X | -| Building Square Footage | Characteristic | numeric | Square footage of the *building* (PIN10) containing this unit | X | -| Condominium Unit Square Footage | Characteristic | numeric | Square footage of the condominium unit associated with this PIN | X | -| Unit Square Footage | Characteristic | numeric | Square footage of the condominium unit associated with this PIN | X | -| Condominium Unit Bedrooms | Characteristic | numeric | Number of bedrooms in the building | X | -| Bedrooms | Characteristic | numeric | Number of bedrooms in the building | X | -| Condominium Unit Half Baths | Characteristic | numeric | Number of half baths | X | -| Half Baths | Characteristic | numeric | Number of half baths | X | -| Condominium Unit Full Baths | Characteristic | numeric | Number of full bathrooms | X | -| Full Baths | Characteristic | numeric | Number of full bathrooms | X | -| Condominium % Ownership | Meta | numeric | Proration rate applied to the PIN | X | -| Condominium Building 
Strata 1 | Meta | character | Condominium Building Strata - 10 Levels | X | -| Condominium Building Strata 2 | Meta | character | Condominium Building Strata - 100 Levels | X | -| Percent Population Age, Under 19 Years Old | ACS5 | numeric | Percent of the people 17 years or younger | | -| Percent Population Age, Over 65 Years Old | ACS5 | numeric | Percent of the people 65 years or older | | -| Median Population Age | ACS5 | numeric | Median age for whole population | | -| Percent Population Mobility, Moved From Other State in Past Year | ACS5 | numeric | Percent of people (older than 1 year) who moved from another state in the past 12 months | | -| Percent Households Family, Married | ACS5 | numeric | Percent of households that are family, married | | -| Percent Households Nonfamily, Living Alone | ACS5 | numeric | Percent of households that are non-family, alone (single) | | -| Percent Population Education, High School Degree | ACS5 | numeric | Percent of people older than 25 who attained a high school degree | | -| Percent Population Education, Bachelor Degree | ACS5 | numeric | Percent of people older than 25 who attained a bachelor’s degree | | -| Percent Population Education, Graduate Degree | ACS5 | numeric | Percent of people older than 25 who attained a graduate degree | | -| Percent Population Income, Below Poverty Level | ACS5 | numeric | Percent of people above the poverty level in the last 12 months | | -| Median Income, Household in Past Year | ACS5 | numeric | Median income per household in the past 12 months | | -| Median Income, Per Capita in Past Year | ACS5 | numeric | Median income per capita in the past 12 months | | -| Percent Population Income, Received SNAP in Past Year | ACS5 | numeric | Percent of households that received SNAP in the past 12 months | | -| Percent Population Employment, Unemployed | ACS5 | numeric | Percent of people 16 years and older unemployed | | -| Median Occupied Household, Total, Year Built | ACS5 | numeric | Median 
year built for all occupied households | | -| Median Occupied Household, Renter, Gross Rent | ACS5 | numeric | Median gross rent for only renter-occupied units | | -| Percent Occupied Households, Owner | ACS5 | numeric | Percent of households that are owner-occupied | | -| Percent Occupied Households, Total, One or More Selected Conditions | ACS5 | numeric | Percent of occupied households with selected conditions | | -| Percent Population Mobility, Moved From Within Same County in Past Year | ACS5 | numeric | Percent of people (older than 1 year) who moved in county in the past 12 months | | -| Land Square Feet | Characteristic | numeric | Square footage of the land (not just the building) of the property | | -| Longitude | Location | numeric | X coordinate in degrees (global longitude) | | -| Latitude | Location | numeric | Y coordinate in degrees (global latitude) | | -| Census Tract GEOID | Location | character | 11-digit ACS/Census tract GEOID | | -| First Street Factor | Location | numeric | First Street flood factor The flood factor is a risk score, where 10 is the highest risk and 1 is the lowest risk | | -| School Elementary District GEOID | Location | character | School district (elementary) GEOID | | -| School Secondary District GEOID | Location | character | School district (secondary) GEOID | | -| CMAP Walkability Score (No Transit) | Location | numeric | CMAP walkability score for a given PIN, excluding transit walkability | | -| CMAP Walkability Total Score | Location | numeric | CMAP walkability score for a given PIN, including transit walkability | | -| Municipality Name | Location | character | Taxing district name, as seen on Cook County tax bills | | -| Township Code | Meta | character | Cook County township code | | -| Neighborhood Code | Meta | character | Assessor neighborhood code | | -| Property Tax Bill Aggregate Rate | Other | numeric | Tax bill rate for the taxing district containing a given PIN | | -| Active Homeowner Exemption | Other | 
logical | Parcel has an active homeowner exemption | | -| Corner Lot | Other | logical | Corner lot indicator | | -| Number of Years Active Homeowner Exemption | Other | numeric | Number of years parcel has had an active homeowner exemption | | -| Number of PINs in Half Mile | Proximity | numeric | Number of PINs within half mile | | -| Number of Bus Stops in Half Mile | Proximity | numeric | Number of bus stops within half mile | | -| Number of Foreclosures Per 1000 PINs (Past 5 Years) | Proximity | numeric | Number of foreclosures per 1000 PINs, within half mile (past 5 years) | | -| Number of Schools in Half Mile | Proximity | numeric | Number of schools (any kind) within half mile | | -| Total Airport Noise DNL | Proximity | numeric | Estimated DNL for a PIN, assuming a baseline DNL of 50 (“quiet suburban”) and adding predicted noise from O’Hare and Midway airports to that baseline | | -| Nearest Bike Trail Distance (Feet) | Proximity | numeric | Nearest bike trail distance (feet) | | -| Nearest Cemetery Distance (Feet) | Proximity | numeric | Nearest cemetery distance (feet) | | -| Nearest CTA Route Distance (Feet) | Proximity | numeric | Nearest CTA route distance (feet) | | -| Nearest CTA Stop Distance (Feet) | Proximity | numeric | Nearest CTA stop distance (feet) | | -| Nearest Hospital Distance (Feet) | Proximity | numeric | Nearest hospital distance (feet) | | -| Lake Michigan Distance (Feet) | Proximity | numeric | Distance to Lake Michigan shoreline (feet) | | -| Nearest Major Road Distance (Feet) | Proximity | numeric | Nearest major road distance (feet) | | -| Nearest Metra Route Distance (Feet) | Proximity | numeric | Nearest Metra route distance (feet) | | -| Nearest Metra Stop Distance (Feet) | Proximity | numeric | Nearest Metra stop distance (feet) | | -| Nearest Park Distance (Feet) | Proximity | numeric | Nearest park distance (feet) | | -| Nearest Railroad Distance (Feet) | Proximity | numeric | Nearest railroad distance (feet) | | -| Nearest 
Secondary Road Distance (Feet) | Proximity | numeric | Nearest secondary road distance (feet) | | -| Nearest University Distance (Feet) | Proximity | numeric | Nearest university distance (feet) | | -| Nearest Vacant Land Parcel Distance (Feet) | Proximity | numeric | Nearest vacant land (class 100) parcel distance (feet) | | -| Nearest Water Distance (Feet) | Proximity | numeric | Nearest water distance (feet) | | -| Nearest Golf Course Distance (Feet) | Proximity | numeric | Nearest golf course distance (feet) | | -| Sale Year | Time | numeric | Sale year calculated as the number of years since 0 B.C.E | | -| Sale Day | Time | numeric | Sale day calculated as the number of days since January 1st, 1997 | | -| Sale Quarter of Year | Time | character | Character encoding of quarter of year (Q1 - Q4) | | -| Sale Month of Year | Time | character | Character encoding of month of year (Jan - Dec) | | -| Sale Day of Year | Time | numeric | Numeric encoding of day of year (1 - 365) | | -| Sale Day of Month | Time | numeric | Numeric encoding of day of month (1 - 31) | | -| Sale Day of Week | Time | numeric | Numeric encoding of day of week (1 - 7) | | -| Sale After COVID-19 | Time | logical | Indicator for whether sale occurred after COVID-19 was widely publicized (around March 15, 2020) | | +ones used in the most recent assessment model. 
+ +| Feature Name | Variable Name | Description | Category | Type | Unique to Condo Model | +|:----------------------------------------------------------------------------|:------------------------------------------------------|:------------------------------------------------------------------------------------------------------------------------------------------------------|:---------------|:----------|:----------------------| +| Condominium Building Year Built | char_yrblt | Year the property was constructed | Characteristic | numeric | X | +| Total Condominium Building Livable Parcels | char_building_units | Count of livable 14-digit PINs (AKA condo units) | Characteristic | numeric | X | +| Total Condominium Building Non-Livable Parcels | char_building_non_units | Count of non-livable 14-digit PINs | Characteristic | numeric | X | +| Condominium Building Is Mixed Use | char_bldg_is_mixed_use | The 10-digit PIN (building) contains a 14-digit PIN that is neither class 299 nor 399 | Characteristic | logical | X | +| Total Condominium Building Square Footage | char_building_sf | Square footage of the *building* (PIN10) containing this unit | Characteristic | numeric | X | +| Condominium Unit Square Footage | char_unit_sf | Square footage of the condominium unit associated with this PIN | Characteristic | numeric | X | +| Condominium Unit Bedrooms | char_bedrooms | Number of bedrooms in the building | Characteristic | numeric | X | +| Condominium Unit Half Baths | char_half_baths | Number of half baths | Characteristic | numeric | X | +| Condominium Unit Full Baths | char_full_baths | Number of full bathrooms | Characteristic | numeric | X | +| Condominium % Ownership | meta_tieback_proration_rate | Proration rate applied to the PIN | Meta | numeric | X | +| Condominium Building Strata 1 | meta_strata_1 | Condominium Building Strata - 10 Levels | Meta | character | X | +| Condominium Building Strata 2 | meta_strata_2 | Condominium Building Strata - 100 Levels | 
Meta | character | X | +| Standard Deviation Distance From Parcel Centroid to Vertices (Feet) | shp_parcel_centroid_dist_ft_sd | Standard deviation of the distance from each major parcel vertex to the parcel centroid | Parcel Shape | numeric | X | +| Standard Deviation Parcel Edge Length (Feet) | shp_parcel_edge_len_ft_sd | Standard deviation of the edge length between parcel vertices | Parcel Shape | numeric | X | +| Standard Deviation Parcel Interior Angle (Degrees) | shp_parcel_interior_angle_sd | Standard deviation of the interior angles of the parcel polygon | Parcel Shape | numeric | X | +| Ratio of Parcel Area to Minimum Rotated Bounding Rectangle | shp_parcel_mrr_area_ratio | Ratio of the parcel’s area to the area of its minimum rotated bounding rectangle | Parcel Shape | numeric | X | +| Ratio of Parcel Minimum Rotated Bounding Rectangle Longest to Shortest Side | shp_parcel_mrr_side_ratio | Ratio of the longest to the shortest side of the parcel’s minimum rotated bounding rectangle | Parcel Shape | numeric | X | +| Number of Parcel Vertices | shp_parcel_num_vertices | The number of vertices of the parcel | Parcel Shape | numeric | X | +| Nearest Highway Distance (Feet) | prox_nearest_road_highway_dist_ft | Distance to nearest highway road | Proximity | numeric | X | +| Nearest Arterial Road Distance (Feet) | prox_nearest_road_arterial_dist_ft | Distance to nearest arterial road | Proximity | numeric | X | +| Nearest Collector Road Distance (Feet) | prox_nearest_road_collector_dist_ft | Distance to nearest collector road | Proximity | numeric | X | +| Average Daily Traffic Count on Nearest Highway | prox_nearest_road_highway_daily_traffic | Daily traffic of nearest highway road | Proximity | numeric | X | +| Average Daily Traffic Count on Nearest Arterial Road | prox_nearest_road_arterial_daily_traffic | Daily traffic of nearest arterial road | Proximity | numeric | X | +| Average Daily Traffic Count on Nearest Collector Road | 
prox_nearest_road_collector_daily_traffic | Daily traffic of nearest collector road | Proximity | numeric | X | +| Nearest New Construction (Feet) | prox_nearest_new_construction_dist_ft | Nearest new construction distance (feet) | Proximity | numeric | X | +| Nearest Major Stadium (Feet) | prox_nearest_stadium_dist_ft | Nearest stadium distance (feet) | Proximity | numeric | X | +| Percent Population Age, Under 19 Years Old | acs5_percent_age_children | Percent of the people 17 years or younger | ACS5 | numeric | | +| Percent Population Age, Over 65 Years Old | acs5_percent_age_senior | Percent of the people 65 years or older | ACS5 | numeric | | +| Median Population Age | acs5_median_age_total | Median age for whole population | ACS5 | numeric | | +| Percent Households Family, Married | acs5_percent_household_family_married | Percent of households that are family, married | ACS5 | numeric | | +| Percent Households Nonfamily, Living Alone | acs5_percent_household_nonfamily_alone | Percent of households that are non-family, alone (single) | ACS5 | numeric | | +| Percent Population Education, High School Degree | acs5_percent_education_high_school | Percent of people older than 25 who attained a high school degree | ACS5 | numeric | | +| Percent Population Education, Bachelor Degree | acs5_percent_education_bachelor | Percent of people older than 25 who attained a bachelor’s degree | ACS5 | numeric | | +| Percent Population Education, Graduate Degree | acs5_percent_education_graduate | Percent of people older than 25 who attained a graduate degree | ACS5 | numeric | | +| Percent Population Income, Below Poverty Level | acs5_percent_income_below_poverty_level | Percent of people above the poverty level in the last 12 months | ACS5 | numeric | | +| Median Income, Household in Past Year | acs5_median_income_household_past_year | Median income per household in the past 12 months | ACS5 | numeric | | +| Median Income, Per Capita in Past Year | 
acs5_median_income_per_capita_past_year | Median income per capita in the past 12 months | ACS5 | numeric | | +| Percent Population Income, Received SNAP in Past Year | acs5_percent_income_household_received_snap_past_year | Percent of households that received SNAP in the past 12 months | ACS5 | numeric | | +| Percent Population Employment, Unemployed | acs5_percent_employment_unemployed | Percent of people 16 years and older unemployed | ACS5 | numeric | | +| Median Occupied Household, Total, Year Built | acs5_median_household_total_occupied_year_built | Median year built for all occupied households | ACS5 | numeric | | +| Median Occupied Household, Renter, Gross Rent | acs5_median_household_renter_occupied_gross_rent | Median gross rent for only renter-occupied units | ACS5 | numeric | | +| Percent Occupied Households, Owner | acs5_percent_household_owner_occupied | Percent of households that are owner-occupied | ACS5 | numeric | | +| Land Square Feet | char_land_sf | Square footage of the land (not just the building) of the property | Characteristic | numeric | | +| Longitude | loc_longitude | X coordinate in degrees (global longitude) | Location | numeric | | +| Latitude | loc_latitude | Y coordinate in degrees (global latitude) | Location | numeric | | +| Census Tract GEOID | loc_census_tract_geoid | 11-digit ACS/Census tract GEOID | Location | character | | +| First Street Factor | loc_env_flood_fs_factor | First Street flood factor The flood factor is a risk score, where 10 is the highest risk and 1 is the lowest risk | Location | numeric | | +| School Elementary District GEOID | loc_school_elementary_district_geoid | School district (elementary) GEOID | Location | character | | +| School Secondary District GEOID | loc_school_secondary_district_geoid | School district (secondary) GEOID | Location | character | | +| CMAP Walkability Score (No Transit) | loc_access_cmap_walk_nta_score | CMAP walkability score for a given PIN, excluding transit walkability | 
Location | numeric | | +| CMAP Walkability Total Score | loc_access_cmap_walk_total_score | CMAP walkability score for a given PIN, including transit walkability | Location | numeric | | +| Municipality Name | loc_tax_municipality_name | Taxing district name, as seen on Cook County tax bills | Location | character | | +| Township Code | meta_township_code | Cook County township code | Meta | character | | +| Neighborhood Code | meta_nbhd_code | Assessor neighborhood code | Meta | character | | +| Property Tax Bill Aggregate Rate | other_tax_bill_rate | Tax bill rate for the taxing district containing a given PIN | Other | numeric | | +| Active Homeowner Exemption | ccao_is_active_exe_homeowner | Parcel has an active homeowner exemption | Other | logical | | +| Number of Years Active Homeowner Exemption | ccao_n_years_exe_homeowner | Number of years parcel has had an active homeowner exemption | Other | numeric | | +| Number of PINs in Half Mile | prox_num_pin_in_half_mile | Number of PINs within half mile | Proximity | numeric | | +| Number of Bus Stops in Half Mile | prox_num_bus_stop_in_half_mile | Number of bus stops within half mile | Proximity | numeric | | +| Number of Foreclosures Per 1000 PINs (Past 5 Years) | prox_num_foreclosure_per_1000_pin_past_5_years | Number of foreclosures per 1000 PINs, within half mile (past 5 years) | Proximity | numeric | | +| Total Airport Noise DNL | prox_airport_dnl_total | Estimated DNL for a PIN, assuming a baseline DNL of 50 (“quiet suburban”) and adding predicted noise from O’Hare and Midway airports to that baseline | Proximity | numeric | | +| Nearest Bike Trail Distance (Feet) | prox_nearest_bike_trail_dist_ft | Nearest bike trail distance (feet) | Proximity | numeric | | +| Nearest Cemetery Distance (Feet) | prox_nearest_cemetery_dist_ft | Nearest cemetery distance (feet) | Proximity | numeric | | +| Nearest CTA Route Distance (Feet) | prox_nearest_cta_route_dist_ft | Nearest CTA route distance (feet) | Proximity | 
numeric | | +| Nearest CTA Stop Distance (Feet) | prox_nearest_cta_stop_dist_ft | Nearest CTA stop distance (feet) | Proximity | numeric | | +| Nearest Hospital Distance (Feet) | prox_nearest_hospital_dist_ft | Nearest hospital distance (feet) | Proximity | numeric | | +| Lake Michigan Distance (Feet) | prox_lake_michigan_dist_ft | Distance to Lake Michigan shoreline (feet) | Proximity | numeric | | +| Nearest Metra Route Distance (Feet) | prox_nearest_metra_route_dist_ft | Nearest Metra route distance (feet) | Proximity | numeric | | +| Nearest Metra Stop Distance (Feet) | prox_nearest_metra_stop_dist_ft | Nearest Metra stop distance (feet) | Proximity | numeric | | +| Nearest Park Distance (Feet) | prox_nearest_park_dist_ft | Nearest park distance (feet) | Proximity | numeric | | +| Nearest Railroad Distance (Feet) | prox_nearest_railroad_dist_ft | Nearest railroad distance (feet) | Proximity | numeric | | +| Nearest University Distance (Feet) | prox_nearest_university_dist_ft | Nearest university distance (feet) | Proximity | numeric | | +| Nearest Vacant Land Parcel Distance (Feet) | prox_nearest_vacant_land_dist_ft | Nearest vacant land (class 100) parcel distance (feet) | Proximity | numeric | | +| Nearest Water Distance (Feet) | prox_nearest_water_dist_ft | Nearest water distance (feet) | Proximity | numeric | | +| Nearest Golf Course Distance (Feet) | prox_nearest_golf_course_dist_ft | Nearest golf course distance (feet) | Proximity | numeric | | +| Sale Year | time_sale_year | Sale year calculated as the number of years since 0 B.C.E | Time | numeric | | +| Sale Day | time_sale_day | Sale day calculated as the number of days since January 1st, 1997 | Time | numeric | | +| Sale Quarter of Year | time_sale_quarter_of_year | Character encoding of quarter of year (Q1 - Q4) | Time | character | | +| Sale Month of Year | time_sale_month_of_year | Character encoding of month of year (Jan - Dec) | Time | character | | +| Sale Day of Year | time_sale_day_of_year | 
Numeric encoding of day of year (1 - 365) | Time | numeric | | +| Sale Day of Month | time_sale_day_of_month | Numeric encoding of day of month (1 - 31) | Time | numeric | | +| Sale Day of Week | time_sale_day_of_week | Numeric encoding of day of week (1 - 7) | Time | numeric | | +| Sale After COVID-19 | time_sale_post_covid | Indicator for whether sale occurred after COVID-19 was widely publicized (around March 15, 2020) | Time | logical | | + +We maintain a few useful resources for working with these features: + +- Once you’ve [pulled the input data](#getting-data), you can inner join + the data to the CSV version of the data dictionary + ([`docs/data-dict.csv`](./docs/data-dict.csv)) to filter for only the + features that we use in the model. +- You can browse our [data + catalog](https://ccao-data.github.io/data-architecture/#!/overview) to + see more details about these features, in particular the [condo model + input + view](https://ccao-data.github.io/data-architecture/#!/model/model.ccao_data_athena.model.vw_pin_condo_input) + which is the source of our training data. +- You can use the [`ccao` R package](https://ccao-data.github.io/ccao/) + or its [Python equivalent](https://ccao-data.github.io/ccao/python/) + to programmatically convert variable names to their human-readable + versions + ([`ccao::vars_rename()`](https://ccao-data.github.io/ccao/reference/vars_rename.html)) + or convert numerically-encoded variables to human-readable values + ([`ccao::vars_recode()`](https://ccao-data.github.io/ccao/reference/vars_recode.html)). + The [`ccao::vars_dict` + object](https://ccao-data.github.io/ccao/reference/vars_dict.html) is + also useful for inspecting the raw crosswalk that powers the rename + and recode functions. 
### Valuation diff --git a/docs/data-dict.csv b/docs/data-dict.csv new file mode 100644 index 0000000..fd8cbac --- /dev/null +++ b/docs/data-dict.csv @@ -0,0 +1,84 @@ +feature_name,variable_name,description,category,type,unique_to_condo_model +Condominium Building Year Built,char_yrblt,Year the property was constructed,Characteristic,numeric,TRUE +Total Condominium Building Livable Parcels,char_building_units,Count of livable 14-digit PINs (AKA condo units),Characteristic,numeric,TRUE +Total Condominium Building Non-Livable Parcels,char_building_non_units,Count of non-livable 14-digit PINs,Characteristic,numeric,TRUE +Condominium Building Is Mixed Use,char_bldg_is_mixed_use,The 10-digit PIN (building) contains a 14-digit PIN that is neither class 299 nor 399,Characteristic,logical,TRUE +Total Condominium Building Square Footage,char_building_sf,Square footage of the _building_ (PIN10) containing this unit,Characteristic,numeric,TRUE +Condominium Unit Square Footage,char_unit_sf,Square footage of the condominium unit associated with this PIN,Characteristic,numeric,TRUE +Condominium Unit Bedrooms,char_bedrooms,Number of bedrooms in the building,Characteristic,numeric,TRUE +Condominium Unit Half Baths,char_half_baths,Number of half baths,Characteristic,numeric,TRUE +Condominium Unit Full Baths,char_full_baths,Number of full bathrooms,Characteristic,numeric,TRUE +Condominium % Ownership,meta_tieback_proration_rate,Proration rate applied to the PIN,Meta,numeric,TRUE +Condominium Building Strata 1,meta_strata_1,Condominium Building Strata - 10 Levels,Meta,character,TRUE +Condominium Building Strata 2,meta_strata_2,Condominium Building Strata - 100 Levels,Meta,character,TRUE +Standard Deviation Distance From Parcel Centroid to Vertices (Feet),shp_parcel_centroid_dist_ft_sd,Standard deviation of the distance from each major parcel vertex to the parcel centroid,Parcel Shape,numeric,TRUE +Standard Deviation Parcel Edge Length (Feet),shp_parcel_edge_len_ft_sd,Standard 
deviation of the edge length between parcel vertices,Parcel Shape,numeric,TRUE +Standard Deviation Parcel Interior Angle (Degrees),shp_parcel_interior_angle_sd,Standard deviation of the interior angles of the parcel polygon,Parcel Shape,numeric,TRUE +Ratio of Parcel Area to Minimum Rotated Bounding Rectangle,shp_parcel_mrr_area_ratio,Ratio of the parcel's area to the area of its minimum rotated bounding rectangle,Parcel Shape,numeric,TRUE +Ratio of Parcel Minimum Rotated Bounding Rectangle Longest to Shortest Side,shp_parcel_mrr_side_ratio,Ratio of the longest to the shortest side of the parcel's minimum rotated bounding rectangle,Parcel Shape,numeric,TRUE +Number of Parcel Vertices,shp_parcel_num_vertices,The number of vertices of the parcel,Parcel Shape,numeric,TRUE +Nearest Highway Distance (Feet),prox_nearest_road_highway_dist_ft,Distance to nearest highway road,Proximity,numeric,TRUE +Nearest Arterial Road Distance (Feet),prox_nearest_road_arterial_dist_ft,Distance to nearest arterial road,Proximity,numeric,TRUE +Nearest Collector Road Distance (Feet),prox_nearest_road_collector_dist_ft,Distance to nearest collector road,Proximity,numeric,TRUE +Average Daily Traffic Count on Nearest Highway,prox_nearest_road_highway_daily_traffic,Daily traffic of nearest highway road,Proximity,numeric,TRUE +Average Daily Traffic Count on Nearest Arterial Road,prox_nearest_road_arterial_daily_traffic,Daily traffic of nearest arterial road,Proximity,numeric,TRUE +Average Daily Traffic Count on Nearest Collector Road,prox_nearest_road_collector_daily_traffic,Daily traffic of nearest collector road,Proximity,numeric,TRUE +Nearest New Construction (Feet),prox_nearest_new_construction_dist_ft,Nearest new construction distance (feet),Proximity,numeric,TRUE +Nearest Major Stadium (Feet),prox_nearest_stadium_dist_ft,Nearest stadium distance (feet),Proximity,numeric,TRUE +"Percent Population Age, Under 19 Years Old",acs5_percent_age_children,Percent of the people 17 years or 
younger,ACS5,numeric,FALSE +"Percent Population Age, Over 65 Years Old",acs5_percent_age_senior,Percent of the people 65 years or older,ACS5,numeric,FALSE +Median Population Age,acs5_median_age_total,Median age for whole population,ACS5,numeric,FALSE +"Percent Households Family, Married",acs5_percent_household_family_married,"Percent of households that are family, married",ACS5,numeric,FALSE +"Percent Households Nonfamily, Living Alone",acs5_percent_household_nonfamily_alone,"Percent of households that are non-family, alone (single)",ACS5,numeric,FALSE +"Percent Population Education, High School Degree",acs5_percent_education_high_school,Percent of people older than 25 who attained a high school degree,ACS5,numeric,FALSE +"Percent Population Education, Bachelor Degree",acs5_percent_education_bachelor,Percent of people older than 25 who attained a bachelor's degree,ACS5,numeric,FALSE +"Percent Population Education, Graduate Degree",acs5_percent_education_graduate,Percent of people older than 25 who attained a graduate degree,ACS5,numeric,FALSE +"Percent Population Income, Below Poverty Level",acs5_percent_income_below_poverty_level,Percent of people below the poverty level in the last 12 months,ACS5,numeric,FALSE +"Median Income, Household in Past Year",acs5_median_income_household_past_year,Median income per household in the past 12 months,ACS5,numeric,FALSE +"Median Income, Per Capita in Past Year",acs5_median_income_per_capita_past_year,Median income per capita in the past 12 months,ACS5,numeric,FALSE +"Percent Population Income, Received SNAP in Past Year",acs5_percent_income_household_received_snap_past_year,Percent of households that received SNAP in the past 12 months,ACS5,numeric,FALSE +"Percent Population Employment, Unemployed",acs5_percent_employment_unemployed,Percent of people 16 years and older unemployed,ACS5,numeric,FALSE +"Median Occupied Household, Total, Year Built",acs5_median_household_total_occupied_year_built,Median year built for all occupied
households,ACS5,numeric,FALSE +"Median Occupied Household, Renter, Gross Rent",acs5_median_household_renter_occupied_gross_rent,Median gross rent for only renter-occupied units,ACS5,numeric,FALSE +"Percent Occupied Households, Owner",acs5_percent_household_owner_occupied,Percent of households that are owner-occupied,ACS5,numeric,FALSE +Land Square Feet,char_land_sf,Square footage of the land (not just the building) of the property,Characteristic,numeric,FALSE +Longitude,loc_longitude,X coordinate in degrees (global longitude),Location,numeric,FALSE +Latitude,loc_latitude,Y coordinate in degrees (global latitude),Location,numeric,FALSE +Census Tract GEOID,loc_census_tract_geoid,11-digit ACS/Census tract GEOID,Location,character,FALSE +First Street Factor,loc_env_flood_fs_factor,"First Street flood factor. The flood factor is a risk score, where 10 is the highest risk and 1 is the lowest risk",Location,numeric,FALSE +School Elementary District GEOID,loc_school_elementary_district_geoid,School district (elementary) GEOID,Location,character,FALSE +School Secondary District GEOID,loc_school_secondary_district_geoid,School district (secondary) GEOID,Location,character,FALSE +CMAP Walkability Score (No Transit),loc_access_cmap_walk_nta_score,"CMAP walkability score for a given PIN, excluding transit walkability",Location,numeric,FALSE +CMAP Walkability Total Score,loc_access_cmap_walk_total_score,"CMAP walkability score for a given PIN, including transit walkability",Location,numeric,FALSE +Municipality Name,loc_tax_municipality_name,"Taxing district name, as seen on Cook County tax bills",Location,character,FALSE +Township Code,meta_township_code,Cook County township code,Meta,character,FALSE +Neighborhood Code,meta_nbhd_code,Assessor neighborhood code,Meta,character,FALSE +Property Tax Bill Aggregate Rate,other_tax_bill_rate,Tax bill rate for the taxing district containing a given PIN,Other,numeric,FALSE +Active Homeowner Exemption,ccao_is_active_exe_homeowner,Parcel has
an active homeowner exemption,Other,logical,FALSE +Number of Years Active Homeowner Exemption,ccao_n_years_exe_homeowner,Number of years parcel has had an active homeowner exemption,Other,numeric,FALSE +Number of PINs in Half Mile,prox_num_pin_in_half_mile,Number of PINs within half mile,Proximity,numeric,FALSE +Number of Bus Stops in Half Mile,prox_num_bus_stop_in_half_mile,Number of bus stops within half mile,Proximity,numeric,FALSE +Number of Foreclosures Per 1000 PINs (Past 5 Years),prox_num_foreclosure_per_1000_pin_past_5_years,"Number of foreclosures per 1000 PINs, within half mile (past 5 years)",Proximity,numeric,FALSE +Total Airport Noise DNL,prox_airport_dnl_total,"Estimated DNL for a PIN, assuming a baseline DNL of 50 (""quiet suburban"") and adding predicted noise from O'Hare and Midway airports to that baseline",Proximity,numeric,FALSE +Nearest Bike Trail Distance (Feet),prox_nearest_bike_trail_dist_ft,Nearest bike trail distance (feet),Proximity,numeric,FALSE +Nearest Cemetery Distance (Feet),prox_nearest_cemetery_dist_ft,Nearest cemetery distance (feet),Proximity,numeric,FALSE +Nearest CTA Route Distance (Feet),prox_nearest_cta_route_dist_ft,Nearest CTA route distance (feet),Proximity,numeric,FALSE +Nearest CTA Stop Distance (Feet),prox_nearest_cta_stop_dist_ft,Nearest CTA stop distance (feet),Proximity,numeric,FALSE +Nearest Hospital Distance (Feet),prox_nearest_hospital_dist_ft,Nearest hospital distance (feet),Proximity,numeric,FALSE +Lake Michigan Distance (Feet),prox_lake_michigan_dist_ft,Distance to Lake Michigan shoreline (feet),Proximity,numeric,FALSE +Nearest Metra Route Distance (Feet),prox_nearest_metra_route_dist_ft,Nearest Metra route distance (feet),Proximity,numeric,FALSE +Nearest Metra Stop Distance (Feet),prox_nearest_metra_stop_dist_ft,Nearest Metra stop distance (feet),Proximity,numeric,FALSE +Nearest Park Distance (Feet),prox_nearest_park_dist_ft,Nearest park distance (feet),Proximity,numeric,FALSE +Nearest Railroad Distance 
(Feet),prox_nearest_railroad_dist_ft,Nearest railroad distance (feet),Proximity,numeric,FALSE +Nearest University Distance (Feet),prox_nearest_university_dist_ft,Nearest university distance (feet),Proximity,numeric,FALSE +Nearest Vacant Land Parcel Distance (Feet),prox_nearest_vacant_land_dist_ft,Nearest vacant land (class 100) parcel distance (feet),Proximity,numeric,FALSE +Nearest Water Distance (Feet),prox_nearest_water_dist_ft,Nearest water distance (feet),Proximity,numeric,FALSE +Nearest Golf Course Distance (Feet),prox_nearest_golf_course_dist_ft,Nearest golf course distance (feet),Proximity,numeric,FALSE +Sale Year,time_sale_year,Sale year calculated as the number of years since 0 B.C.E,Time,numeric,FALSE +Sale Day,time_sale_day,"Sale day calculated as the number of days since January 1st, 1997",Time,numeric,FALSE +Sale Quarter of Year,time_sale_quarter_of_year,Character encoding of quarter of year (Q1 - Q4),Time,character,FALSE +Sale Month of Year,time_sale_month_of_year,Character encoding of month of year (Jan - Dec),Time,character,FALSE +Sale Day of Year,time_sale_day_of_year,Numeric encoding of day of year (1 - 365),Time,numeric,FALSE +Sale Day of Month,time_sale_day_of_month,Numeric encoding of day of month (1 - 31),Time,numeric,FALSE +Sale Day of Week,time_sale_day_of_week,Numeric encoding of day of week (1 - 7),Time,numeric,FALSE +Sale After COVID-19,time_sale_post_covid,"Indicator for whether sale occurred after COVID-19 was widely publicized (around March 15, 2020)",Time,logical,FALSE diff --git a/renv.lock b/renv.lock index daec5a2..216734e 100644 --- a/renv.lock +++ b/renv.lock @@ -279,11 +279,11 @@ "Version": "1.3.0", "Source": "GitHub", "RemoteType": "github", - "RemoteHost": "api.github.com", "RemoteUsername": "ccao-data", "RemoteRepo": "ccao", "RemoteRef": "master", - "RemoteSha": "8b6f53e14c1732fcec5f6982fbc4bfb32f45f194", + "RemoteSha": "a5449b9717323de3c51ee1948e4431175d6ccda0", + "RemoteHost": "api.github.com", "Requirements": [ "R", 
"assessr", @@ -292,7 +292,7 @@ "rlang", "tidyr" ], - "Hash": "1663306aa228ded9892f07d65ec20db3" + "Hash": "c29ff9f60bde4f122b8fe0bb75f02e8c" }, "class": { "Package": "class",