Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Update land ingest script #708

Merged
merged 12 commits into from
Jan 15, 2025
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
library(arrow)
library(aws.s3)
library(ccao)
library(dplyr)
library(noctua)
library(openxlsx)
Expand Down Expand Up @@ -29,6 +30,9 @@ remote_file_raw_nbhd_rate_2023 <- file.path(
# Locations of the raw neighborhood land rate workbooks, one per file year.
# Each is passed as `object` to aws.s3::save_object() further down.
remote_file_raw_nbhd_rate_2024 <- file.path(
input_bucket, "nbhd_rate", "2024.xlsx"
)
remote_file_raw_nbhd_rate_2025 <- file.path(
input_bucket, "nbhd_rate", "2025.xlsx"
)
# Output location for the combined rates; used as `path` in
# arrow::write_dataset() at the end of the script.
remote_file_warehouse_nbhd_rate <- file.path(
output_bucket, "land_nbhd_rate"
)
Expand All @@ -38,6 +42,7 @@ remote_file_warehouse_nbhd_rate <- file.path(
# Local temp files (one per file year) that the raw workbooks are saved to
# and then read from with openxlsx::read.xlsx()
tmp_file_nbhd_rate_2022 <- tempfile(fileext = ".xlsx")
tmp_file_nbhd_rate_2023 <- tempfile(fileext = ".xlsx")
tmp_file_nbhd_rate_2024 <- tempfile(fileext = ".xlsx")
tmp_file_nbhd_rate_2025 <- tempfile(fileext = ".xlsx")

# Grab the workbook from the raw S3 bucket
aws.s3::save_object(
Expand All @@ -52,6 +57,10 @@ aws.s3::save_object(
object = remote_file_raw_nbhd_rate_2024,
file = tmp_file_nbhd_rate_2024
)
# Save the 2025 workbook to its local temp file
aws.s3::save_object(
object = remote_file_raw_nbhd_rate_2025,
file = tmp_file_nbhd_rate_2025
)

# List of regression classes
class <- dbGetQuery(
Expand All @@ -77,7 +86,8 @@ land_nbhd_rate_2022 <- openxlsx::read.xlsx(tmp_file_nbhd_rate_2022) %>%
mutate(
across(c(township_code:town_nbhd, year), as.character),
town_nbhd = str_remove_all(town_nbhd, "-"),
land_rate_per_sqft = parse_number(land_rate_per_sqft)
land_rate_per_sqft = parse_number(land_rate_per_sqft),
data_year = "2022"
) %>%
expand_grid(class)

Expand All @@ -98,7 +108,9 @@ land_nbhd_rate_2023 <- openxlsx::read.xlsx(tmp_file_nbhd_rate_2023) %>%
c(`2020`, `2023`),
names_to = "year", values_to = "land_rate_per_sqft"
) %>%
mutate(across(c(township_code:town_nbhd, year), as.character)) %>%
mutate(across(c(township_code:town_nbhd, year), as.character),
data_year = "2023"
) %>%
expand_grid(class)

land_nbhd_rate_2024 <- openxlsx::read.xlsx(tmp_file_nbhd_rate_2024) %>%
Expand All @@ -124,7 +136,9 @@ land_nbhd_rate_2024 <- openxlsx::read.xlsx(tmp_file_nbhd_rate_2024) %>%
c(`2021`, `2024`),
names_to = "year", values_to = "land_rate_per_sqft"
) %>%
mutate(across(c(township_code:town_nbhd, year), as.character)) %>%
mutate(across(c(township_code:town_nbhd, year), as.character),
data_year = "2024"
) %>%
expand_grid(class) %>%
# 2024 contains bifurcated neighborhood land rates across class
filter(
Expand All @@ -133,15 +147,60 @@ land_nbhd_rate_2024 <- openxlsx::read.xlsx(tmp_file_nbhd_rate_2024) %>%
) %>%
select(-classes)

# Ingest the 2025 neighborhood land rate workbook. Per its column names, the
# file holds a 2022 rate and a proposed 2025 rate for each neighborhood, plus
# a `bifurcated_rate` label used below to match rates to classes.
land_nbhd_rate_2025 <- openxlsx::read.xlsx(tmp_file_nbhd_rate_2025) %>%
  set_names(snakecase::to_snake_case(names(.))) %>%
  select(
    town_nbhd = twp_nbhd,
    classes = bifurcated_rate,
    `2022` = "2022_rate",
    `2025` = "proposed_2025_rate"
  ) %>%
  mutate(
    # Strip every non-digit character from the neighborhood code
    town_nbhd = gsub("\\D", "", town_nbhd),
    # The township code is taken from the first two digits
    township_code = substr(town_nbhd, 1, 2),
    township_name = ccao::town_convert(township_code),
    # The 2022 column contains the text value "All EX" (filtered out below),
    # so it is presumably read as character; cast 2025 to character as well
    # so both columns can be pivoted into a single value column
    `2025` = as.character(`2025`)
  ) %>%
  relocate(c(township_code, township_name)) %>%
  pivot_longer(
    c(`2022`, `2025`),
    names_to = "year", values_to = "land_rate_per_sqft"
  ) %>%
  mutate(across(c(township_code:town_nbhd, year), as.character)) %>%
  # The 2022 value for NBHD 35100 is the text "All EX" rather than a rate, so
  # drop that row. NOTE: filter() also drops rows where the condition is NA,
  # so rows with a missing rate are removed here as well.
  filter(land_rate_per_sqft != "All EX") %>%
  mutate(
    # Any remaining non-numeric rate is coerced to NA (as.numeric() warns)
    land_rate_per_sqft = as.numeric(land_rate_per_sqft),
    # Tags rows with the file year; used to pick the newest file per `year`
    # when all years are combined
    data_year = "2025"
  ) %>%
  expand_grid(class) %>%
  # 2025 contains bifurcated neighborhood land rates across class.
  # Make sure to change the text in future years, since they are not static
  # (note the trailing space in "All Other Res. ").
  # `%in%` is used instead of `==` so that a missing `classes` value compares
  # as FALSE rather than NA; with `==`, filter() would silently drop every
  # row of a neighborhood that has no bifurcation label.
  filter(
    !(classes %in% "All Other Res. " & class %in% c("210", "295")),
    !(classes %in% "2-10/2-95 Rate" & !(class %in% c("210", "295")))
  ) %>%
  select(-classes)

# Write the rates to S3, partitioned by year
bind_rows(
land_nbhd_rate_2022,
land_nbhd_rate_2023,
land_nbhd_rate_2024
land_nbhd_rate_2024,
land_nbhd_rate_2025
) %>%
relocate(land_rate_per_sqft, .after = last_col()) %>%
mutate(loaded_at = as.character(Sys.time())) %>%
group_by(year) %>%
# Each file contains rates for its own assessment year and for the
# previous assessment of the same tri (three years prior), so the same
# `year` can appear in more than one file as files accumulate.
# The overlapping data is not exactly identical; the notable difference
# for 2025 is that a handful of neighborhoods were removed from the file.
# Where years overlap, we keep only the rows from the most recent file.
filter(data_year == max(data_year)) %>%
select(-data_year) %>%
arrow::write_dataset(
path = remote_file_warehouse_nbhd_rate,
format = "parquet",
Expand Down
Loading