From d8fc724aa20a0e61b57b91791b69e14bab656b14 Mon Sep 17 00:00:00 2001 From: John Bollenbacher Date: Mon, 16 Dec 2024 10:43:12 -0500 Subject: [PATCH 1/3] task: trans_pipelines_proxy --- .../task_trans_pipelines_proxy.py | 98 +++++++++++++++++++ 1 file changed, 98 insertions(+) create mode 100644 gch4i/proxy_processing/task_trans_pipelines_proxy.py diff --git a/gch4i/proxy_processing/task_trans_pipelines_proxy.py b/gch4i/proxy_processing/task_trans_pipelines_proxy.py new file mode 100644 index 0000000..d8c789e --- /dev/null +++ b/gch4i/proxy_processing/task_trans_pipelines_proxy.py @@ -0,0 +1,98 @@ +#%% +from pathlib import Path +from typing import Annotated + +import geopandas as gpd +import pandas as pd +import numpy as np +from pytask import Product, mark, task + +from gch4i.config import ( + max_year, + min_year, + proxy_data_dir_path, + sector_data_dir_path, + V3_DATA_PATH +) +from gch4i.utils import ( + us_state_to_abbrev_dict +) + +lower_48_and_DC = ('AL', 'AR', 'AZ', 'CA', 'CO', 'CT', 'DE', 'FL', 'GA', 'IA', 'ID', 'IL', + 'IN', 'KS', 'KY', 'LA', 'MA', 'MD', 'ME', 'MI', 'MN', 'MO', 'MS', 'MT', + 'NC', 'ND', 'NE', 'NH', 'NJ', 'NM', 'NV', 'NY', 'OH', 'OK', 'OR', 'PA', + 'RI', 'SC', 'SD', 'TN', 'TX', 'UT', 'VA', 'VT', 'WA', 'WI', 'WV', 'WY', 'DC') + +@mark.persist +@task(id="trans_pipeline_proxy") +def get_trans_pipeline_proxy_data( + #Inputs + enverus_midstream_pipelines_path: Path = sector_data_dir_path / "enverus/midstream/Rextag_Natural_Gas.gdb", + + #Outputs + output_path: Annotated[Path, Product] = ( + proxy_data_dir_path / "trans_pipeline_proxy.parquet" + ), +): + ############################################################### + # Load data + ############################################################### + + # load state geometries for continental US + state_shapes = gpd.read_file(V3_DATA_PATH / 'geospatial/cb_2018_us_state_500k/cb_2018_us_state_500k.shp') + state_shapes = state_shapes.rename(columns={'STUSPS': 'state_code'})[['state_code', 'geometry']] + state_shapes = state_shapes[state_shapes['state_code'].isin(lower_48_and_DC)] + state_shapes = state_shapes.to_crs(epsg=4326) # Ensure state_shapes is in EPSG:4326 + + # load enverus pipelines geometries data + enverus_pipelines_gdf = gpd.read_file(enverus_midstream_pipelines_path, layer='NaturalGasPipelines') + enverus_pipelines_gdf = enverus_pipelines_gdf.to_crs(epsg=4326) # Ensure enverus_pipelines_gdf is in EPSG:4326 + + #drop unneeded pipelines and columns + # enverus_pipelines_gdf = enverus_pipelines_gdf[enverus_pipelines_gdf['CNTRY_NAME']=='United States'] #dropped, because not used in v2 and state intersection will filter this. + enverus_pipelines_gdf = enverus_pipelines_gdf[enverus_pipelines_gdf['STATUS']=='Operational'] + enverus_pipelines_gdf = enverus_pipelines_gdf[enverus_pipelines_gdf['TYPE'] =='Transmission'] + enverus_pipelines_gdf = enverus_pipelines_gdf[['LOC_ID','DIAMETER','geometry', #'INSTALL_YR', + ]].reset_index(drop=True) + + # Split pipelines at state boundaries and keep only the segments that fall within states + enverus_pipelines_gdf_split_by_state = gpd.overlay(enverus_pipelines_gdf, state_shapes, how='intersection', keep_geom_type=True) + + #create each year's proxy, and combine to make the full proxy table + # NOTE: this assumes the geometries do not change year to year, since we dont have good data for when each pipeline began operation. + year_gdfs = [] + for year in range(min_year, max_year+1): + year_gdf = enverus_pipelines_gdf_split_by_state.copy() + # year_gdf = year_gdf[year_gdf['INSTALL_YR'] > year] # not applicable since most INSTALL_YR are NaN + year_gdf['year'] = year + year_gdfs.append(year_gdf) + proxy_gdf = pd.concat(year_gdfs, ignore_index=True) + + #compute the length of each pipeline within each state + proxy_gdf = proxy_gdf.to_crs("ESRI:102003") # Convert to ESRI:102003 + proxy_gdf['pipeline_length_within_state'] = proxy_gdf.geometry.length + proxy_gdf = proxy_gdf.to_crs(epsg=4326) # Convert back to EPSG:4326 + + #get rel_emi + proxy_gdf = proxy_gdf.rename(columns = {'pipeline_length_within_state':'rel_emi'}) #simply assume rel_emi is proportional to length. + + ############################################################### + # Normalize and save + ############################################################### + + # Normalize relative emissions to sum to 1 for each year and state + proxy_gdf = proxy_gdf.groupby(['state_code', 'year']).filter(lambda x: x['rel_emi'].sum() > 0) #drop state-years with 0 total volume + proxy_gdf['rel_emi'] = proxy_gdf.groupby(['year', 'state_code'])['rel_emi'].transform(lambda x: x / x.sum() if x.sum() > 0 else 0) #normalize to sum to 1 + sums = proxy_gdf.groupby(["state_code", "year"])["rel_emi"].sum() #get sums to check normalization + assert np.isclose(sums, 1.0, atol=1e-8).all(), f"Relative emissions do not sum to 1 for each year and state; {sums}" # assert that the sums are close to 1 + + # Save to parquet + proxy_gdf.to_parquet(output_path) + + return proxy_gdf + +# proxy_gdf = get_trans_pipeline_proxy_data() + +# #%% +# proxy_gdf.info() +# proxy_gdf From 69b8170c9cee1fd92999838ed8d5a677684e8dd6 Mon Sep 17 00:00:00 2001 From: John Bollenbacher Date: Wed, 1 Jan 2025 15:20:27 -0500 Subject: [PATCH 2/3] farm_pipelines_proxy From 9bb8b0cf5534f93d892625eb9edc25352beddf19 Mon Sep 17 00:00:00 2001 From: John Bollenbacher Date: Wed, 1 Jan 2025 15:22:22 -0500 Subject: [PATCH 3/3] task: farm_pipeline_proxy --- .../task_farm_pipelines_proxy.py | 208 ++++++++++++++++++ 1 file changed, 208 insertions(+) create mode 100644 gch4i/proxy_processing/task_farm_pipelines_proxy.py diff --git a/gch4i/proxy_processing/task_farm_pipelines_proxy.py b/gch4i/proxy_processing/task_farm_pipelines_proxy.py new file mode 100644 index 0000000..2a5a3d3 --- /dev/null +++ b/gch4i/proxy_processing/task_farm_pipelines_proxy.py @@ -0,0 +1,208 @@ +#%% +from pathlib import Path +from typing import Annotated + +import geopandas as gpd +import pandas as pd +import numpy as np +from pytask import Product, mark, task +import rasterio +from rasterio.features import rasterize, shapes +from shapely.geometry import shape + +from gch4i.config import ( + max_year, + min_year, + proxy_data_dir_path, + sector_data_dir_path, + V3_DATA_PATH +) + +lower_48_and_DC = ('AL', 'AR', 'AZ', 'CA', 'CO', 'CT', 'DE', 'FL', 'GA', 'IA', 'ID', 'IL', + 'IN', 'KS', 'KY', 'LA', 'MA', 'MD', 'ME', 'MI', 'MN', 'MO', 'MS', 'MT', + 'NC', 'ND', 'NE', 'NH', 'NJ', 'NM', 'NV', 'NY', 'OH', 'OK', 'OR', 'PA', + 'RI', 'SC', 'SD', 'TN', 'TX', 'UT', 'VA', 'VT', 'WA', 'WI', 'WV', 'WY', 'DC') + +verbose = False + +@mark.persist +@task(id="farm_pipeline_proxy") +def get_farm_pipeline_proxy_data( + #Inputs + enverus_midstream_pipelines_path: Path = sector_data_dir_path / "enverus/midstream/Rextag_Natural_Gas.gdb", + croplands_path_template: Path = sector_data_dir_path / "nass_cdl/{year}_30m_cdls_all_crop_binary.tif", + + #Outputs + output_path: Annotated[Path, Product] = proxy_data_dir_path / "farms_pipeline_proxy.parquet" +): + + ############################################################### + # Load and prepare pipeline data + ############################################################### + + if verbose: print('loading state and pipeline data') + + # Load state geometries and pipeline data (keeping existing code) + state_shapes = gpd.read_file(V3_DATA_PATH / 'geospatial/cb_2018_us_state_500k/cb_2018_us_state_500k.shp') + state_shapes = state_shapes.rename(columns={'STUSPS': 'state_code'})[['state_code', 'geometry']] + state_shapes = state_shapes[state_shapes['state_code'].isin(lower_48_and_DC)] + state_shapes = state_shapes.to_crs(epsg=4326) + + # Load and filter pipeline data + enverus_pipelines_gdf = gpd.read_file(enverus_midstream_pipelines_path, layer='NaturalGasPipelines') + enverus_pipelines_gdf = enverus_pipelines_gdf.to_crs(epsg=4326) + enverus_pipelines_gdf = enverus_pipelines_gdf[ + (enverus_pipelines_gdf['STATUS'] == 'Operational') & + (enverus_pipelines_gdf['TYPE'] == 'Transmission') + ][['LOC_ID', 'DIAMETER', 'geometry']].reset_index(drop=True) + + ############################################################### + # Make one year's proxy gdf + ############################################################### + + def process_year(year, enverus_pipelines_gdf, croplands_path, state_shapes): + + year_pipelines_gdf = enverus_pipelines_gdf.copy() + + ############################################################### + # Load cropland data and find raster intersection + ############################################################### + + if verbose: print(' loading cropland raster data') + with rasterio.open(croplands_path) as src: + # Read cropland data + cropland_mask = src.read(1) > 0 + transform = src.transform + raster_crs = src.crs + + #reproject pipelines to raster's CRS for raster operations + year_pipelines_gdf_raster_crs = year_pipelines_gdf.to_crs(raster_crs).copy() + + # Rasterize pipelines to same grid as cropland + if verbose: print(' rasterizing pipelines') + pipeline_shapes = ((geom, 1) for geom in year_pipelines_gdf_raster_crs.geometry) + pipeline_raster = rasterize( + pipeline_shapes, + out_shape=cropland_mask.shape, + transform=transform, + dtype=np.uint8 + ) + + # Find intersection areas + if verbose: print(' finding cropland-pipelines intersection') + intersection_mask = (cropland_mask & (pipeline_raster > 0)) + + ############################################################### + # Find vector intersection using raster intersection + ############################################################### + + # Vectorize only the intersection areas + if verbose: print(' vectorizing cropland raster bins which intersect pipeline bins') + intersection_shapes = shapes( + intersection_mask.astype(np.uint8), + transform=transform, + connectivity=4, + mask=intersection_mask + ) + intersection_geometries = [ + {"geometry": shape(geom), "properties": {"value": val}} + for geom, val in intersection_shapes if val == 1 + ] + + # Convert intersection shapes to GeoDataFrame + if verbose: print(' put intersecting cropland geometries into gdf') + if intersection_geometries: + intersection_gdf = gpd.GeoDataFrame.from_features( + intersection_geometries, + crs=raster_crs + ).to_crs(epsg=4326) + + # Intersect with original pipelines to maintain pipeline attributes + if verbose: print(' computing vector overlay between cropland and pipelines') + year_pipelines_gdf = gpd.overlay( + year_pipelines_gdf, + intersection_gdf, + how='intersection', + keep_geom_type=True + ) + + # Recombine each pipeline into single geometry + if verbose: print(' recombining split pipeline geometries') + year_pipelines_gdf = year_pipelines_gdf.dissolve( + by=['LOC_ID', 'DIAMETER'], #if rows match on all these columns, then combine their geometries into a single geometry in one row + aggfunc='first' #for other columns besides geometry and the matched columns, use the first row's values. there are no other columns, though. + ).reset_index() + + else: + # Create empty GeoDataFrame with same columns as enverus_pipelines_gdf + year_pipelines_gdf = gpd.GeoDataFrame( + columns=year_pipelines_gdf.columns, + geometry=[], + crs=year_pipelines_gdf.crs + ) + + ############################################################### + # Split by state and create vector proxy gdf + ############################################################### + + if verbose: print(' splitting by state') + + # Split pipelines at state boundaries + enverus_pipelines_gdf_split_by_state = gpd.overlay( + year_pipelines_gdf, + state_shapes, + how='intersection', + keep_geom_type=True + ) + + proxy_gdf = enverus_pipelines_gdf_split_by_state.copy() + proxy_gdf['year'] = year + + ############################################################### + # Calculate pipeline lengths and normalize rel_emi + ############################################################### + + if verbose: print(' computing pipeline lengths and final proxy gdf') + + # Calculate pipeline lengths + proxy_gdf = proxy_gdf.to_crs("ESRI:102003") + proxy_gdf['rel_emi'] = proxy_gdf.geometry.length + proxy_gdf = proxy_gdf.to_crs(epsg=4326) + + # Normalize relative emissions to sum to 1 for each year and state + proxy_gdf = proxy_gdf.groupby(['state_code', 'year']).filter(lambda x: x['rel_emi'].sum() > 0) #drop state-years with 0 total volume + proxy_gdf['rel_emi'] = proxy_gdf.groupby(['year', 'state_code'])['rel_emi'].transform(lambda x: x / x.sum() if x.sum() > 0 else 0) #normalize to sum to 1 + sums = proxy_gdf.groupby(["state_code", "year"])["rel_emi"].sum() #get sums to check normalization + assert np.isclose(sums, 1.0, atol=1e-8).all(), f"Relative emissions do not sum to 1 for each year and state; {sums}" # assert that the sums are close to 1 + + return proxy_gdf + + + ############################################################### + # Process each year, aggregate, and save + ############################################################### + + year_proxy_gdfs = [] + for year in range(min_year, max_year+1): + if verbose: print(f'Processing year {year}') + croplands_path = croplands_path_template.parent / croplands_path_template.name.format(year=year) + year_proxy_gdf = process_year(year, enverus_pipelines_gdf, croplands_path, state_shapes) + year_proxy_gdfs.append(year_proxy_gdf) + + # Double check normalization + sums = proxy_gdf.groupby(["state_code", "year"])["rel_emi"].sum() #get sums to check normalization + assert np.isclose(sums, 1.0, atol=1e-8).all(), f"Relative emissions do not sum to 1 for each year and state; {sums}" # assert that the sums are close to 1 + + # Combine all years into single dataframe + proxy_gdf = pd.concat(year_proxy_gdfs, ignore_index=True) + + # Save to parquet + proxy_gdf.to_parquet(output_path) + +# return proxy_gdf +# df = get_farm_pipeline_proxy_data() +# df.info() +# df + + +# %%