-
Notifications
You must be signed in to change notification settings - Fork 2
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
natural gas pipelines proxies #33
base: main
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,208 @@ | ||
#%% | ||
from pathlib import Path | ||
from typing import Annotated | ||
|
||
import geopandas as gpd | ||
import pandas as pd | ||
import numpy as np | ||
from pytask import Product, mark, task | ||
import rasterio | ||
from rasterio.features import rasterize, shapes | ||
from shapely.geometry import shape | ||
|
||
from gch4i.config import ( | ||
max_year, | ||
min_year, | ||
proxy_data_dir_path, | ||
sector_data_dir_path, | ||
V3_DATA_PATH | ||
) | ||
|
||
lower_48_and_DC = ('AL', 'AR', 'AZ', 'CA', 'CO', 'CT', 'DE', 'FL', 'GA', 'IA', 'ID', 'IL', | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Can we grab the keys from the |
||
'IN', 'KS', 'KY', 'LA', 'MA', 'MD', 'ME', 'MI', 'MN', 'MO', 'MS', 'MT', | ||
'NC', 'ND', 'NE', 'NH', 'NJ', 'NM', 'NV', 'NY', 'OH', 'OK', 'OR', 'PA', | ||
'RI', 'SC', 'SD', 'TN', 'TX', 'UT', 'VA', 'VT', 'WA', 'WI', 'WV', 'WY', 'DC') | ||
|
||
verbose = False | ||
|
||
@mark.persist | ||
@task(id="farm_pipeline_proxy") | ||
def get_farm_pipeline_proxy_data( | ||
#Inputs | ||
enverus_midstream_pipelines_path: Path = sector_data_dir_path / "enverus/midstream/Rextag_Natural_Gas.gdb", | ||
croplands_path_template: Path = sector_data_dir_path / "nass_cdl/{year}_30m_cdls_all_crop_binary.tif", | ||
|
||
#Outputs | ||
output_path: Annotated[Path, Product] = proxy_data_dir_path / "farms_pipeline_proxy.parquet" | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. remove s from farms -> |
||
): | ||
|
||
############################################################### | ||
# Load and prepare pipeline data | ||
############################################################### | ||
|
||
if verbose: print('loading state and pipeline data') | ||
|
||
# Load state geometries and pipeline data (keeping existing code) | ||
state_shapes = gpd.read_file(V3_DATA_PATH / 'geospatial/cb_2018_us_state_500k/cb_2018_us_state_500k.shp') | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Can you move this path into the function parameters so it is clear this file is being used in the proxy creation? |
||
state_shapes = state_shapes.rename(columns={'STUSPS': 'state_code'})[['state_code', 'geometry']] | ||
state_shapes = state_shapes[state_shapes['state_code'].isin(lower_48_and_DC)] | ||
state_shapes = state_shapes.to_crs(epsg=4326) | ||
|
||
# Load and filter pipeline data | ||
enverus_pipelines_gdf = gpd.read_file(enverus_midstream_pipelines_path, layer='NaturalGasPipelines') | ||
enverus_pipelines_gdf = enverus_pipelines_gdf.to_crs(epsg=4326) | ||
enverus_pipelines_gdf = enverus_pipelines_gdf[ | ||
(enverus_pipelines_gdf['STATUS'] == 'Operational') & | ||
(enverus_pipelines_gdf['TYPE'] == 'Transmission') | ||
][['LOC_ID', 'DIAMETER', 'geometry']].reset_index(drop=True) | ||
|
||
############################################################### | ||
# Make one year's proxy gdf | ||
############################################################### | ||
|
||
def process_year(year, enverus_pipelines_gdf, croplands_path, state_shapes): | ||
|
||
year_pipelines_gdf = enverus_pipelines_gdf.copy() | ||
|
||
############################################################### | ||
# Load cropland data and find raster intersection | ||
############################################################### | ||
|
||
if verbose: print(' loading cropland raster data') | ||
with rasterio.open(croplands_path) as src: | ||
# Read cropland data | ||
cropland_mask = src.read(1) > 0 | ||
transform = src.transform | ||
raster_crs = src.crs | ||
|
||
#reproject pipelines to raster's CRS for raster operations | ||
year_pipelines_gdf_raster_crs = year_pipelines_gdf.to_crs(raster_crs).copy() | ||
|
||
# Rasterize pipelines to same grid as cropland | ||
if verbose: print(' rasterizing pipelines') | ||
pipeline_shapes = ((geom, 1) for geom in year_pipelines_gdf_raster_crs.geometry) | ||
pipeline_raster = rasterize( | ||
pipeline_shapes, | ||
out_shape=cropland_mask.shape, | ||
transform=transform, | ||
dtype=np.uint8 | ||
) | ||
|
||
# Find intersection areas | ||
if verbose: print(' finding cropland-pipelines intersection') | ||
intersection_mask = (cropland_mask & (pipeline_raster > 0)) | ||
|
||
############################################################### | ||
# Find vector intersection using raster intersection | ||
############################################################### | ||
|
||
# Vectorize only the intersection areas | ||
if verbose: print(' vectorizing cropland raster bins which intersect pipeline bins') | ||
intersection_shapes = shapes( | ||
intersection_mask.astype(np.uint8), | ||
transform=transform, | ||
connectivity=4, | ||
mask=intersection_mask | ||
) | ||
intersection_geometries = [ | ||
{"geometry": shape(geom), "properties": {"value": val}} | ||
for geom, val in intersection_shapes if val == 1 | ||
] | ||
|
||
# Convert intersection shapes to GeoDataFrame | ||
if verbose: print(' put intersecting cropland geometries into gdf') | ||
if intersection_geometries: | ||
intersection_gdf = gpd.GeoDataFrame.from_features( | ||
intersection_geometries, | ||
crs=raster_crs | ||
).to_crs(epsg=4326) | ||
|
||
# Intersect with original pipelines to maintain pipeline attributes | ||
if verbose: print(' computing vector overlay between cropland and pipelines') | ||
year_pipelines_gdf = gpd.overlay( | ||
year_pipelines_gdf, | ||
intersection_gdf, | ||
how='intersection', | ||
keep_geom_type=True | ||
) | ||
|
||
# Recombine each pipeline into single geometry | ||
if verbose: print(' recombining split pipeline geometries') | ||
year_pipelines_gdf = year_pipelines_gdf.dissolve( | ||
by=['LOC_ID', 'DIAMETER'], #if rows match on all these columns, then combine their geometries into a single geometry in one row | ||
aggfunc='first' #for other columns besides geometry and the matched columns, use the first row's values. there are no other columns, though. | ||
).reset_index() | ||
|
||
else: | ||
# Create empty GeoDataFrame with same columns as enverus_pipelines_gdf | ||
year_pipelines_gdf = gpd.GeoDataFrame( | ||
columns=year_pipelines_gdf.columns, | ||
geometry=[], | ||
crs=year_pipelines_gdf.crs | ||
) | ||
|
||
############################################################### | ||
# Split by state and create vector proxy gdf | ||
############################################################### | ||
|
||
if verbose: print(' splitting by state') | ||
|
||
# Split pipelines at state boundaries | ||
enverus_pipelines_gdf_split_by_state = gpd.overlay( | ||
year_pipelines_gdf, | ||
state_shapes, | ||
how='intersection', | ||
keep_geom_type=True | ||
) | ||
|
||
proxy_gdf = enverus_pipelines_gdf_split_by_state.copy() | ||
proxy_gdf['year'] = year | ||
|
||
############################################################### | ||
# Calculate pipeline lengths and normalize rel_emi | ||
############################################################### | ||
|
||
if verbose: print(' computing pipeline lengths and final proxy gdf') | ||
|
||
# Calculate pipeline lengths | ||
proxy_gdf = proxy_gdf.to_crs("ESRI:102003") | ||
proxy_gdf['rel_emi'] = proxy_gdf.geometry.length | ||
proxy_gdf = proxy_gdf.to_crs(epsg=4326) | ||
|
||
# Normalize relative emissions to sum to 1 for each year and state | ||
proxy_gdf = proxy_gdf.groupby(['state_code', 'year']).filter(lambda x: x['rel_emi'].sum() > 0) #drop state-years with 0 total volume | ||
proxy_gdf['rel_emi'] = proxy_gdf.groupby(['year', 'state_code'])['rel_emi'].transform(lambda x: x / x.sum() if x.sum() > 0 else 0) #normalize to sum to 1 | ||
sums = proxy_gdf.groupby(["state_code", "year"])["rel_emi"].sum() #get sums to check normalization | ||
assert np.isclose(sums, 1.0, atol=1e-8).all(), f"Relative emissions do not sum to 1 for each year and state; {sums}" # assert that the sums are close to 1 | ||
|
||
return proxy_gdf | ||
|
||
|
||
############################################################### | ||
# Process each year, aggregate, and save | ||
############################################################### | ||
|
||
year_proxy_gdfs = [] | ||
for year in range(min_year, max_year+1): | ||
if verbose: print(f'Processing year {year}') | ||
croplands_path = croplands_path_template.parent / croplands_path_template.name.format(year=year) | ||
year_proxy_gdf = process_year(year, enverus_pipelines_gdf, croplands_path, state_shapes) | ||
year_proxy_gdfs.append(year_proxy_gdf) | ||
|
||
# Double check normalization | ||
sums = proxy_gdf.groupby(["state_code", "year"])["rel_emi"].sum() #get sums to check normalization | ||
assert np.isclose(sums, 1.0, atol=1e-8).all(), f"Relative emissions do not sum to 1 for each year and state; {sums}" # assert that the sums are close to 1 | ||
|
||
# Combine all years into single dataframe | ||
proxy_gdf = pd.concat(year_proxy_gdfs, ignore_index=True) | ||
|
||
# Save to parquet | ||
proxy_gdf.to_parquet(output_path) | ||
|
||
# return proxy_gdf | ||
# df = get_farm_pipeline_proxy_data() | ||
# df.info() | ||
# df | ||
|
||
|
||
# %% |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,98 @@ | ||
#%% | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Do you want to add a header here before it goes to main? |
||
from pathlib import Path | ||
from typing import Annotated | ||
|
||
import geopandas as gpd | ||
import pandas as pd | ||
import numpy as np | ||
from pytask import Product, mark, task | ||
|
||
from gch4i.config import ( | ||
max_year, | ||
min_year, | ||
proxy_data_dir_path, | ||
sector_data_dir_path, | ||
V3_DATA_PATH | ||
) | ||
from gch4i.utils import ( | ||
us_state_to_abbrev_dict | ||
) | ||
|
||
lower_48_and_DC = ('AL', 'AR', 'AZ', 'CA', 'CO', 'CT', 'DE', 'FL', 'GA', 'IA', 'ID', 'IL', | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Can use values from |
||
'IN', 'KS', 'KY', 'LA', 'MA', 'MD', 'ME', 'MI', 'MN', 'MO', 'MS', 'MT', | ||
'NC', 'ND', 'NE', 'NH', 'NJ', 'NM', 'NV', 'NY', 'OH', 'OK', 'OR', 'PA', | ||
'RI', 'SC', 'SD', 'TN', 'TX', 'UT', 'VA', 'VT', 'WA', 'WI', 'WV', 'WY', 'DC') | ||
|
||
@mark.persist | ||
@task(id="trans_pipeline_proxy") | ||
def get_trans_pipeline_proxy_data( | ||
#Inputs | ||
enverus_midstream_pipelines_path: Path = sector_data_dir_path / "enverus/midstream/Rextag_Natural_Gas.gdb", | ||
|
||
#Outputs | ||
output_path: Annotated[Path, Product] = ( | ||
proxy_data_dir_path / "trans_pipeline_proxy.parquet" | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. ADD "s" to pipelines -> |
||
), | ||
): | ||
############################################################### | ||
# Load data | ||
############################################################### | ||
|
||
# load state geometries for continental US | ||
state_shapes = gpd.read_file(V3_DATA_PATH / 'geospatial/cb_2018_us_state_500k/cb_2018_us_state_500k.shp') | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Please move this path into the parameters for the function so we can see it is being used. |
||
state_shapes = state_shapes.rename(columns={'STUSPS': 'state_code'})[['state_code', 'geometry']] | ||
state_shapes = state_shapes[state_shapes['state_code'].isin(lower_48_and_DC)] | ||
state_shapes = state_shapes.to_crs(epsg=4326) # Ensure state_shapes is in EPSG:4326 | ||
|
||
# load enverus pipelines geometries data | ||
enverus_pipelines_gdf = gpd.read_file(enverus_midstream_pipelines_path, layer='NaturalGasPipelines') | ||
enverus_pipelines_gdf = enverus_pipelines_gdf.to_crs(epsg=4326) # Ensure enverus_pipelines_gdf is in EPSG:4326 | ||
|
||
#drop unneeded pipelines and columns | ||
# enverus_pipelines_gdf = enverus_pipelines_gdf[enverus_pipelines_gdf['CNTRY_NAME']=='United States'] #dropped, because not used in v2 and state intersection will filter this. | ||
enverus_pipelines_gdf = enverus_pipelines_gdf[enverus_pipelines_gdf['STATUS']=='Operational'] | ||
enverus_pipelines_gdf = enverus_pipelines_gdf[enverus_pipelines_gdf['TYPE'] =='Transmission'] | ||
enverus_pipelines_gdf = enverus_pipelines_gdf[['LOC_ID','DIAMETER','geometry', #'INSTALL_YR', | ||
]].reset_index(drop=True) | ||
|
||
# Split pipelines at state boundaries and keep only the segments that fall within states | ||
enverus_pipelines_gdf_split_by_state = gpd.overlay(enverus_pipelines_gdf, state_shapes, how='intersection', keep_geom_type=True) | ||
|
||
#create each year's proxy, and combine to make the full proxy table | ||
# NOTE: this assumes the geometries do not change year to year, since we dont have good data for when each pipeline began operation. | ||
year_gdfs = [] | ||
for year in range(min_year, max_year+1): | ||
year_gdf = enverus_pipelines_gdf_split_by_state.copy() | ||
# year_gdf = year_gdf[year_gdf['INSTALL_YR'] > year] # not applicable since most INSTALL_YR are NaN | ||
year_gdf['year'] = year | ||
year_gdfs.append(year_gdf) | ||
proxy_gdf = pd.concat(year_gdfs, ignore_index=True) | ||
|
||
#compute the length of each pipeline within each state | ||
proxy_gdf = proxy_gdf.to_crs("ESRI:102003") # Convert to ESRI:102003 | ||
proxy_gdf['pipeline_length_within_state'] = proxy_gdf.geometry.length | ||
proxy_gdf = proxy_gdf.to_crs(epsg=4326) # Convert back to EPSG:4326 | ||
|
||
#get rel_emi | ||
proxy_gdf = proxy_gdf.rename(columns = {'pipeline_length_within_state':'rel_emi'}) #simply assume rel_emi is proportional to length. | ||
|
||
############################################################### | ||
# Normalize and save | ||
############################################################### | ||
|
||
# Normalize relative emissions to sum to 1 for each year and state | ||
proxy_gdf = proxy_gdf.groupby(['state_code', 'year']).filter(lambda x: x['rel_emi'].sum() > 0) #drop state-years with 0 total volume | ||
proxy_gdf['rel_emi'] = proxy_gdf.groupby(['year', 'state_code'])['rel_emi'].transform(lambda x: x / x.sum() if x.sum() > 0 else 0) #normalize to sum to 1 | ||
sums = proxy_gdf.groupby(["state_code", "year"])["rel_emi"].sum() #get sums to check normalization | ||
assert np.isclose(sums, 1.0, atol=1e-8).all(), f"Relative emissions do not sum to 1 for each year and state; {sums}" # assert that the sums are close to 1 | ||
|
||
# Save to parquet | ||
proxy_gdf.to_parquet(output_path) | ||
|
||
return proxy_gdf | ||
|
||
# proxy_gdf = get_trans_pipeline_proxy_data() | ||
|
||
# #%% | ||
# proxy_gdf.info() | ||
# proxy_gdf |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Do you want to add the header here before it goes to main?