natural gas pipelines proxies #33

Open
wants to merge 3 commits into main
208 changes: 208 additions & 0 deletions gch4i/proxy_processing/task_farm_pipelines_proxy.py
@@ -0,0 +1,208 @@
#%%
Collaborator: Do you want to add the header here before it goes to main?
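A possible header, as a sketch only (the field names are assumptions, not the repo's confirmed convention; the same would apply to task_trans_pipelines_proxy.py below):

"""
Name:     task_farm_pipelines_proxy.py
Authors:  ...
Purpose:  Build the farm pipelines proxy by intersecting Enverus midstream
          natural gas transmission pipelines with NASS CDL cropland rasters.
"""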

from pathlib import Path
from typing import Annotated

import geopandas as gpd
import pandas as pd
import numpy as np
from pytask import Product, mark, task
import rasterio
from rasterio.features import rasterize, shapes
from shapely.geometry import shape

from gch4i.config import (
max_year,
min_year,
proxy_data_dir_path,
sector_data_dir_path,
V3_DATA_PATH
)

lower_48_and_DC = ('AL', 'AR', 'AZ', 'CA', 'CO', 'CT', 'DE', 'FL', 'GA', 'IA', 'ID', 'IL',
Collaborator: Can we grab the keys from the us_state_to_abbrev_dict in utils.py for this list?
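A minimal sketch of that suggestion, assuming us_state_to_abbrev_dict maps full state names to two-letter codes and includes DC (the exclusion set below is an assumption; the same approach would cover the identical tuple in task_trans_pipelines_proxy.py):

from gch4i.utils import us_state_to_abbrev_dict

NON_LOWER_48 = {'AK', 'HI', 'AS', 'GU', 'MP', 'PR', 'VI'}  # hypothetical: codes outside the lower 48 + DC
lower_48_and_DC = tuple(
    code for code in us_state_to_abbrev_dict.values()
    if code not in NON_LOWER_48
)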

'IN', 'KS', 'KY', 'LA', 'MA', 'MD', 'ME', 'MI', 'MN', 'MO', 'MS', 'MT',
'NC', 'ND', 'NE', 'NH', 'NJ', 'NM', 'NV', 'NY', 'OH', 'OK', 'OR', 'PA',
'RI', 'SC', 'SD', 'TN', 'TX', 'UT', 'VA', 'VT', 'WA', 'WI', 'WV', 'WY', 'DC')

verbose = False

@mark.persist
@task(id="farm_pipeline_proxy")
def get_farm_pipeline_proxy_data(
#Inputs
enverus_midstream_pipelines_path: Path = sector_data_dir_path / "enverus/midstream/Rextag_Natural_Gas.gdb",
croplands_path_template: Path = sector_data_dir_path / "nass_cdl/{year}_30m_cdls_all_crop_binary.tif",

#Outputs
output_path: Annotated[Path, Product] = proxy_data_dir_path / "farms_pipeline_proxy.parquet"
Collaborator: Remove the 's' from 'farms': the file should be farm_pipelines_proxy. I've already changed the actual file name.

):

###############################################################
# Load and prepare pipeline data
###############################################################

if verbose: print('loading state and pipeline data')

# Load state geometries and pipeline data (keeping existing code)
state_shapes = gpd.read_file(V3_DATA_PATH / 'geospatial/cb_2018_us_state_500k/cb_2018_us_state_500k.shp')
Collaborator: Can you move this path into the function parameters so it is clear this file is being used in the proxy creation?
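A sketch of the suggested change (state_shapes_path is an illustrative parameter name):

def get_farm_pipeline_proxy_data(
    # Inputs
    enverus_midstream_pipelines_path: Path = sector_data_dir_path / "enverus/midstream/Rextag_Natural_Gas.gdb",
    croplands_path_template: Path = sector_data_dir_path / "nass_cdl/{year}_30m_cdls_all_crop_binary.tif",
    state_shapes_path: Path = V3_DATA_PATH / "geospatial/cb_2018_us_state_500k/cb_2018_us_state_500k.shp",
    # Outputs
    output_path: Annotated[Path, Product] = proxy_data_dir_path / "farm_pipelines_proxy.parquet",
):
    state_shapes = gpd.read_file(state_shapes_path)
    ...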

state_shapes = state_shapes.rename(columns={'STUSPS': 'state_code'})[['state_code', 'geometry']]
state_shapes = state_shapes[state_shapes['state_code'].isin(lower_48_and_DC)]
state_shapes = state_shapes.to_crs(epsg=4326)

# Load and filter pipeline data
enverus_pipelines_gdf = gpd.read_file(enverus_midstream_pipelines_path, layer='NaturalGasPipelines')
enverus_pipelines_gdf = enverus_pipelines_gdf.to_crs(epsg=4326)
enverus_pipelines_gdf = enverus_pipelines_gdf[
(enverus_pipelines_gdf['STATUS'] == 'Operational') &
(enverus_pipelines_gdf['TYPE'] == 'Transmission')
][['LOC_ID', 'DIAMETER', 'geometry']].reset_index(drop=True)

###############################################################
# Make one year's proxy gdf
###############################################################

def process_year(year, enverus_pipelines_gdf, croplands_path, state_shapes):

year_pipelines_gdf = enverus_pipelines_gdf.copy()

###############################################################
# Load cropland data and find raster intersection
###############################################################

if verbose: print(' loading cropland raster data')
with rasterio.open(croplands_path) as src:
# Read cropland data
cropland_mask = src.read(1) > 0
transform = src.transform
raster_crs = src.crs

#reproject pipelines to raster's CRS for raster operations
year_pipelines_gdf_raster_crs = year_pipelines_gdf.to_crs(raster_crs).copy()

# Rasterize pipelines to same grid as cropland
if verbose: print(' rasterizing pipelines')
pipeline_shapes = ((geom, 1) for geom in year_pipelines_gdf_raster_crs.geometry)
pipeline_raster = rasterize(
pipeline_shapes,
out_shape=cropland_mask.shape,
transform=transform,
dtype=np.uint8
)

# Find intersection areas
if verbose: print(' finding cropland-pipelines intersection')
intersection_mask = (cropland_mask & (pipeline_raster > 0))

###############################################################
# Find vector intersection using raster intersection
###############################################################

# Vectorize only the intersection areas
if verbose: print(' vectorizing cropland raster bins which intersect pipeline bins')
intersection_shapes = shapes(
intersection_mask.astype(np.uint8),
transform=transform,
connectivity=4,
mask=intersection_mask
)
intersection_geometries = [
{"geometry": shape(geom), "properties": {"value": val}}
for geom, val in intersection_shapes if val == 1
]

# Convert intersection shapes to GeoDataFrame
if verbose: print(' put intersecting cropland geometries into gdf')
if intersection_geometries:
intersection_gdf = gpd.GeoDataFrame.from_features(
intersection_geometries,
crs=raster_crs
).to_crs(epsg=4326)

# Intersect with original pipelines to maintain pipeline attributes
if verbose: print(' computing vector overlay between cropland and pipelines')
year_pipelines_gdf = gpd.overlay(
year_pipelines_gdf,
intersection_gdf,
how='intersection',
keep_geom_type=True
)

# Recombine each pipeline into single geometry
if verbose: print(' recombining split pipeline geometries')
year_pipelines_gdf = year_pipelines_gdf.dissolve(
by=['LOC_ID', 'DIAMETER'], # rows that match on these columns have their geometries merged into one row
aggfunc='first' # any other columns keep the first row's values (there are none besides geometry here)
).reset_index()

else:
# Create empty GeoDataFrame with same columns as enverus_pipelines_gdf
year_pipelines_gdf = gpd.GeoDataFrame(
columns=year_pipelines_gdf.columns,
geometry=[],
crs=year_pipelines_gdf.crs
)

###############################################################
# Split by state and create vector proxy gdf
###############################################################

if verbose: print(' splitting by state')

# Split pipelines at state boundaries
enverus_pipelines_gdf_split_by_state = gpd.overlay(
year_pipelines_gdf,
state_shapes,
how='intersection',
keep_geom_type=True
)

proxy_gdf = enverus_pipelines_gdf_split_by_state.copy()
proxy_gdf['year'] = year

###############################################################
# Calculate pipeline lengths and normalize rel_emi
###############################################################

if verbose: print(' computing pipeline lengths and final proxy gdf')

# Calculate pipeline lengths in meters (ESRI:102003 is USA Contiguous Albers Equal Area)
proxy_gdf = proxy_gdf.to_crs("ESRI:102003")
proxy_gdf['rel_emi'] = proxy_gdf.geometry.length
proxy_gdf = proxy_gdf.to_crs(epsg=4326)

# Normalize relative emissions to sum to 1 for each year and state
proxy_gdf = proxy_gdf.groupby(['state_code', 'year']).filter(lambda x: x['rel_emi'].sum() > 0) # drop state-years with zero total length
proxy_gdf['rel_emi'] = proxy_gdf.groupby(['year', 'state_code'])['rel_emi'].transform(lambda x: x / x.sum() if x.sum() > 0 else 0) # normalize to sum to 1
sums = proxy_gdf.groupby(["state_code", "year"])["rel_emi"].sum() # get sums to check normalization
assert np.isclose(sums, 1.0, atol=1e-8).all(), f"Relative emissions do not sum to 1 for each year and state; {sums}"

return proxy_gdf


###############################################################
# Process each year, aggregate, and save
###############################################################

year_proxy_gdfs = []
for year in range(min_year, max_year+1):
if verbose: print(f'Processing year {year}')
croplands_path = croplands_path_template.parent / croplands_path_template.name.format(year=year) # fill {year} into the template filename
year_proxy_gdf = process_year(year, enverus_pipelines_gdf, croplands_path, state_shapes)
year_proxy_gdfs.append(year_proxy_gdf)

# Combine all years into a single dataframe
proxy_gdf = pd.concat(year_proxy_gdfs, ignore_index=True)

# Double-check normalization on the combined dataframe
sums = proxy_gdf.groupby(["state_code", "year"])["rel_emi"].sum()
assert np.isclose(sums, 1.0, atol=1e-8).all(), f"Relative emissions do not sum to 1 for each year and state; {sums}"

# Save to parquet
proxy_gdf.to_parquet(output_path)

# return proxy_gdf
# df = get_farm_pipeline_proxy_data()
# df.info()
# df


# %%
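As a quick illustration of the per-state, per-year normalization used above (and again in the next file), with toy numbers:

import pandas as pd

toy = pd.DataFrame({
    'state_code': ['TX', 'TX', 'NM'],
    'year': [2020, 2020, 2020],
    'rel_emi': [30.0, 10.0, 5.0],  # segment lengths in meters
})
toy['rel_emi'] = toy.groupby(['year', 'state_code'])['rel_emi'].transform(lambda x: x / x.sum())
# TX rows become 0.75 and 0.25; the single NM row becomes 1.0, so each state-year sums to 1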
98 changes: 98 additions & 0 deletions gch4i/proxy_processing/task_trans_pipelines_proxy.py
@@ -0,0 +1,98 @@
#%%
Collaborator: Do you want to add a header here before it goes to main?

from pathlib import Path
from typing import Annotated

import geopandas as gpd
import pandas as pd
import numpy as np
from pytask import Product, mark, task

from gch4i.config import (
max_year,
min_year,
proxy_data_dir_path,
sector_data_dir_path,
V3_DATA_PATH
)
from gch4i.utils import (
us_state_to_abbrev_dict
)

lower_48_and_DC = ('AL', 'AR', 'AZ', 'CA', 'CO', 'CT', 'DE', 'FL', 'GA', 'IA', 'ID', 'IL',
Collaborator: Can we use the values from us_state_to_abbrev_dict in utils.py for this?

'IN', 'KS', 'KY', 'LA', 'MA', 'MD', 'ME', 'MI', 'MN', 'MO', 'MS', 'MT',
'NC', 'ND', 'NE', 'NH', 'NJ', 'NM', 'NV', 'NY', 'OH', 'OK', 'OR', 'PA',
'RI', 'SC', 'SD', 'TN', 'TX', 'UT', 'VA', 'VT', 'WA', 'WI', 'WV', 'WY', 'DC')

@mark.persist
@task(id="trans_pipeline_proxy")
def get_trans_pipeline_proxy_data(
#Inputs
enverus_midstream_pipelines_path: Path = sector_data_dir_path / "enverus/midstream/Rextag_Natural_Gas.gdb",

#Outputs
output_path: Annotated[Path, Product] = (
proxy_data_dir_path / "trans_pipeline_proxy.parquet"
Collaborator: Add an 's' to 'pipeline': the file should be trans_pipelines_proxy. I've already changed the file name.

),
):
###############################################################
# Load data
###############################################################

# load state geometries for continental US
state_shapes = gpd.read_file(V3_DATA_PATH / 'geospatial/cb_2018_us_state_500k/cb_2018_us_state_500k.shp')
Collaborator: Please move this path into the parameters for the function so we can see it is being used.
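Same suggestion as in the farm pipelines task above; a sketch (state_shapes_path is an illustrative name):

def get_trans_pipeline_proxy_data(
    enverus_midstream_pipelines_path: Path = sector_data_dir_path / "enverus/midstream/Rextag_Natural_Gas.gdb",
    state_shapes_path: Path = V3_DATA_PATH / "geospatial/cb_2018_us_state_500k/cb_2018_us_state_500k.shp",
    output_path: Annotated[Path, Product] = proxy_data_dir_path / "trans_pipelines_proxy.parquet",
):
    state_shapes = gpd.read_file(state_shapes_path)
    ...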

state_shapes = state_shapes.rename(columns={'STUSPS': 'state_code'})[['state_code', 'geometry']]
state_shapes = state_shapes[state_shapes['state_code'].isin(lower_48_and_DC)]
state_shapes = state_shapes.to_crs(epsg=4326) # Ensure state_shapes is in EPSG:4326

# load enverus pipelines geometries data
enverus_pipelines_gdf = gpd.read_file(enverus_midstream_pipelines_path, layer='NaturalGasPipelines')
enverus_pipelines_gdf = enverus_pipelines_gdf.to_crs(epsg=4326) # Ensure enverus_pipelines_gdf is in EPSG:4326

#drop unneeded pipelines and columns
# enverus_pipelines_gdf = enverus_pipelines_gdf[enverus_pipelines_gdf['CNTRY_NAME']=='United States'] #dropped, because not used in v2 and state intersection will filter this.
enverus_pipelines_gdf = enverus_pipelines_gdf[enverus_pipelines_gdf['STATUS']=='Operational']
enverus_pipelines_gdf = enverus_pipelines_gdf[enverus_pipelines_gdf['TYPE'] =='Transmission']
enverus_pipelines_gdf = enverus_pipelines_gdf[['LOC_ID', 'DIAMETER', 'geometry']].reset_index(drop=True) # 'INSTALL_YR' omitted; it is mostly NaN

# Split pipelines at state boundaries and keep only the segments that fall within states
enverus_pipelines_gdf_split_by_state = gpd.overlay(enverus_pipelines_gdf, state_shapes, how='intersection', keep_geom_type=True)

# create each year's proxy, then combine into the full proxy table
# NOTE: this assumes pipeline geometries do not change year to year, since we don't have good data for when each pipeline began operation.
year_gdfs = []
for year in range(min_year, max_year+1):
year_gdf = enverus_pipelines_gdf_split_by_state.copy()
# year_gdf = year_gdf[year_gdf['INSTALL_YR'] > year] # not applicable since most INSTALL_YR are NaN
year_gdf['year'] = year
year_gdfs.append(year_gdf)
proxy_gdf = pd.concat(year_gdfs, ignore_index=True)

# compute the length of each pipeline within each state
proxy_gdf = proxy_gdf.to_crs("ESRI:102003") # USA Contiguous Albers Equal Area; lengths in meters
proxy_gdf['pipeline_length_within_state'] = proxy_gdf.geometry.length
proxy_gdf = proxy_gdf.to_crs(epsg=4326) # back to EPSG:4326

# get rel_emi: assume relative emissions are proportional to pipeline length
proxy_gdf = proxy_gdf.rename(columns={'pipeline_length_within_state': 'rel_emi'})

###############################################################
# Normalize and save
###############################################################

# Normalize relative emissions to sum to 1 for each year and state
proxy_gdf = proxy_gdf.groupby(['state_code', 'year']).filter(lambda x: x['rel_emi'].sum() > 0) # drop state-years with zero total length
proxy_gdf['rel_emi'] = proxy_gdf.groupby(['year', 'state_code'])['rel_emi'].transform(lambda x: x / x.sum() if x.sum() > 0 else 0) # normalize to sum to 1
sums = proxy_gdf.groupby(["state_code", "year"])["rel_emi"].sum() # get sums to check normalization
assert np.isclose(sums, 1.0, atol=1e-8).all(), f"Relative emissions do not sum to 1 for each year and state; {sums}"

# Save to parquet
proxy_gdf.to_parquet(output_path)

return proxy_gdf

# proxy_gdf = get_trans_pipeline_proxy_data()

# #%%
# proxy_gdf.info()
# proxy_gdf