From 81119f4195ee4414211693358532c4e63d2fab98 Mon Sep 17 00:00:00 2001 From: Hannah Lohman <68960449+haclohman@users.noreply.github.com> Date: Tue, 15 Oct 2024 15:02:52 -0400 Subject: [PATCH 1/6] ng_compressor_stations_proxy includes gb_stations_proxy, storage_comp_station_proxy, trans_comp_station_proxy --- environment.yml | 3 +- .../task_ng_compressor_stations_proxy.py | 101 ++++++++++++++++++ 2 files changed, 103 insertions(+), 1 deletion(-) create mode 100644 gch4i/proxy_processing/task_ng_compressor_stations_proxy.py diff --git a/environment.yml b/environment.yml index ef183a9..3babe75 100644 --- a/environment.yml +++ b/environment.yml @@ -44,7 +44,8 @@ dependencies: - pyogrio # use - rioxarray - pip - - osgeo + - gdal + # - osgeo - pip: - -e . # this will install the gch4i package in your environment in dev mode \ No newline at end of file diff --git a/gch4i/proxy_processing/task_ng_compressor_stations_proxy.py b/gch4i/proxy_processing/task_ng_compressor_stations_proxy.py new file mode 100644 index 0000000..cc88038 --- /dev/null +++ b/gch4i/proxy_processing/task_ng_compressor_stations_proxy.py @@ -0,0 +1,101 @@ +# %% +from pathlib import Path +from typing import Annotated +from zipfile import ZipFile +import calendar +import datetime + +from pyarrow import parquet +import pandas as pd +import osgeo +import geopandas as gpd +import numpy as np +import seaborn as sns +from pytask import Product, task, mark + +from gch4i.config import ( + V3_DATA_PATH, + proxy_data_dir_path, + global_data_dir_path, + sector_data_dir_path, + max_year, + min_year, +) + +from gch4i.utils import us_state_to_abbrev + +# %% +@mark.persist +@task(id="ng_compressor_stations_proxy") +def task_get_ng_compressor_stations_proxy_data( + state_path: Path = global_data_dir_path / "tl_2020_us_state.zip", + enverus_midstream_ng_path: Path = sector_data_dir_path / "enverus/midstream/Rextag_Natural_Gas.gdb", + gb_stations_output_path: Annotated[Path, Product] = proxy_data_dir_path / "gb_stations_proxy.parquet", + storage_comp_station_output_path: Annotated[Path, Product] = proxy_data_dir_path / "storage_comp_station_proxy.parquet", + trans_comp_station_output_path: Annotated[Path, Product] = proxy_data_dir_path / "trans_comp_station_proxy.parquet", +): + """ + Creation of the following proxies using Enverus Midstream Rextag_Natural_Gas.gdb: + - gb_stations_proxy - gathering compressor stations (NG Production) + - storage_comp_station_proxy - storage compressor stations (NG Storage) + - trans_comp_station_proxy - transmission compressor stations (NG Transmission) + """ + + state_gdf = ( + gpd.read_file(state_path) + .loc[:, ["NAME", "STATEFP", "STUSPS", "geometry"]] + .rename(columns=str.lower) + .rename(columns={"stusps": "state_code", "name": "state_name"}) + .astype({"statefp": int}) + # get only lower 48 + DC + .query("(statefp < 60) & (statefp != 2) & (statefp != 15)") + .to_crs(4326) + ) + + # Enverus Midstream Natural Gas Compressor Stations + compressor_stations_gdf = (gpd.read_file( + enverus_midstream_ng_path, + layer="CompressorStations", + columns=["NAME", "TYPE", "STATUS", "STATE_NAME", "CNTRY_NAME", "geometry"]) + + .query("STATUS == 'Operational'") + .query("CNTRY_NAME == 'United States'") + .query("STATE_NAME.isin(@state_gdf['state_name'])") + .drop(columns=["STATUS", "CNTRY_NAME"]) + .rename(columns={"NAME": "facility_name", + "TYPE": "type", + "STATE_NAME": "state_name", + }) + .assign(state_code='NaN') + .to_crs(4326) + .reset_index(drop=True) + ) + + for istation in np.arange(0, 
len(compressor_stations_gdf)): + compressor_stations_gdf.loc[istation, "state_code"] = us_state_to_abbrev(compressor_stations_gdf.loc[istation, "state_name"]) + + # gb_stations_proxy + gb_stations_proxy_gdf = (compressor_stations_gdf + .query("type == 'Gathering'") + .drop(columns=["type", "state_name"]) + .loc[:, ["facility_name", "state_code", "geometry"]] + .reset_index(drop=True)) + gb_stations_proxy_gdf.to_parquet(gb_stations_output_path) + + # storage_comp_station_proxy + storage_comp_station_proxy_gdf = (compressor_stations_gdf + .query("type == 'Storage'") + .drop(columns=["type", "state_name"]) + .loc[:, ["facility_name", "state_code", "geometry"]] + .reset_index(drop=True)) + storage_comp_station_proxy_gdf.to_parquet(storage_comp_station_output_path) + + # trans_comp_station_proxy + trans_comp_station_proxy_gdf = (compressor_stations_gdf + .query("type == 'Transmission'") + .drop(columns=["type", "state_name"]) + .loc[:, ["facility_name", "state_code", "geometry"]] + .reset_index(drop=True)) + trans_comp_station_proxy_gdf.to_parquet(trans_comp_station_output_path) + + return None From 9463c68bb154112f06ca9eb211f41ca37c0f1180 Mon Sep 17 00:00:00 2001 From: Hannah Lohman <68960449+haclohman@users.noreply.github.com> Date: Fri, 25 Oct 2024 10:23:29 -0400 Subject: [PATCH 2/6] start ng_production_proxy start of the natural gas production proxy using enverus prism and di data --- environment.yml | 2 +- .../federal_gom_offshore_proxy.py | 170 +++++++++ .../task_ng_production_proxy.py | 325 ++++++++++++++++++ 3 files changed, 496 insertions(+), 1 deletion(-) create mode 100644 gch4i/proxy_processing/federal_gom_offshore_proxy.py create mode 100644 gch4i/proxy_processing/task_ng_production_proxy.py diff --git a/environment.yml b/environment.yml index 3babe75..16dfe86 100644 --- a/environment.yml +++ b/environment.yml @@ -22,7 +22,7 @@ dependencies: # - plotly # - pyjanitor - pylint - # - pyodbc # added by Nathan -- needed in Wastewater + - pyodbc # added by Nathan -- needed in Wastewater - pyprojroot # - pyshp # added by Hannah -- needed in Petroleum Systems - python-duckdb diff --git a/gch4i/proxy_processing/federal_gom_offshore_proxy.py b/gch4i/proxy_processing/federal_gom_offshore_proxy.py new file mode 100644 index 0000000..92d5b02 --- /dev/null +++ b/gch4i/proxy_processing/federal_gom_offshore_proxy.py @@ -0,0 +1,170 @@ +# %% +from pathlib import Path +import os +from typing import Annotated +from zipfile import ZipFile +import calendar +import datetime + +from pyarrow import parquet +import pandas as pd +import osgeo +import geopandas as gpd +import numpy as np +import seaborn as sns +from pytask import Product, task, mark +import pyodbc + +from gch4i.config import ( + V3_DATA_PATH, + proxy_data_dir_path, + global_data_dir_path, + sector_data_dir_path, + max_year, + min_year, +) + +from gch4i.utils import us_state_to_abbrev + +# %% +@mark.persist +@task(id="federal_gom_offshore_proxy") +def task_get_federal_gom_offshore_proxy_data( + state_path: Path = global_data_dir_path / "tl_2020_us_state.zip", + boem_data_directory_path: Path = sector_data_dir_path / "boem", + ng_output_path: Annotated[Path, Product] = proxy_data_dir_path + / "federal_gom_offshore_proxy.parquet", + oil_output_path: Annotated[Path, Product] = proxy_data_dir_path + / "oil_gom_fed_proxy.parquet", +): + """ + # TODO: + """ + + state_gdf = ( + gpd.read_file(state_path) + .loc[:, ["NAME", "STATEFP", "STUSPS", "geometry"]] + .rename(columns=str.lower) + .rename(columns={"stusps": "state_code", "name": "state_name"}) + 
.astype({"statefp": int}) + # get only lower 48 + DC + .query("(statefp < 60) & (statefp != 2) & (statefp != 15)") + .to_crs(4326) + ) + + # get and format boem gom data for 2011, 2014, 2017, and 2021 + # NOTE: 2011 has tblPointER and tblPointEM but the rest of the years have one single table of data + gom_df = {} + gom_data_years = ['2011', '2014', '2017', '2021'] + for idatayear in gom_data_years: + gom_file_name = f"{idatayear}_Gulfwide_Platform_Inventory.accdb" + gom_file_path = os.path.join(boem_data_directory_path, gom_file_name) + driver_str = r'Driver={Microsoft Access Driver (*.mdb, *.accdb)};DBQ='+gom_file_path+';''' + conn = pyodbc.connect(driver_str) + GOADS_locations = pd.read_sql("SELECT * FROM tblPointER", conn) + GOADS_emissions = pd.read_sql("SELECT * FROM tblPointEM", conn) + conn.close() + + # Format Location Data + GOADS_locations = GOADS_locations[["strStateFacilityIdentifier","strEmissionReleasePointID","dblXCoordinate","dblYCoordinate"]] + #Create platform-by-platform file + GOADS_locations_Unique = pd.DataFrame({'strStateFacilityIdentifier':GOADS_locations['strStateFacilityIdentifier'].unique()}) + GOADS_locations_Unique['lon'] = 0.0 + GOADS_locations_Unique['lat'] = 0.0 + GOADS_locations_Unique['strEmissionReleasePointID'] = '' + + for iplatform in np.arange(len(GOADS_locations_Unique)): + match_platform = np.where(GOADS_locations['strStateFacilityIdentifier'] == GOADS_locations_Unique['strStateFacilityIdentifier'][iplatform])[0][0] + GOADS_locations_Unique.loc[iplatform,'lon',] = GOADS_locations['dblXCoordinate'][match_platform] + GOADS_locations_Unique.loc[iplatform,'lat',] = GOADS_locations['dblYCoordinate'][match_platform] + GOADS_locations_Unique.loc[iplatform,'strEmissionReleasePointID'] = GOADS_locations['strEmissionReleasePointID'][match_platform][:3] + + GOADS_locations_Unique.reset_index(inplace=True, drop=True) + #display(GOADS_locations_Unique) + + #print(GOADS_emissions.columns) + #Format Emissions Data (clean lease data string) + GOADS_emissions = GOADS_emissions[["strStateFacilityIdentifier","strPollutantCode","dblEmissionNumericValue","BOEM-MONTH", + "BOEM-LEASE_NUM","BOEM-COMPLEX_ID"]] + GOADS_emissions['BOEM-LEASE_NUM'] = GOADS_emissions['BOEM-LEASE_NUM'].str.replace('OCS','') + GOADS_emissions['BOEM-LEASE_NUM'] = GOADS_emissions['BOEM-LEASE_NUM'].str.replace('-','') + GOADS_emissions['BOEM-LEASE_NUM'] = GOADS_emissions['BOEM-LEASE_NUM'].str.replace(' ','') + GOADS_emissions['BOEM-LEASE_NUM'] = GOADS_emissions['BOEM-LEASE_NUM'].str.replace('G1477','G01477') + GOADS_emissions['BOEM-LEASE_NUM'] = GOADS_emissions['BOEM-LEASE_NUM'].str.replace('G73','00073') + GOADS_emissions['BOEM-LEASE_NUM'] = GOADS_emissions['BOEM-LEASE_NUM'].str.replace('G605','00605') + GOADS_emissions['BOEM-LEASE_NUM'] = GOADS_emissions['BOEM-LEASE_NUM'].str.replace('G72','00072') + GOADS_emissions['BOEM-LEASE_NUM'] = GOADS_emissions['BOEM-LEASE_NUM'].str.replace('G599','00599') + GOADS_emissions['BOEM-LEASE_NUM'] = GOADS_emissions['BOEM-LEASE_NUM'].str.replace('G7155','G07155') + GOADS_emissions['BOEM-LEASE_NUM'] = GOADS_emissions['BOEM-LEASE_NUM'].str.replace('G2357','G02357') + GOADS_emissions['BOEM-LEASE_NUM'] = GOADS_emissions['BOEM-LEASE_NUM'].str.replace('G4921','G04921') + GOADS_emissions['Emis_tg'] = 0.0 + GOADS_emissions['Emis_tg'] = 9.0718474E-7 * GOADS_emissions['dblEmissionNumericValue'] #convert short tons to Tg + GOADS_emissions = GOADS_emissions[GOADS_emissions['strPollutantCode'] == 'CH4'] + GOADS_emissions.reset_index(inplace=True, drop=True) + + 
#display(GOADS_emissions) + + # Use ERG Preprocessed data to determine if major or minor and oil or gas + ERG_complex_crosswalk = pd.read_excel(ERG_GOADSEmissions_inputfile, sheet_name = "Complex Emissions by Source", usecols = "AJ:AM", nrows = 11143) + + # add data to map array, for the closest year to 2011 + year_diff = [abs(x - 2011) for x in year_range] + iyear = year_diff.index(min(year_diff)) + + #assign oil vs gas by lease/complex ID + GOADS_emissions['LEASE_TYPE'] ='' + GOADS_emissions['MAJOR_STRUC'] ='' + for istruc in np.arange(0,len(GOADS_emissions)): + imatch = np.where(np.logical_and(ERG_complex_crosswalk['BOEM COMPLEX ID.2']==int(GOADS_emissions['BOEM-COMPLEX_ID'][istruc]),\ + ERG_complex_crosswalk['Year.2'] == 2011)) + if np.size(imatch) >0: + imatch = imatch[0][0] + GOADS_emissions.loc[istruc,'LEASE_TYPE'] = ERG_complex_crosswalk['Oil Gas Defn FINAL.1'][imatch] + GOADS_emissions.loc[istruc,'MAJOR_STRUC'] = ERG_complex_crosswalk['Major / Minor.1'][imatch] + else: + print(istruc, GOADS_emissions['BOEM-COMPLEX_ID'][istruc]) + + # for all gas platforms, match the platform to the emissions + if GOADS_emissions['LEASE_TYPE'][istruc] =='Gas': + match_platform = np.where(GOADS_locations_Unique.strStateFacilityIdentifier==GOADS_emissions['strStateFacilityIdentifier'][istruc])[0][0] + ilat = int((GOADS_locations_Unique['lat'][match_platform] - Lat_low)/Res01) + ilon = int((GOADS_locations_Unique['lon'][match_platform] - Lon_left)/Res01) + imonth = GOADS_emissions['BOEM-MONTH'][istruc]-1 #dict is 1-12, not 0-11 + if GOADS_emissions['MAJOR_STRUC'][istruc] =='Major': + Map_GOADSmajor_emissions[ilat,ilon,iyear,imonth] += GOADS_emissions['Emis_tg'][istruc] + else: + Map_GOADSminor_emissions[ilat,ilon,iyear,imonth] += GOADS_emissions['Emis_tg'][istruc] + + + # sum complexes and emissions for diagnostic + majcplx = GOADS_emissions[(GOADS_emissions['MAJOR_STRUC']=='Major')] + majcplx = majcplx[majcplx['LEASE_TYPE'] =='Gas'] + num_majcplx = majcplx['BOEM-COMPLEX_ID'].unique() + #print(np.shape(num_majcplx)) + mincplx = GOADS_emissions[GOADS_emissions['MAJOR_STRUC']=='Minor'] + mincplx = mincplx[mincplx['LEASE_TYPE'] =='Gas'] + num_mincplx = mincplx['BOEM-COMPLEX_ID'].unique() + #print(np.size(num_mincplx)) + del GOADS_emissions + print('Number of Major Gas Complexes: ',(np.size(num_majcplx))) + print('Emissions (Tg): ',np.sum(Map_GOADSmajor_emissions[:,:,iyear,:])) + print('Number of Minor Gas Complexes: ',(np.size(num_mincplx))) + print('Emissions (Tg): ',np.sum(Map_GOADSminor_emissions[:,:,iyear,:])) + + + # Create proxy gdf + proxy_gdf = ( + gpd.GeoDataFrame( + gb_stations_df, + geometry=gpd.points_from_xy( + gb_stations_df["lon"], + gb_stations_df["lat"], + crs=4326, + ), + ) + .drop(columns=["lat", "lon"]) + .loc[:, ["facility_name", "state_code", "geometry"]] + ) + + proxy_gdf.to_parquet(output_path) + return None diff --git a/gch4i/proxy_processing/task_ng_production_proxy.py b/gch4i/proxy_processing/task_ng_production_proxy.py new file mode 100644 index 0000000..634bbff --- /dev/null +++ b/gch4i/proxy_processing/task_ng_production_proxy.py @@ -0,0 +1,325 @@ +# %% +from pathlib import Path +import os +from typing import Annotated +from zipfile import ZipFile +import calendar +import datetime + +from pyarrow import parquet +import pandas as pd +import osgeo +import geopandas as gpd +import numpy as np +import seaborn as sns +from pytask import Product, task, mark + +from gch4i.config import ( + V3_DATA_PATH, + proxy_data_dir_path, + global_data_dir_path, + sector_data_dir_path, + 
max_year,
+    min_year,
+    years,
+)
+
+from gch4i.utils import us_state_to_abbrev
+
+# %%
+@mark.persist
+@task(id="ng_production_proxy")
+def task_get_ng_production_proxy_data(
+    state_path: Path = global_data_dir_path / "tl_2020_us_state.zip",
+    nems_region_dict_path: Path = sector_data_dir_path / "enverus/NEMS_Region_Dictionary.xlsx",
+    enverus_production_path: Path = sector_data_dir_path / "enverus/production",
+    enverus_well_counts_path: Path = sector_data_dir_path / "enverus/production/temp_data_v2/Enverus DrillingInfo Processing - Well Counts_2021-03-17.xlsx",
+    output_path: Annotated[Path, Product] = proxy_data_dir_path / "gb_stations_proxy.parquet",
+    ):
+    """
+    Data come from Enverus, both Drilling Info and Prism
+    The reason 2 datasets are used is because Prism does not include all states
+    So remaining states, or those with more DI coverage, are taken from DI
+
+    DI: KS, MD, MI, MO, OK, TN
+
+    Prism: AK, AL, AR, AZ, CA, CAO (California Offshore), CO, FL, KY, LA, MS, MT, ND,
+    NE, NGOM (federal offshore waters in the Gulf of Mexico), NM, NV, NY, OH, OR, PA,
+    SD, TX, UT, VA, WV, WY
+
+    States with no Enverus Data: CT, DE, DC, GA, HI, ID, IL*, IN*, IA, ME, MA, MN, NH,
+    NJ, NC, RI, SC, VT, WA, WI, US territories. These states are assumed to have no oil
+    and gas production, with an exception for IL and IN.
+
+    *IL and IN do not report to Enverus, but do have oil and gas production. Production
+    data is taken from the Energy Information Administration (EIA).
+
+    TODO: Update enverus_well_counts_path with v3 data (currently using v2 data)
+    """
+
+    # STEP 1: Load in State ANSI data and NEMS definitions
+
+    state_gdf = (
+        gpd.read_file(state_path)
+        .loc[:, ["NAME", "STATEFP", "STUSPS", "geometry"]]
+        .rename(columns=str.lower)
+        .rename(columns={"stusps": "state_code", "name": "state_name"})
+        .astype({"statefp": int})
+        # get only lower 48 + DC
+        .query("(statefp < 60) & (statefp != 2) & (statefp != 15)")
+        .reset_index(drop=True)
+        .to_crs(4326)
+    )
+
+    # Make NEMS State classifications
+    # Treat NM and TX separately since these states cover multiple NEMS regions
+
+    # 0 = NE, 1 = MC, 2 = RM, 3 = SW, 4 = WC, 5 = GC, 6 = offshore
+    NEMS_State = pd.read_excel(nems_region_dict_path)
+    NEMS_State = NEMS_State.fillna(0)
+    NM_idx = NEMS_State.index[NEMS_State['State_Name'].str.contains('New Mexico')].tolist()
+    TX_idx = NEMS_State.index[NEMS_State['State_Name'].str.contains('Texas')].tolist()
+    idx = NM_idx + TX_idx
+    NEMS_State = NEMS_State.drop(NEMS_State.index[idx])
+    NEMS_State.reset_index(drop=True, inplace=True)
+
+    NEMS_dict = {'North East': 0, 'Midcontinent': 1, 'Rocky Mountain': 2, 'South West': 3, 'West Coast': 4, 'Gulf Coast': 5}
+
+    # STEP 2: Read-in and Format Proxy Data
+
+    # STEP 2.1: State Condensate Data
+
+    # TODO: state condensate data code
+
+    # STEP 2.2: GOADS Emissions Data
+
+    # TODO: GOADS emissions data code
+
+    # STEP 2.3: Well and Production Data (from Enverus)
+
+    # STEP 2.3.1: Read In & Combine Each Year of Prism & DI Monthly Data (from Enverus)
+
+    # Data come from Enverus, both Drilling Info and Prism
+    # The reason 2 datasets are used is because Prism does not include all states
+    # So remaining states, or those with more DI coverage, are taken from DI
+
+    # Read In and Format the Prism and DI data
+    # 1. Read Data
+    # 2. Drop unused columns, rename columns to match between DI and Prism
+    # 3. Combine DI and Prism into one data array
+    # 4. Calculate annual cumulative production totals
+    # 5. 
Save the data as a year-specific variable + + # Based on ERGs logic, active wells are determined based on their production levels and not producing status + Enverus_data_dict = {} + for iyear in years: + #DI data + DI_file_name = f"didsk_monthly_{iyear}.csv" + DI_file_path = os.path.join(enverus_production_path, DI_file_name) + DI_data = (pd.read_csv( + DI_file_path, + usecols=['WELL_COUNT_ID','STATE','COUNTY','BASIN','AAPG_CODE_ERG', + 'NEMS_REGION_ERG','LATITUDE','LONGITUDE','STATUS','COMPDATE', + 'SPUDDATE','FIRSTPRODDATE','HF','OFFSHORE','GOR', + 'GOR_QUAL','PROD_FLAG','PRODYEAR', + 'LIQ_01','GAS_01','WTR_01','LIQ_02','GAS_02','WTR_02', + 'LIQ_03','GAS_03','WTR_03','LIQ_04','GAS_04','WTR_04', + 'LIQ_05','GAS_05','WTR_05','LIQ_06','GAS_06','WTR_06', + 'LIQ_07','GAS_07','WTR_07','LIQ_08','GAS_08','WTR_08', + 'LIQ_09','GAS_09','WTR_09','LIQ_10','GAS_10','WTR_10', + 'LIQ_11','GAS_11','WTR_11','LIQ_12','GAS_12','WTR_12',], + dtype={7:'str'}) + .rename(columns={'WELL_COUNT_ID':'WELL_COUNT','STATE':'STATE_CODE', + 'NEMS_REGION_ERG':'NEMS_REGION', 'STATUS':'PRODUCING_STATUS', + 'LIQ_01':'OILPROD_01','GAS_01':'GASPROD_01','WTR_01':'WATERPROD_01', + 'LIQ_02':'OILPROD_02','GAS_02':'GASPROD_02','WTR_02':'WATERPROD_02', + 'LIQ_03':'OILPROD_03','GAS_03':'GASPROD_03','WTR_03':'WATERPROD_03', + 'LIQ_04':'OILPROD_04','GAS_04':'GASPROD_04','WTR_04':'WATERPROD_04', + 'LIQ_05':'OILPROD_05','GAS_05':'GASPROD_05','WTR_05':'WATERPROD_05', + 'LIQ_06':'OILPROD_06','GAS_06':'GASPROD_06','WTR_06':'WATERPROD_06', + 'LIQ_07':'OILPROD_07','GAS_07':'GASPROD_07','WTR_07':'WATERPROD_07', + 'LIQ_08':'OILPROD_08','GAS_08':'GASPROD_08','WTR_08':'WATERPROD_08', + 'LIQ_09':'OILPROD_09','GAS_09':'GASPROD_09','WTR_09':'WATERPROD_09', + 'LIQ_10':'OILPROD_10','GAS_10':'GASPROD_10','WTR_10':'WATERPROD_10', + 'LIQ_11':'OILPROD_11','GAS_11':'GASPROD_11','WTR_11':'WATERPROD_11', + 'LIQ_12':'OILPROD_12','GAS_12':'GASPROD_12','WTR_12':'WATERPROD_12',}) + .assign(WELL_COUNT=1) + ) + + # Prism Data + Prism_file_name = f"prism_monthly_{iyear}.csv" + Prism_file_path = os.path.join(enverus_production_path, Prism_file_name) + Prism_data = (pd.read_csv( + Prism_file_path, + usecols=['STATE','COUNTY','ENVBASIN','AAPG_CODE_ERG', + 'NEMS_REGION_ERG','LATITUDE','LONGITUDE','ENVWELLSTATUS','COMPLETIONDATE', + 'SPUDDATE','FIRSTPRODDATE','HF','OFFSHORE','GOR', + 'GOR_QUAL','PROD_FLAG','PRODYEAR', + 'LIQUIDSPROD_BBL_01','GASPROD_MCF_01','WATERPROD_BBL_01', + 'LIQUIDSPROD_BBL_02','GASPROD_MCF_02','WATERPROD_BBL_02', + 'LIQUIDSPROD_BBL_03','GASPROD_MCF_03','WATERPROD_BBL_03', + 'LIQUIDSPROD_BBL_04','GASPROD_MCF_04','WATERPROD_BBL_04', + 'LIQUIDSPROD_BBL_05','GASPROD_MCF_05','WATERPROD_BBL_05', + 'LIQUIDSPROD_BBL_06','GASPROD_MCF_06','WATERPROD_BBL_06', + 'LIQUIDSPROD_BBL_07','GASPROD_MCF_07','WATERPROD_BBL_07', + 'LIQUIDSPROD_BBL_08','GASPROD_MCF_08','WATERPROD_BBL_08', + 'LIQUIDSPROD_BBL_09','GASPROD_MCF_09','WATERPROD_BBL_09', + 'LIQUIDSPROD_BBL_10','GASPROD_MCF_10','WATERPROD_BBL_10', + 'LIQUIDSPROD_BBL_11','GASPROD_MCF_11','WATERPROD_BBL_11', + 'LIQUIDSPROD_BBL_12','GASPROD_MCF_12','WATERPROD_BBL_12',], + dtype={7:'str'}) + .rename(columns={'STATE':'STATE_CODE', 'ENVBASIN':'BASIN', + 'NEMS_REGION_ERG':'NEMS_REGION', 'ENVWELLSTATUS':'PRODUCING_STATUS', + 'COMPLETIONDATE':'COMPDATE', + 'LIQUIDSPROD_BBL_01':'OILPROD_01','GASPROD_MCF_01':'GASPROD_01','WATERPROD_BBL_01':'WATERPROD_01', + 'LIQUIDSPROD_BBL_02':'OILPROD_02','GASPROD_MCF_02':'GASPROD_02','WATERPROD_BBL_02':'WATERPROD_02', + 
'LIQUIDSPROD_BBL_03':'OILPROD_03','GASPROD_MCF_03':'GASPROD_03','WATERPROD_BBL_03':'WATERPROD_03', + 'LIQUIDSPROD_BBL_04':'OILPROD_04','GASPROD_MCF_04':'GASPROD_04','WATERPROD_BBL_04':'WATERPROD_04', + 'LIQUIDSPROD_BBL_05':'OILPROD_05','GASPROD_MCF_05':'GASPROD_05','WATERPROD_BBL_05':'WATERPROD_05', + 'LIQUIDSPROD_BBL_06':'OILPROD_06','GASPROD_MCF_06':'GASPROD_06','WATERPROD_BBL_06':'WATERPROD_06', + 'LIQUIDSPROD_BBL_07':'OILPROD_07','GASPROD_MCF_07':'GASPROD_07','WATERPROD_BBL_07':'WATERPROD_07', + 'LIQUIDSPROD_BBL_08':'OILPROD_08','GASPROD_MCF_08':'GASPROD_08','WATERPROD_BBL_08':'WATERPROD_08', + 'LIQUIDSPROD_BBL_09':'OILPROD_09','GASPROD_MCF_09':'GASPROD_09','WATERPROD_BBL_09':'WATERPROD_09', + 'LIQUIDSPROD_BBL_10':'OILPROD_10','GASPROD_MCF_10':'GASPROD_10','WATERPROD_BBL_10':'WATERPROD_10', + 'LIQUIDSPROD_BBL_11':'OILPROD_11','GASPROD_MCF_11':'GASPROD_11','WATERPROD_BBL_11':'WATERPROD_11', + 'LIQUIDSPROD_BBL_12':'OILPROD_12','GASPROD_MCF_12':'GASPROD_12','WATERPROD_BBL_12':'WATERPROD_12',}) + .assign(WELL_COUNT=1) + ) + + # Combine into one array with common column names, replace nans with zeros, and sum annual production + Enverus_data = pd.concat([DI_data, Prism_data], ignore_index=True) + Enverus_data.loc[:, Enverus_data.columns.str.contains('GASPROD_')] = Enverus_data.loc[:, Enverus_data.columns.str.contains('GASPROD_')].fillna(0) + Enverus_data.loc[:, Enverus_data.columns.str.contains('OILPROD_')] = Enverus_data.loc[:, Enverus_data.columns.str.contains('OILPROD_')].fillna(0) + Enverus_data.loc[:, Enverus_data.columns.str.contains('WATERPROD_')] = Enverus_data.loc[:, Enverus_data.columns.str.contains('WATERPROD_')].fillna(0) + + # Calculate cummulative annual production totals for Gas, Oil, Water + Enverus_data['CUM_GAS'] = Enverus_data.loc[:,Enverus_data.columns.str.contains('GASPROD_')].sum(1) + Enverus_data['CUM_OIL'] = Enverus_data.loc[:,Enverus_data.columns.str.contains('OILPROD_')].sum(1) + Enverus_data['CUM_WATER'] = Enverus_data.loc[:,Enverus_data.columns.str.contains('WATERPROD_')].sum(1) + + Enverus_data['NEMS_CODE'] = Enverus_data['NEMS_REGION'].map(NEMS_dict) + + # Save out the data for that year + Enverus_data_dict[f'{iyear}'] = Enverus_data + + del Prism_data + del DI_data #save memory space + + #define default values for a new row in this table (to be used later during data corrections) + default = {'WELL_COUNT': 0, 'STATE_CODE':'','COUNTY':'','NEMS_REGION':'UNK', + 'AAPG_CODE_ERG':'UNK','LATITUDE':0,'LONGITUDE':0, + 'PRODUCING_STATUS':'','BASIN':'','SPUDDATE':'','COMPDATE':'', + 'FIRSTPRODDATE':'','HF':'', 'OFFSHORE':'','GOR':-99, + 'GOR_QUAL':'','PROD_FLAG':'','PRODYEAR':'', + 'OILPROD_01':0, 'GASPROD_01':0, 'WATERPROD_01':0,'OILPROD_02':0, 'GASPROD_02':0, 'WATERPROD_02':0, + 'OILPROD_03':0, 'GASPROD_03':0, 'WATERPROD_03':0,'OILPROD_04':0, 'GASPROD_04':0, 'WATERPROD_04':0,\ + 'OILPROD_05':0, 'GASPROD_05':0, 'WATERPROD_05':0,'OILPROD_06':0, 'GASPROD_06':0, 'WATERPROD_06':0,\ + 'OILPROD_07':0, 'GASPROD_07':0, 'WATERPROD_07':0,'OILPROD_08':0, 'GASPROD_08':0, 'WATERPROD_08':0,\ + 'OILPROD_09':0, 'GASPROD_09':0, 'WATERPROD_09':0,'OILPROD_10':0, 'GASPROD_10':0, 'WATERPROD_10':0,\ + 'OILPROD_11':0, 'GASPROD_11':0, 'WATERPROD_11':0,'OILPROD_12':0, 'GASPROD_12':0, 'WATERPROD_12':0, + 'CUM_GAS':0, 'CUM_OIL':0, 'CUM_WATER':0, 'NEMS_CODE':99} + + # Correct the NEMS Code for missing NEMS_REGIONS + # Note OFFSHORE regions will have NaN as NEMS_Code + for iyear in years: + enverus_data_temp = Enverus_data_dict[f'{iyear}'] + list_well = 
enverus_data_temp.index[pd.isna(enverus_data_temp.loc[:, 'NEMS_REGION'])].tolist()
+        if np.size(list_well) > 0:
+            for irow in list_well:
+                match_state = np.where(NEMS_State['State_Code'] == enverus_data_temp['STATE_CODE'][irow])[0][0]
+                enverus_data_temp.loc[irow, 'NEMS_CODE'] = NEMS_State['NEMS'][match_state].astype(int)
+        Enverus_data_dict[f'{iyear}'] = enverus_data_temp.copy()
+
+    # STEP 2.3.2: Correct Enverus Data for Select States
+
+    # 1) Read In Coverage Table from State Well Counts File from ERG
+    # (specifies the first year with bad data and which years need to be corrected;
+    # all years including and after the first bad year of data need to be corrected)
+
+    ERG_StateWellCounts_LastGoodDataYear = (pd.read_excel(
+        enverus_well_counts_path,
+        sheet_name="2021 - Coverage",
+        usecols={"State", "Last Good Year"},
+        skiprows=2,
+        nrows=40)
+    )
+
+    # 2) Loop through each state and year in Enverus to determine if the data for that
+    # particular year needs to be corrected. At the moment, the only correction ERG
+    # makes to the data is to use the prior year of data if there is no new Enverus
+    # data reported for that state. If a particular state is not included for any
+    # years in the Enverus dataset, then a row of zeros is added to the Enverus table
+    # for that year.
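+    # NOTE (illustrative sketch, not part of the task; `last_good` is a
+    # hypothetical name): the nested loop below is a carry-forward correction.
+    # For a single state whose last good year is `last_good`, the same result
+    # could be written as:
+    #
+    #     good_rows = Enverus_data_dict[f'{last_good}'].query(
+    #         "STATE_CODE == @istate_code")
+    #     for yr in (y for y in years if y > last_good):
+    #         frame = Enverus_data_dict[f'{yr}']
+    #         Enverus_data_dict[f'{yr}'] = pd.concat(
+    #             [frame.query("STATE_CODE != @istate_code"), good_rows],
+    #             ignore_index=True)
+    #
+    # The explicit loop is kept because it also reports states that have no
+    # Enverus data at all.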
+    for istate in np.arange(0, len(state_gdf)):
+        correctdata = 0
+        istate_code = state_gdf['state_code'][istate]
+        lastgoodyear = ERG_StateWellCounts_LastGoodDataYear['Last Good Year'][ERG_StateWellCounts_LastGoodDataYear['State'] == istate_code].values
+        if lastgoodyear == max_year:
+            lastgoodyear = max_year + 5  # if state isn't included in correction list, don't correct any data
+
+        for iyear in years:
+            enverus_data_temp = Enverus_data_dict[f'{iyear}'].copy()
+            state_list = np.unique(enverus_data_temp['STATE_CODE'])
+            if istate_code in state_list:
+                inlist = 1
+            else:
+                inlist = 0
+            if inlist == 1 or correctdata == 1:  # if the state is included in Enverus data, or had data for at least one good year
+                # if first year, correctdata will be zero, but inlist will also be zero if no Enverus data
+                # check to see whether corrections are necessary for the given year/state
+                if iyear == (lastgoodyear):
+                    print(istate_code, iyear, 'last good year')
+                    # This is the last year of good data. Do not correct the data,
+                    # but save it so that it can be used for all following years for that state
+                    temp_data = enverus_data_temp[enverus_data_temp['STATE_CODE'] == istate_code]
+                    correctdata = 1
+                elif iyear > lastgoodyear:
+                    print(istate_code, iyear)
+                    # correct data for all years equal to and after the first bad year (remove old data first if necessary)
+                    if inlist == 1:
+                        enverus_data_temp = enverus_data_temp[enverus_data_temp['STATE_CODE'] != istate_code]
+                    enverus_data_temp = pd.concat([enverus_data_temp, temp_data], ignore_index=True)
+                    print(istate_code + ' data for ' + str(iyear) + ' were corrected with ' + str(lastgoodyear) + ' data')
+                else:
+                    # no data corrections if the current year is before the first bad year
+                    no_corrections = 1
+
+            if inlist == 0 and correctdata == 0:
+                # if there is no Enverus data for a given state, and there was no good data, add a row with default values
+                # temp_row = {'STATE': istate_code}
+                # enverus_data_temp = enverus_data_temp.append({**default, **temp_row}, ignore_index=True)
+                print(istate_code + ' has no Enverus data in the year ' + str(iyear))
+
+            # resave that year of Enverus data
+            enverus_data_temp.reset_index(drop=True, inplace=True)
+            Enverus_data_dict[f'{iyear}'] = enverus_data_temp.copy()
+
+    # STEP 2.4: Calculate Fractional Monthly Condensate Arrays
+    # (EIA condensate production (bbl) relative to producing Enverus gas wells by month
+    # in each state and region)
+
+    # TODO: fractional monthly condensate array code
+
+    # STEP 2.5: Convert Enverus Well Production Arrays and Condensate Array into Gridded
+    # Location Arrays
+
+    # clear variables
+    # del ERG_StateWellCounts_FirstBadDataYear
+    # del Prism_data
+    # del colnames
+    # del names
+    # del state_condensates
+    # del temp_data
+
+    # Make Annual gridded arrays (maps) of well data (a well will be counted every month if there is any production that year)
+    # Includes NA Gas Wells and Production onshore in the CONUS region
+    # source emissions are related to the presence of a well and its production status (no emission if no production)
+    # Details: ERG does not include a well in the national count if there is no (cumulative) oil or gas production from that well.
+    # Wells are not considered active for a given year if there is no production data that year
+    # This may cause wells that are completed but not yet producing to be dropped from the national count.
+    # ERG has developed their own logic to determine if a well is an HF well or not and that result is included in the
+    # HF variable in this dataset. 
This method does not rely on the Enverus well 'Producing Status' + # Well Type (e.g., non-associated gas well) is determined based on annual production GOR at that well (CUM OIL/ CUM GAS), + # but the prsence of a well will only be included in maps in months where monthly gas prod > From d092b2351f2746293c2b2048313239146814a038 Mon Sep 17 00:00:00 2001 From: Hannah Lohman <68960449+haclohman@users.noreply.github.com> Date: Mon, 2 Dec 2024 19:30:41 -0500 Subject: [PATCH 3/6] natural gas proxies --- .../federal_gom_offshore_proxy.py | 105 ++- .../task_ng_compressor_stations_proxy.py | 6 +- .../task_ng_production_proxy.py | 688 +++++++++++++++++- 3 files changed, 792 insertions(+), 7 deletions(-) diff --git a/gch4i/proxy_processing/federal_gom_offshore_proxy.py b/gch4i/proxy_processing/federal_gom_offshore_proxy.py index 92d5b02..3eb351d 100644 --- a/gch4i/proxy_processing/federal_gom_offshore_proxy.py +++ b/gch4i/proxy_processing/federal_gom_offshore_proxy.py @@ -49,12 +49,115 @@ def task_get_federal_gom_offshore_proxy_data( .astype({"statefp": int}) # get only lower 48 + DC .query("(statefp < 60) & (statefp != 2) & (statefp != 15)") + .reset_index(drop=True) .to_crs(4326) ) # get and format boem gom data for 2011, 2014, 2017, and 2021 # NOTE: 2011 has tblPointER and tblPointEM but the rest of the years have one single table of data - gom_df = {} + gom_df = pd.DataFrame() + + # 2011 GOADS Data + + # Read In and Format 2011 BEOM Data + gom_file_name = f"2011_Gulfwide_Platform_Inventory.accdb" + gom_file_path = os.path.join(boem_data_directory_path, gom_file_name) + driver_str = r'Driver={Microsoft Access Driver (*.mdb, *.accdb)};DBQ='+gom_file_path+';''' + conn = pyodbc.connect(driver_str) + GOADS_locations = pd.read_sql("SELECT * FROM tblPointER", conn) + GOADS_emissions = pd.read_sql("SELECT * FROM tblPointEM", conn) + conn.close() + + # Format Location Data + GOADS_locations = GOADS_locations[["strStateFacilityIdentifier","strEmissionReleasePointID","dblXCoordinate","dblYCoordinate"]] + #Create platform-by-platform file + GOADS_locations_Unique = pd.DataFrame({'strStateFacilityIdentifier':GOADS_locations['strStateFacilityIdentifier'].unique()}) + GOADS_locations_Unique['lon'] = 0.0 + GOADS_locations_Unique['lat'] = 0.0 + GOADS_locations_Unique['strEmissionReleasePointID'] = '' + + for iplatform in np.arange(len(GOADS_locations_Unique)): + match_platform = np.where(GOADS_locations['strStateFacilityIdentifier'] == GOADS_locations_Unique['strStateFacilityIdentifier'][iplatform])[0][0] + GOADS_locations_Unique.loc[iplatform,'lon',] = GOADS_locations['dblXCoordinate'][match_platform] + GOADS_locations_Unique.loc[iplatform,'lat',] = GOADS_locations['dblYCoordinate'][match_platform] + GOADS_locations_Unique.loc[iplatform,'strEmissionReleasePointID'] = GOADS_locations['strEmissionReleasePointID'][match_platform][:3] + + GOADS_locations_Unique.reset_index(inplace=True, drop=True) + #display(GOADS_locations_Unique) + + #print(GOADS_emissions.columns) + #Format Emissions Data (clean lease data string) + GOADS_emissions = GOADS_emissions[["strStateFacilityIdentifier","strPollutantCode","dblEmissionNumericValue","BOEM-MONTH", + "BOEM-LEASE_NUM","BOEM-COMPLEX_ID"]] + GOADS_emissions['BOEM-LEASE_NUM'] = GOADS_emissions['BOEM-LEASE_NUM'].str.replace('OCS','') + GOADS_emissions['BOEM-LEASE_NUM'] = GOADS_emissions['BOEM-LEASE_NUM'].str.replace('-','') + GOADS_emissions['BOEM-LEASE_NUM'] = GOADS_emissions['BOEM-LEASE_NUM'].str.replace(' ','') + GOADS_emissions['BOEM-LEASE_NUM'] = 
GOADS_emissions['BOEM-LEASE_NUM'].str.replace('G1477','G01477') + GOADS_emissions['BOEM-LEASE_NUM'] = GOADS_emissions['BOEM-LEASE_NUM'].str.replace('G73','00073') + GOADS_emissions['BOEM-LEASE_NUM'] = GOADS_emissions['BOEM-LEASE_NUM'].str.replace('G605','00605') + GOADS_emissions['BOEM-LEASE_NUM'] = GOADS_emissions['BOEM-LEASE_NUM'].str.replace('G72','00072') + GOADS_emissions['BOEM-LEASE_NUM'] = GOADS_emissions['BOEM-LEASE_NUM'].str.replace('G599','00599') + GOADS_emissions['BOEM-LEASE_NUM'] = GOADS_emissions['BOEM-LEASE_NUM'].str.replace('G7155','G07155') + GOADS_emissions['BOEM-LEASE_NUM'] = GOADS_emissions['BOEM-LEASE_NUM'].str.replace('G2357','G02357') + GOADS_emissions['BOEM-LEASE_NUM'] = GOADS_emissions['BOEM-LEASE_NUM'].str.replace('G4921','G04921') + GOADS_emissions['Emis_tg'] = 0.0 + GOADS_emissions['Emis_tg'] = 9.0718474E-7 * GOADS_emissions['dblEmissionNumericValue'] #convert short tons to Tg + GOADS_emissions = GOADS_emissions[GOADS_emissions['strPollutantCode'] == 'CH4'] + GOADS_emissions.reset_index(inplace=True, drop=True) + + #display(GOADS_emissions) + + # Use ERG Preprocessed data to determine if major or minor and oil or gas + ERG_complex_crosswalk = pd.read_excel(ERG_GOADSEmissions_inputfile, sheet_name = "Complex Emissions by Source", usecols = "AJ:AM", nrows = 11143) + #display(ERG_complex_crosswalk) + + # add data to map array, for the closest year to 2011 + year_diff = [abs(x - 2011) for x in year_range] + iyear = year_diff.index(min(year_diff)) + + #assign oil vs gas by lease/complex ID + GOADS_emissions['LEASE_TYPE'] ='' + GOADS_emissions['MAJOR_STRUC'] ='' + for istruc in np.arange(0,len(GOADS_emissions)): + imatch = np.where(np.logical_and(ERG_complex_crosswalk['BOEM COMPLEX ID.2']==int(GOADS_emissions['BOEM-COMPLEX_ID'][istruc]),\ + ERG_complex_crosswalk['Year.2'] == 2011)) + if np.size(imatch) >0: + imatch = imatch[0][0] + GOADS_emissions.loc[istruc,'LEASE_TYPE'] = ERG_complex_crosswalk['Oil Gas Defn FINAL.1'][imatch] + GOADS_emissions.loc[istruc,'MAJOR_STRUC'] = ERG_complex_crosswalk['Major / Minor.1'][imatch] + else: + print(istruc, GOADS_emissions['BOEM-COMPLEX_ID'][istruc]) + + # for all gas platforms, match the platform to the emissions + if GOADS_emissions['LEASE_TYPE'][istruc] =='Gas': + match_platform = np.where(GOADS_locations_Unique.strStateFacilityIdentifier==GOADS_emissions['strStateFacilityIdentifier'][istruc])[0][0] + ilat = int((GOADS_locations_Unique['lat'][match_platform] - Lat_low)/Res01) + ilon = int((GOADS_locations_Unique['lon'][match_platform] - Lon_left)/Res01) + imonth = GOADS_emissions['BOEM-MONTH'][istruc]-1 #dict is 1-12, not 0-11 + if GOADS_emissions['MAJOR_STRUC'][istruc] =='Major': + Map_GOADSmajor_emissions[ilat,ilon,iyear,imonth] += GOADS_emissions['Emis_tg'][istruc] + else: + Map_GOADSminor_emissions[ilat,ilon,iyear,imonth] += GOADS_emissions['Emis_tg'][istruc] + + + # sum complexes and emissions for diagnostic + majcplx = GOADS_emissions[(GOADS_emissions['MAJOR_STRUC']=='Major')] + majcplx = majcplx[majcplx['LEASE_TYPE'] =='Gas'] + num_majcplx = majcplx['BOEM-COMPLEX_ID'].unique() + #print(np.shape(num_majcplx)) + mincplx = GOADS_emissions[GOADS_emissions['MAJOR_STRUC']=='Minor'] + mincplx = mincplx[mincplx['LEASE_TYPE'] =='Gas'] + num_mincplx = mincplx['BOEM-COMPLEX_ID'].unique() + #print(np.size(num_mincplx)) + del GOADS_emissions + print('Number of Major Gas Complexes: ',(np.size(num_majcplx))) + print('Emissions (Tg): ',np.sum(Map_GOADSmajor_emissions[:,:,iyear,:])) + print('Number of Minor Gas Complexes: 
',(np.size(num_mincplx))) + print('Emissions (Tg): ',np.sum(Map_GOADSminor_emissions[:,:,iyear,:])) + + + + gom_data_years = ['2011', '2014', '2017', '2021'] for idatayear in gom_data_years: gom_file_name = f"{idatayear}_Gulfwide_Platform_Inventory.accdb" diff --git a/gch4i/proxy_processing/task_ng_compressor_stations_proxy.py b/gch4i/proxy_processing/task_ng_compressor_stations_proxy.py index cc88038..e0d4bb7 100644 --- a/gch4i/proxy_processing/task_ng_compressor_stations_proxy.py +++ b/gch4i/proxy_processing/task_ng_compressor_stations_proxy.py @@ -30,9 +30,9 @@ def task_get_ng_compressor_stations_proxy_data( state_path: Path = global_data_dir_path / "tl_2020_us_state.zip", enverus_midstream_ng_path: Path = sector_data_dir_path / "enverus/midstream/Rextag_Natural_Gas.gdb", - gb_stations_output_path: Annotated[Path, Product] = proxy_data_dir_path / "gb_stations_proxy.parquet", - storage_comp_station_output_path: Annotated[Path, Product] = proxy_data_dir_path / "storage_comp_station_proxy.parquet", - trans_comp_station_output_path: Annotated[Path, Product] = proxy_data_dir_path / "trans_comp_station_proxy.parquet", + gb_stations_output_path: Annotated[Path, Product] = proxy_data_dir_path / "ng_gb_stations_proxy.parquet", + storage_comp_station_output_path: Annotated[Path, Product] = proxy_data_dir_path / "ng_storage_comp_station_proxy.parquet", + trans_comp_station_output_path: Annotated[Path, Product] = proxy_data_dir_path / "ng_trans_comp_station_proxy.parquet", ): """ Creation of the following proxies using Enverus Midstream Rextag_Natural_Gas.gdb: diff --git a/gch4i/proxy_processing/task_ng_production_proxy.py b/gch4i/proxy_processing/task_ng_production_proxy.py index 634bbff..09fff7d 100644 --- a/gch4i/proxy_processing/task_ng_production_proxy.py +++ b/gch4i/proxy_processing/task_ng_production_proxy.py @@ -12,6 +12,7 @@ import geopandas as gpd import numpy as np import seaborn as sns +import shapefile as shp from pytask import Product, task, mark from gch4i.config import ( @@ -34,7 +35,21 @@ def task_get_ng_production_proxy_data( nems_region_dict_path: Path = sector_data_dir_path / "enverus/NEMS_Region_Dictionary.xlsx", enverus_production_path: Path = sector_data_dir_path / "enverus/production", enverus_well_counts_path: Path = sector_data_dir_path / "enverus/production/temp_data_v2/Enverus DrillingInfo Processing - Well Counts_2021-03-17.xlsx", - output_path: Annotated[Path, Product] = proxy_data_dir_path / "gb_stations_proxy.parquet", + nei_path: Path = sector_data_dir_path / "nei_og", + all_well_count_output_path: Annotated[Path, Product] = proxy_data_dir_path / "ng_all_well_count_proxy.parquet", + conv_well_count_output_path: Annotated[Path, Product] = proxy_data_dir_path / "ng_conv_well_count_proxy.parquet", + hf_well_count_output_path: Annotated[Path, Product] = proxy_data_dir_path / "ng_hf_well_count_proxy.parquet", + all_well_prod_output_path: Annotated[Path, Product] = proxy_data_dir_path / "ng_all_well_prod_proxy.parquet", + basin_220_prod_output_path: Annotated[Path, Product] = proxy_data_dir_path / "ng_basin_220_prod_proxy.parquet", + basin_395_prod_output_path: Annotated[Path, Product] = proxy_data_dir_path / "ng_basin_395_prod_proxy.parquet", + basin_430_prod_output_path: Annotated[Path, Product] = proxy_data_dir_path / "ng_basin_430_prod_proxy.parquet", + basin_other_prod_output_path: Annotated[Path, Product] = proxy_data_dir_path / "ng_basin_other_prod_proxy.parquet", + water_prod_output_path: Annotated[Path, Product] = proxy_data_dir_path / 
"ng_water_prod_proxy.parquet", + conv_well_comp_output_path: Annotated[Path, Product] = proxy_data_dir_path / "ng_conv_well_comp_proxy.parquet", + hf_well_comp_output_path: Annotated[Path, Product] = proxy_data_dir_path / "ng_hf_well_comp_proxy.parquet", + drilled_well_output_path: Annotated[Path, Product] = proxy_data_dir_path / "ng_drilled_well_proxy.parquet", + state_gom_offshore_well_count_output_path: Annotated[Path, Product] = proxy_data_dir_path / "ng_state_gom_offshore_well_count_proxy.parquet", + state_gom_offshore_well_prod_output_path: Annotated[Path, Product] = proxy_data_dir_path / "ng_state_gom_offshore_well_prod_proxy.parquet", ): """ Data come from Enverus, both Drilling Info and Prism @@ -57,6 +72,13 @@ def task_get_ng_production_proxy_data( TODO: Update enverus_well_counts_path with v3 data (currently using v2 data) """ + # Functions: + # Define safe devide to set result to zero if denominator is zero + def safe_div(x,y): + if y == 0: + return 0 + return x / y + # STEP 1: Load in State ANSI data and NEMS definitions state_gdf = ( @@ -112,6 +134,8 @@ def task_get_ng_production_proxy_data( # Based on ERGs logic, active wells are determined based on their production levels and not producing status Enverus_data_dict = {} + DI_data_dict = {} + Prism_data_dict = {} for iyear in years: #DI data DI_file_name = f"didsk_monthly_{iyear}.csv" @@ -143,8 +167,43 @@ def task_get_ng_production_proxy_data( 'LIQ_10':'OILPROD_10','GAS_10':'GASPROD_10','WTR_10':'WATERPROD_10', 'LIQ_11':'OILPROD_11','GAS_11':'GASPROD_11','WTR_11':'WATERPROD_11', 'LIQ_12':'OILPROD_12','GAS_12':'GASPROD_12','WTR_12':'WATERPROD_12',}) - .assign(WELL_COUNT=1) + .assign(WELL_COUNT=1) # TODO: Check to see if this should actually be set to 1 ) + # Format completion date (YYYY-MM) + for iwell in range(0,len(DI_data)): + comp_date = str(DI_data.loc[iwell, 'COMPDATE']) + if comp_date == 'NaN': + comp_year_month = 'NaN' + elif comp_date == 'nan': + comp_year_month = 'NaN' + else: # date format M/DD/YYYY + comp_month = f"{int(comp_date.split('/')[0]):02}" + comp_year = f"{int(comp_date.split('/')[2])}" + comp_year_month = str(comp_year)+'-'+str(comp_month) + DI_data.loc[iwell, 'comp_year_month'] = comp_year_month + # Format spud date (YYYY) + for iwell in range(0,len(DI_data)): + spud_date = str(DI_data.loc[iwell, 'SPUDDATE']) + if spud_date == 'NaN': + spud_year = 'NaN' + elif spud_date == 'nan': + spud_year = 'NaN' + else: # date format M/DD/YYYY + spud_year = f"{int(spud_date.split('/')[2])}" + spud_year = str(spud_year) + DI_data.loc[iwell, 'spud_year'] = spud_year + # Format first production date (YYYY) + for iwell in range(0,len(DI_data)): + first_prod_date = str(DI_data.loc[iwell, 'FIRSTPRODDATE']) + if first_prod_date == 'NaN': + first_prod_year = 'NaN' + elif first_prod_date == 'nan': + first_prod_year = 'NaN' + else: # date format M/DD/YYYY + first_prod_year = f"{int(first_prod_date.split('/')[2])}" + first_prod_year = str(first_prod_year) + DI_data.loc[iwell, 'first_prod_year'] = first_prod_year + DI_data_dict[f'{iyear}'] = DI_data # Prism Data Prism_file_name = f"prism_monthly_{iyear}.csv" @@ -185,6 +244,41 @@ def task_get_ng_production_proxy_data( 'LIQUIDSPROD_BBL_12':'OILPROD_12','GASPROD_MCF_12':'GASPROD_12','WATERPROD_BBL_12':'WATERPROD_12',}) .assign(WELL_COUNT=1) ) + # Format completion date (YYYY-MM) + for iwell in range(0,len(Prism_data)): + comp_date = str(Prism_data.loc[iwell, 'COMPDATE']) + if comp_date == 'NaN': + comp_year_month = 'NaN' + elif comp_date == 'nan': + comp_year_month = 'NaN' + else: 
# date format YYYY-MM-DD + comp_month = f"{int(comp_date.split('-')[1]):02}" + comp_year = f"{int(comp_date.split('-')[0])}" + comp_year_month = str(comp_year)+'-'+str(comp_month) + Prism_data.loc[iwell, 'comp_year_month'] = comp_year_month + # Format spud date (YYYY) + for iwell in range(0,len(Prism_data)): + spud_date = str(Prism_data.loc[iwell, 'SPUDDATE']) + if spud_date == 'NaN': + spud_year = 'NaN' + elif spud_date == 'nan': + spud_year = 'NaN' + else: # date format YYYY-MM-DD + spud_year = f"{int(spud_date.split('-')[0])}" + spud_year = str(spud_year) + Prism_data.loc[iwell, 'spud_year'] = spud_year + # Format first production date (YYYY) + for iwell in range(0,len(Prism_data)): + first_prod_date = str(Prism_data.loc[iwell, 'FIRSTPRODDATE']) + if first_prod_date == 'NaN': + first_prod_year = 'NaN' + elif first_prod_date == 'nan': + first_prod_year = 'NaN' + else: # date format YYYY-MM-DD + first_prod_year = f"{int(first_prod_date.split('-')[0])}" + first_prod_year = str(first_prod_year) + Prism_data.loc[iwell, 'first_prod_year'] = first_prod_year + Prism_data_dict[f'{iyear}'] = Prism_data # Combine into one array with common column names, replace nans with zeros, and sum annual production Enverus_data = pd.concat([DI_data, Prism_data], ignore_index=True) @@ -322,4 +416,592 @@ def task_get_ng_production_proxy_data( # ERG has developed their own logic to determine if a well is an HF well or not and that result is included in the # HF variable in this dataset. This method does not rely on the Enverus well 'Producing Status' # Well Type (e.g., non-associated gas well) is determined based on annual production GOR at that well (CUM OIL/ CUM GAS), - # but the prsence of a well will only be included in maps in months where monthly gas prod > + # but the presence of a well will only be included in maps in months where monthly gas prod > 0 + + # Proxy Data Dataframes: + # Well Counts + all_well_count_df = pd.DataFrame() # Active gas well (conventional + HF) counts in a given month + conv_well_count_df = pd.DataFrame() # Active conventional gas well counts in a given month + hf_well_count_df = pd.DataFrame() # Active HF gas well counts in a given month + # Well-Level Production Volumes + all_well_prod_df = pd.DataFrame() # Active gas well (conventional + HF) gas production in a given month + basin_220_prod_df = pd.DataFrame() # Gas well gas production in Basin 220 in a given month + basin_395_prod_df = pd.DataFrame() # Gas well gas production in Basin 395 in a given month + basin_430_prod_df = pd.DataFrame() # Gas well gas production in Basin 430 in a given month + basin_other_prod_df = pd.DataFrame() # Gas well gas production in Other Basins in a given month + # Water Production Volumes + water_prod_df = pd.DataFrame() + # Well Completions + conv_well_comp_df = pd.DataFrame() # Conventional gas well completions + hf_well_comp_df = pd.DataFrame() # HF gas well completions + # Drilled Gas Wells + drilled_well_df = pd.DataFrame() # Gas wells drilled + # Offshore Well Counts and Production Volumes in State Waters in the Gulf of Mexico + state_gom_offshore_well_count_df = pd.DataFrame() # Offshore state GOM gas well counts + state_gom_offshore_well_prod_df = pd.DataFrame() # Offshore state GOM gas production + + + # Query Enverus data to create dictionaries of proxy data + for iyear in years: + enverus_data_temp = Enverus_data_dict[f'{iyear}'].copy() + + # Onshore Natural Gas + ng_data_temp = (enverus_data_temp + .query("STATE_CODE.isin(@state_gdf['state_code'])") + .query("OFFSHORE == 'N'") + 
.query("CUM_GAS > 0") + .assign(gas_to_oil_ratio=lambda df: df['CUM_GAS']/df['CUM_OIL']) + .assign(year=str(iyear)) + .replace(np.inf, 0) + .query("gas_to_oil_ratio > 100 | GOR_QUAL == 'Gas only'") + ) + # Offshore Natural Gas Wells + ng_offshore_data_temp = (enverus_data_temp + .query("STATE_CODE.isin(@state_gdf['state_code'])") + .query("OFFSHORE == 'Y'") + .query("CUM_GAS > 0") + .assign(gas_to_oil_ratio=lambda df: df['CUM_GAS']/df['CUM_OIL']) + .assign(year=str(iyear)) + .replace(np.inf, 0) + .query("gas_to_oil_ratio > 100 | GOR_QUAL == 'Gas only'") + ) + + # Include wells in map only for months where there is gas production (emissions ~ when production is occuring) + for imonth in range(1,13): + imonth_str = f"{imonth:02}" # convert to 2-digit months + year_month_str = str(iyear)+'-'+imonth_str + gas_prod_str = 'GASPROD_'+imonth_str + water_prod_str = 'WATERPROD_'+imonth_str + # onshore data for imonth + ng_data_imonth_temp = (ng_data_temp + .query(f"{prod_str} > 0") + .assign(year_month=str(iyear)+'-'+imonth_str) + ) + ng_data_imonth_temp = (ng_data_imonth_temp[[ + 'year', 'year_month','STATE_CODE','AAPG_CODE_ERG','LATITUDE','LONGITUDE', + 'HF','WELL_COUNT',gas_prod_str,water_prod_str, + 'comp_year_month','spud_year','first_prod_year']] + ) + # offshore data for imonth + ng_offshore_data_imonth_temp = (ng_offshore_data_temp + .query(f"{prod_str} > 0") + .assign(year_month=str(iyear)+'-'+imonth_str) + ) + ng_data_imonth_temp = (ng_offshore_data_imonth_temp[[ + 'year','year_month','STATE_CODE','AAPG_CODE_ERG','LATITUDE','LONGITUDE', + 'HF','WELL_COUNT',gas_prod_str,water_prod_str, + 'comp_year_month','spud_year','first_prod_year']] + ) + # Well Counts + # All Gas Well Count + all_well_count_imonth = (ng_data_imonth_temp[['year','year_month','STATE_CODE','LATITUDE','LONGITUDE','WELL_COUNT']] + .rename(columns=lambda x: str(x).lower()) + .rename(columns={"well_count":"proxy_data"}) + .reset_index(drop=True) + ) + all_well_count_df = pd.concat([all_well_count_df,all_well_count_imonth]) + # Conventional Gas Well Count + conv_well_count_imonth = (ng_data_imonth_temp[['year','year_month','STATE_CODE','LATITUDE','LONGITUDE','WELL_COUNT','HF']] + .query("HF != 'Y'") + .drop(columns=["HF"]) + .rename(columns=lambda x: str(x).lower()) + .rename(columns={"well_count":"proxy_data"}) + .reset_index(drop=True) + ) + conv_well_count_df = pd.concat([conv_well_count_df,conv_well_count_imonth]) + # HF Gas Well Count + hf_well_count_imonth = (ng_data_imonth_temp[['year','year_month','STATE_CODE','LATITUDE','LONGITUDE','WELL_COUNT','HF']] + .query("HF == 'Y'") + .drop(columns=["HF"]) + .rename(columns=lambda x: str(x).lower()) + .rename(columns={"well_count":"proxy_data"}) + .reset_index(drop=True) + ) + hf_well_count_df = pd.concat([hf_well_count_df,hf_well_count_imonth]) + + # Gas Production + # All Gas Well Gas Production + all_well_prod_imonth = (ng_data_imonth_temp[['year','year_month','STATE_CODE','LATITUDE','LONGITUDE',gas_prod_str]] + .assign(proxy_data=lambda df: df[gas_prod_str]) + .drop(columns=[gas_prod_str]) + .rename(columns=lambda x: str(x).lower()) + .reset_index(drop=True) + ) + all_well_prod_df = pd.concat([all_well_prod_df,all_well_prod_imonth]) + # Basin 220 Gas Well Gas Production + basin_220_prod_imonth = (ng_data_imonth_temp[['year','year_month','STATE_CODE','LATITUDE','LONGITUDE','AAPG_CODE_ERG',gas_prod_str]] + .query("AAPG_CODE_ERG == '220'") + .assign(proxy_data=lambda df: df[gas_prod_str]) + .drop(columns=[gas_prod_str, 'AAPG_CODE_ERG']) + .rename(columns=lambda x: 
+            # Well Counts
+            # All Gas Well Count
+            all_well_count_imonth = (ng_data_imonth_temp[['year', 'year_month', 'STATE_CODE', 'LATITUDE', 'LONGITUDE', 'WELL_COUNT']]
+                                     .rename(columns=lambda x: str(x).lower())
+                                     .rename(columns={"well_count": "proxy_data"})
+                                     .reset_index(drop=True)
+                                     )
+            all_well_count_df = pd.concat([all_well_count_df, all_well_count_imonth])
+            # Conventional Gas Well Count
+            conv_well_count_imonth = (ng_data_imonth_temp[['year', 'year_month', 'STATE_CODE', 'LATITUDE', 'LONGITUDE', 'WELL_COUNT', 'HF']]
+                                      .query("HF != 'Y'")
+                                      .drop(columns=["HF"])
+                                      .rename(columns=lambda x: str(x).lower())
+                                      .rename(columns={"well_count": "proxy_data"})
+                                      .reset_index(drop=True)
+                                      )
+            conv_well_count_df = pd.concat([conv_well_count_df, conv_well_count_imonth])
+            # HF Gas Well Count
+            hf_well_count_imonth = (ng_data_imonth_temp[['year', 'year_month', 'STATE_CODE', 'LATITUDE', 'LONGITUDE', 'WELL_COUNT', 'HF']]
+                                    .query("HF == 'Y'")
+                                    .drop(columns=["HF"])
+                                    .rename(columns=lambda x: str(x).lower())
+                                    .rename(columns={"well_count": "proxy_data"})
+                                    .reset_index(drop=True)
+                                    )
+            hf_well_count_df = pd.concat([hf_well_count_df, hf_well_count_imonth])
+
+            # Gas Production
+            # All Gas Well Gas Production
+            all_well_prod_imonth = (ng_data_imonth_temp[['year', 'year_month', 'STATE_CODE', 'LATITUDE', 'LONGITUDE', gas_prod_str]]
+                                    .assign(proxy_data=lambda df: df[gas_prod_str])
+                                    .drop(columns=[gas_prod_str])
+                                    .rename(columns=lambda x: str(x).lower())
+                                    .reset_index(drop=True)
+                                    )
+            all_well_prod_df = pd.concat([all_well_prod_df, all_well_prod_imonth])
+            # Basin 220 Gas Well Gas Production
+            basin_220_prod_imonth = (ng_data_imonth_temp[['year', 'year_month', 'STATE_CODE', 'LATITUDE', 'LONGITUDE', 'AAPG_CODE_ERG', gas_prod_str]]
+                                     .query("AAPG_CODE_ERG == '220'")
+                                     .assign(proxy_data=lambda df: df[gas_prod_str])
+                                     .drop(columns=[gas_prod_str, 'AAPG_CODE_ERG'])
+                                     .rename(columns=lambda x: str(x).lower())
+                                     .reset_index(drop=True)
+                                     )
+            basin_220_prod_df = pd.concat([basin_220_prod_df, basin_220_prod_imonth])
+            # Basin 395 Gas Well Gas Production
+            basin_395_prod_imonth = (ng_data_imonth_temp[['year', 'year_month', 'STATE_CODE', 'LATITUDE', 'LONGITUDE', 'AAPG_CODE_ERG', gas_prod_str]]
+                                     .query("AAPG_CODE_ERG == '395'")
+                                     .assign(proxy_data=lambda df: df[gas_prod_str])
+                                     .drop(columns=[gas_prod_str, 'AAPG_CODE_ERG'])
+                                     .rename(columns=lambda x: str(x).lower())
+                                     .reset_index(drop=True)
+                                     )
+            basin_395_prod_df = pd.concat([basin_395_prod_df, basin_395_prod_imonth])
+            # Basin 430 Gas Well Gas Production
+            basin_430_prod_imonth = (ng_data_imonth_temp[['year', 'year_month', 'STATE_CODE', 'LATITUDE', 'LONGITUDE', 'AAPG_CODE_ERG', gas_prod_str]]
+                                     .query("AAPG_CODE_ERG == '430'")
+                                     .assign(proxy_data=lambda df: df[gas_prod_str])
+                                     .drop(columns=[gas_prod_str, 'AAPG_CODE_ERG'])
+                                     .rename(columns=lambda x: str(x).lower())
+                                     .reset_index(drop=True)
+                                     )
+            basin_430_prod_df = pd.concat([basin_430_prod_df, basin_430_prod_imonth])
+            # Other Basins Gas Well Gas Production
+            basin_other_prod_imonth = (ng_data_imonth_temp[['year', 'year_month', 'STATE_CODE', 'LATITUDE', 'LONGITUDE', 'AAPG_CODE_ERG', gas_prod_str]]
+                                       .query("AAPG_CODE_ERG != '220' & AAPG_CODE_ERG != '395' & AAPG_CODE_ERG != '430'")
+                                       .assign(proxy_data=lambda df: df[gas_prod_str])
+                                       .drop(columns=[gas_prod_str, 'AAPG_CODE_ERG'])
+                                       .rename(columns=lambda x: str(x).lower())
+                                       .reset_index(drop=True)
+                                       )
+            basin_other_prod_df = pd.concat([basin_other_prod_df, basin_other_prod_imonth])
+
+            # Water Production
+            # Data source by state is defined in the Enverus DrillingInfo Processing - Produced
+            # Water_2023-11-14_forGridding.xlsx file.
+            if iyear < 2016:  # WV uses NEI data
+                water_prod_enverus_states = ['AK', 'AL', 'AR', 'AZ', 'CA', 'CO', 'FL', 'LA',
+                                             'MI', 'MO', 'MS', 'MT', 'ND', 'NE', 'NM', 'NV',
+                                             'NY', 'OH', 'SD', 'TX', 'UT', 'VA', 'WY'
+                                             ]
+                # States using NEI for reference: ['IL','IN','KS','OK','PA','WV']
+            else:  # 2016 and beyond; WV uses Enverus data
+                water_prod_enverus_states = ['AK', 'AL', 'AR', 'AZ', 'CA', 'CO', 'FL', 'LA',
+                                             'MI', 'MO', 'MS', 'MT', 'ND', 'NE', 'NM', 'NV',
+                                             'NY', 'OH', 'SD', 'TX', 'UT', 'VA', 'WY', 'WV'
+                                             ]  # WV uses Enverus
+                # States using NEI for reference: ['IL','IN','KS','OK','PA']
+            # Enverus water production for applicable states (NEI water produced will
+            # be added in the NEI section of the code below)
+            water_prod_imonth = (ng_data_imonth_temp[['year', 'year_month', 'STATE_CODE', 'LATITUDE', 'LONGITUDE', water_prod_str]]
+                                 .query("STATE_CODE.isin(@water_prod_enverus_states)")
+                                 .assign(proxy_data=lambda df: df[water_prod_str])
+                                 .drop(columns=[water_prod_str])
+                                 .rename(columns=lambda x: str(x).lower())
+                                 .reset_index(drop=True)
+                                 )
+            water_prod_df = pd.concat([water_prod_df, water_prod_imonth])
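+            # NOTE: comp_year_month holds 'YYYY-MM' strings, so the completion
+            # filters below must quote the interpolated value; e.g. for March
+            # 2020 the query must read comp_year_month == '2020-03', not
+            # comp_year_month == 2020-03 (which pandas would parse as
+            # arithmetic). A quick check (hypothetical frame `df`):
+            #
+            #     df = pd.DataFrame({'comp_year_month': ['2020-03', '2020-04']})
+            #     df.query("comp_year_month == '2020-03'")  # returns 1 row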
+            # Well Completions
+            # Conventional Gas Well Completions
+            conv_well_comp_imonth = (ng_data_imonth_temp[['year', 'year_month', 'STATE_CODE', 'LATITUDE', 'LONGITUDE', 'WELL_COUNT', 'HF', 'comp_year_month']]
+                                     .query("HF != 'Y'")
+                                     .drop(columns=["HF"])
+                                     .rename(columns=lambda x: str(x).lower())
+                                     .rename(columns={"well_count": "proxy_data"})
+                                     .query(f"comp_year_month == '{year_month_str}'")
+                                     .drop(columns=["comp_year_month"])
+                                     .reset_index(drop=True)
+                                     )
+            conv_well_comp_df = pd.concat([conv_well_comp_df, conv_well_comp_imonth])
+
+            # HF Gas Well Completions
+            hf_well_comp_imonth = (ng_data_imonth_temp[['year', 'year_month', 'STATE_CODE', 'LATITUDE', 'LONGITUDE', 'WELL_COUNT', 'HF', 'comp_year_month']]
+                                   .query("HF == 'Y'")
+                                   .drop(columns=["HF"])
+                                   .rename(columns=lambda x: str(x).lower())
+                                   .rename(columns={"well_count": "proxy_data"})
+                                   .query(f"comp_year_month == '{year_month_str}'")
+                                   .drop(columns=["comp_year_month"])
+                                   .reset_index(drop=True)
+                                   )
+            hf_well_comp_df = pd.concat([hf_well_comp_df, hf_well_comp_imonth])
+
+            # Drilled Gas Wells
+            drilled_well_imonth = (ng_data_imonth_temp[['year', 'year_month', 'STATE_CODE', 'LATITUDE', 'LONGITUDE', 'WELL_COUNT', 'HF', 'spud_year', 'first_prod_year']]
+                                   .rename(columns=lambda x: str(x).lower())
+                                   .rename(columns={"well_count": "proxy_data"})
+                                   # wells with a spud date or first production date in the current year
+                                   .query(f"spud_year == '{iyear}' | first_prod_year == '{iyear}'")
+                                   # wells with spud_year == iyear or, if no spud date, first_prod_year == iyear
+                                   .query(f"spud_year == '{iyear}' | spud_year == 'NaN'")
+                                   .drop(columns=['hf', 'spud_year', 'first_prod_year'])
+                                   .reset_index(drop=True)
+                                   )
+            drilled_well_df = pd.concat([drilled_well_df, drilled_well_imonth])
+
+            # Offshore Well Counts and Production Volumes in State Waters in the Gulf of Mexico
+            state_gom_offshore_states = ['AL', 'FL', 'LA', 'MS', 'TX']
+            # Offshore State GOM Gas Well Counts (drawn from the offshore subset
+            # and GOM states, mirroring the production block below)
+            state_gom_offshore_well_count_imonth = (ng_offshore_data_imonth_temp[['year', 'year_month', 'STATE_CODE', 'LATITUDE', 'LONGITUDE', 'WELL_COUNT']]
+                                                    .query("STATE_CODE.isin(@state_gom_offshore_states)")
+                                                    .rename(columns=lambda x: str(x).lower())
+                                                    .rename(columns={"well_count": "proxy_data"})
+                                                    .reset_index(drop=True)
+                                                    )
+            state_gom_offshore_well_count_df = pd.concat([state_gom_offshore_well_count_df, state_gom_offshore_well_count_imonth])
+            # Offshore State GOM Gas Well Gas Production
+            state_gom_offshore_well_prod_imonth = (ng_offshore_data_imonth_temp[['year', 'year_month', 'STATE_CODE', 'LATITUDE', 'LONGITUDE', gas_prod_str]]
+                                                   .query("STATE_CODE.isin(@state_gom_offshore_states)")
+                                                   .assign(proxy_data=lambda df: df[gas_prod_str])
+                                                   .drop(columns=[gas_prod_str])
+                                                   .rename(columns=lambda x: str(x).lower())
+                                                   .reset_index(drop=True)
+                                                   )
+            state_gom_offshore_well_prod_df = pd.concat([state_gom_offshore_well_prod_df, state_gom_offshore_well_prod_imonth])
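+    # NOTE (added explanation): calc_enverus_rel_emi below normalizes each proxy
+    # within a state-year, i.e. for record i in state s and year y:
+    # rel_emi_i = x_i / sum(x_j over all j in state s, year y), so the monthly
+    # rel_emi values for a state sum to 1 over the year (or 0 if the state-year
+    # has no activity). For example:
+    #
+    #     df = pd.DataFrame({'state_code': ['TX', 'TX'], 'year': ['2020'] * 2,
+    #                        'proxy_data': [3.0, 1.0]})
+    #     calc_enverus_rel_emi(df)['rel_emi'].tolist()  # [0.75, 0.25]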
df["latitude"], + crs=4326 + ) + ) + .drop(columns=["latitude", "longitude"]) + .loc[:, ["year", "year_month", "state_code", "rel_emi", "geometry"]] + ) + return gdf + + # Well Counts + all_well_count_gdf = enverus_df_to_gdf(all_well_count_df) + conv_well_count_gdf = enverus_df_to_gdf(conv_well_count_df) + hf_well_count_gdf = enverus_df_to_gdf(hf_well_count_df) + # Well-Level Production Volumes + all_well_prod_gdf = enverus_df_to_gdf(all_well_prod_df) + basin_220_prod_gdf = enverus_df_to_gdf(basin_220_prod_df) + basin_395_prod_gdf = enverus_df_to_gdf(basin_395_prod_df) + basin_430_prod_gdf = enverus_df_to_gdf(basin_430_prod_df) + basin_other_prod_gdf = enverus_df_to_gdf(basin_other_prod_df) + # Water Production Volumes + water_prod_gdf = enverus_df_to_gdf(water_prod_df) + # Well Completions + conv_well_comp_gdf = enverus_df_to_gdf(conv_well_comp_df) + hf_well_comp_gdf = enverus_df_to_gdf(hf_well_comp_df) + # Drilled Gas Wells + drilled_well_gdf = enverus_df_to_gdf(drilled_well_df) + # Offshore Well Counts and Production Volumes in State Waters in the Gulf of Mexico + state_gom_offshore_well_count_gdf = enverus_df_to_gdf(state_gom_offshore_well_count_df) + state_gom_offshore_well_prod_gdf = enverus_df_to_gdf(state_gom_offshore_well_prod_df) + + # STEP 2.4: Well and Production Data (from NEI) + + # NEI data is used for well counts, gas well completion counts, + # gas well drilled counts, and gas production volumes for IL and IN. + + # NEI data is used for water production volumes for IL, IN, KS, OK, and PA + # as well as WV for years less than 2016. + + # FIPS codes for relevant states (each code starts with 2 distinct characters): + # IL: 17; IN: 18; KS: 20; OK: 40; PA: 42; WV: 54 + + fips_codes_df = pd.DataFrame({'state_code': ['IL', 'IN', 'KS', 'OK', 'PA', 'WV'], + 'fips_code': ['17', '18', '20', '40', '42', '54']}) + + # Function to get NEI textfile and shapefile data + def get_NEI_data(ghgi_year, data_year, file_name): + if data_year <= 2017: + # NEI textfile data (data_year <= 2017) (2011, 2014, 2016, 2017) + nei_textfile_name = f"CONUS_SA_FILES_{data_year}/{file_name}" + nei_textfile_path = os.path.join(nei_path, nei_textfile_name) + data_temp = pd.read_csv(nei_textfile_path, sep='\t', skiprows = 25) + data_temp = data_temp.drop(["!"], axis=1) + data_temp.columns = ['Code','FIPS','COL','ROW','Frac','Abs','FIPS_Total','FIPS_Running_Sum'] + data_temp = data_temp.astype({"FIPS": str}) + # if water production data (gas: 6832, oil: 6833) + if file_name == 'USA_6832_NOFILL.txt' or file_name == 'USA_6833_NOFILL.txt': + if data_year < 2016: + data_temp = (data_temp + # query states: IL, IN, KS, OK, PA, WV + .query("FIPS.str.startswith('17') | FIPS.str.startswith('18') | FIPS.str.startswith('20') | FIPS.str.startswith('40') | FIPS.str.startswith('42') | FIPS.str.startswith('54')") + .reset_index(drop=True) + ) + colmax = data_temp['COL'].max() + colmin = data_temp['COL'].min() + rowmax = data_temp['ROW'].max() + rowmin = data_temp['ROW'].min() + else: + data_temp = (data_temp + # query states: IL, IN, KS, OK, PA + .query("FIPS.str.startswith('17') | FIPS.str.startswith('18') | FIPS.str.startswith('20') | FIPS.str.startswith('40') | FIPS.str.startswith('42')") + .reset_index(drop=True) + ) + colmax = data_temp['COL'].max() + colmin = data_temp['COL'].min() + rowmax = data_temp['ROW'].max() + rowmin = data_temp['ROW'].min() + # non-water production proxies (IL, IN) + else: + data_temp = (data_temp + # query states: IL, IN + .query("FIPS.str.startswith('17') | FIPS.str.startswith('18')") + 
.reset_index(drop=True) + ) + colmax = data_temp['COL'].max() + colmin = data_temp['COL'].min() + rowmax = data_temp['ROW'].max() + rowmin = data_temp['ROW'].min() + # NEI reference grid shapefile with lat/lon locations + nei_reference_grid_path = os.path.join(nei_path, "NEI_Reference_Grid_LCC_to_WGS84_latlon.shp") + nei_reference_grid = (gpd.read_file(nei_reference_grid_path) + .to_crs(4326)) + nei_reference_grid = (nei_reference_grid + .assign(cellid_column = nei_reference_grid.cellid.astype(str).str[0:4].astype(int)) + .assign(cellid_row = nei_reference_grid.cellid.astype(str).str[5:].astype(int)) + .query(f"cellid_column <= {colmax} & cellid_column >= {colmin}") + .query(f"cellid_row <= {rowmax} & cellid_row >= {rowmin}") + .reset_index(drop=True) + ) + # Match lat/lon locations from reference grid to nei data + for idx in np.arange(0,len(data_temp)): + # Add in lat/lon + icol = data_temp['COL'][idx] + irow = data_temp['ROW'][idx] + match = np.where((icol == nei_reference_grid.loc[:,'cellid_column']) & (irow == nei_reference_grid.loc[:,'cellid_row']))[0][0] + match = int(match) + # data_temp.loc[idx,'Lat'] = nei_reference_grid.loc[match, 'Latitude'] + # data_temp.loc[idx,'Lon'] = nei_reference_grid.loc[match, 'Longitude'] + data_temp.loc[idx,'geometry'] = nei_reference_grid.loc[match, 'geometry'] + # Add in state_code + ifips = data_temp.loc[idx,'FIPS'][0:2] + data_temp.loc[idx,'state_code'] = fips_codes_df.loc[np.where(ifips == fips_codes_df.loc[:, 'fips_code'])[0][0],'state_code'] + data_temp = data_temp[['state_code', 'Abs', 'geometry']] + data_temp = data_temp.rename(columns={'Abs':'activity_data'}) + + else: + # NEI shapefile data (data_year > 2017) (2018, 2019, 2021, 2022) + state_geometries = state_gdf[["state_code","geometry"]] + nei_file_name = f"CONUS_SA_FILES_{data_year}" + nei_file_path = os.path.join(nei_path, nei_file_name) + data_temp = gpd.read_file(nei_file_path, layer=file_name) + data_temp = data_temp.to_crs(4326) + data_temp = gpd.tools.sjoin(data_temp, state_gdf, how="left") + + # water production data (IL, IN, KS, OK, PA) + if file_name == 'PRODUCED_WATER_GAS' or file_name == '_6832' or file_name == 'ProducedWaterGasWells': + states_to_query = ['IL', 'IN', 'KS', 'OK', 'PA'] + # non-water production proxies (IL, IN) + else: + states_to_query = ['IL', 'IN'] + + # query relevant states + data_temp = data_temp.query('state_code.isin(@states_to_query)') + + # grab activity data depending on column name (changes by year) + if data_year == 2018 or data_year == 2019 or data_year == 2020: + data_temp = data_temp[['state_code', 'ACTIVITY', 'geometry']] + data_temp = data_temp.rename(columns={'ACTIVITY':'activity_data'}) + if data_year == 2021: + data_temp = data_temp[['state_code', 'GRID_AC', 'geometry']] + data_temp = data_temp.rename(columns={'GRID_AC':'activity_data'}) + if data_year == 2022: + data_temp = data_temp[['state_code', 'GRID_ACTIV', 'geometry']] + data_temp = data_temp.rename(columns={'GRID_ACTIV':'activity_data'}) + + # convert activity data to relative emissions (idata / sum(state data)) + data_temp['rel_emi'] = data_temp.groupby(["state_code"])['activity_data'].transform(lambda x: x / x.sum() if x.sum() > 0 else 0) + monthly_data_temp = data_temp.copy() + monthly_data_temp['rel_emi'] = monthly_data_temp['rel_emi'] * 1/12 + monthly_data_temp = monthly_data_temp.drop(columns='activity_data') + + # convert proxy data to monthly (assume 1/12 of annual proxy is assigned to each month) + nei_proxy_data = pd.DataFrame() + for imonth in range(1,13): + imonth_str = 
f"{imonth:02}" # convert to 2-digit months + data_temp_imonth = monthly_data_temp.copy() + data_temp_imonth = data_temp_imonth.assign(year_month=str(ghgi_year)+'-'+imonth_str) + nei_proxy_data = pd.concat([nei_proxy_data,data_temp_imonth]) + nei_proxy_data = nei_proxy_data.assign(year=ghgi_year) + nei_proxy_data = (nei_proxy_data[['year', 'year_month', 'state_code', 'rel_emi', 'geometry']] + .reset_index(drop=True) + ) + return nei_proxy_data + + # NEI data year assignments + # All years use the data affiliated with their year except the following exceptions: + # 2012: use 2011 data + # 2013: use 2014 data + # 2015: use 2014 data + # 2016: use 2017 data + nei_data_years = pd.DataFrame({'year': [2012, + 2013, + 2014, + 2015, + 2016, + 2017, + 2018, + 2019, + 2020, + 2021, + 2022], + 'nei_data': [2011, + 2014, + 2014, + 2014, + 2017, + 2017, + 2018, + 2019, + 2020, + 2021, + 2022]}) + + # NEI Data Dataframes: + # Well Counts + nei_all_well_count_df = pd.DataFrame() # Active gas well (conventional + HF) counts in a given month + nei_conv_well_count_df = pd.DataFrame() # Active conventional gas well counts in a given month + nei_hf_well_count_df = pd.DataFrame() # Active HF gas well counts in a given month + # Well-Level Production Volumes + nei_all_well_prod_df = pd.DataFrame() # Active gas well (conventional + HF) gas production in a given month + nei_basin_other_prod_df = pd.DataFrame() # Gas well gas production in Other Basins in a given month + # Water Production Volumes + nei_water_prod_df = pd.DataFrame() + # Well Completions + nei_conv_well_comp_df = pd.DataFrame() # Conventional gas well completions + nei_hf_well_comp_df = pd.DataFrame() # HF gas well completions + # Drilled Gas Wells + nei_drilled_well_df = pd.DataFrame() # Gas wells drilled + + # NEI text file and shapefile names: + # Well Counts + well_count_file_names = pd.DataFrame({ + 'data_year': [2011, 2014, 2017, + 2018, 2019, 2020, 2021, 2022], + 'file_name': ['USA_698_NOFILL.txt', 'USA_698_NOFILL.txt', 'USA_698_NOFILL.txt', + 'GAS_WELLS', 'GAS_WELLS', 'GAS_WELL', '_698', 'GasWells'], + }) + # Well-Level Production Volumes + gas_prod_file_names = pd.DataFrame({ + 'data_year': [2011, 2014, 2017, + 2018, 2019, 2020, 2021, 2022], + 'file_name': ['USA_696_NOFILL.txt', 'USA_696_NOFILL.txt', 'USA_696_NOFILL.txt', + 'GAS_PRODUCTION', 'GAS_PRODUCTION', 'GAS_PRODUCTION', '_696', 'GasProduction'], + }) + # Water Production Volumes + water_prod_file_names = pd.DataFrame({ + 'data_year': [2011, 2014, 2017, + 2018, 2019, 2020, 2021, 2022], + 'file_name': ['USA_6832_NOFILL.txt', 'USA_6832_NOFILL.txt', 'USA_6832698_NOFILL.txt', + 'PRODUCED_WATER_GAS', 'PRODUCED_WATER_GAS', 'PRODUCED_WATER_GAS', '_6832', 'ProducedWaterGasWells'], + }) + # Well Completions + comp_count_file_names = pd.DataFrame({ + 'data_year': [2011, 2014, 2017, + 2018, 2019, 2020, 2021, 2022], + 'file_name': ['USA_678_NOFILL.txt', 'USA_678_NOFILL.txt', 'USA_678_NOFILL.txt', + 'COMPLETIONS_GAS', 'COMPLETIONS_GAS', 'COMPLETIONS_GAS', '_678', 'GasWellCompletions'], + }) + # Drilled Gas Wells + spud_count_file_names = pd.DataFrame({ + 'data_year': [2011, 2014, 2017, + 2018, 2019, 2020, 2021, 2022], + 'file_name': ['USA_671_NOFILL.txt', 'USA_671_NOFILL.txt', 'USA_671_NOFILL.txt', + 'SPUD_GAS', 'SPUD_GAS', 'SPUD_GAS', '_671', 'SpudCountGasWells'], + }) + + + def get_nei_file_name(nei_data_year, nei_file_names): + nei_file_name = nei_file_names[nei_file_names['data_year'] == nei_data_year]['file_name'].values[0] + return nei_file_name + + + for iyear in years: + nei_data_year = 
+        # Well Count
+        ifile_name = get_nei_file_name(nei_data_year, well_count_file_names)
+        nei_all_well_count_iyear = get_NEI_data(iyear, nei_data_year, ifile_name)
+        nei_all_well_count_df = pd.concat([nei_all_well_count_df, nei_all_well_count_iyear])
+        # Gas Production
+        ifile_name = get_nei_file_name(nei_data_year, gas_prod_file_names)
+        nei_all_well_prod_iyear = get_NEI_data(iyear, nei_data_year, ifile_name)
+        nei_all_well_prod_df = pd.concat([nei_all_well_prod_df, nei_all_well_prod_iyear])
+        # Water Production
+        ifile_name = get_nei_file_name(nei_data_year, water_prod_file_names)
+        nei_water_prod_iyear = get_NEI_data(iyear, nei_data_year, ifile_name)
+        nei_water_prod_df = pd.concat([nei_water_prod_df, nei_water_prod_iyear])
+        # Completions Count
+        ifile_name = get_nei_file_name(nei_data_year, comp_count_file_names)
+        nei_conv_well_comp_iyear = get_NEI_data(iyear, nei_data_year, ifile_name)
+        nei_conv_well_comp_df = pd.concat([nei_conv_well_comp_df, nei_conv_well_comp_iyear])
+        # Spud Count
+        ifile_name = get_nei_file_name(nei_data_year, spud_count_file_names)
+        nei_drilled_well_iyear = get_NEI_data(iyear, nei_data_year, ifile_name)
+        nei_drilled_well_df = pd.concat([nei_drilled_well_df, nei_drilled_well_iyear])
+
+    # Copy Data to Other Dataframes
+    nei_conv_well_count_df = nei_all_well_count_df.copy()
+    nei_hf_well_count_df = nei_all_well_count_df.copy()
+    nei_basin_other_prod_df = nei_all_well_prod_df.copy()
+    nei_hf_well_comp_df = nei_conv_well_comp_df.copy()
+
+    # Add NEI Data to Enverus Data
+    # Well Counts
+    all_well_count_gdf = pd.concat([all_well_count_gdf, nei_all_well_count_df]).reset_index(drop=True)
+    conv_well_count_gdf = pd.concat([conv_well_count_gdf, nei_conv_well_count_df]).reset_index(drop=True)
+    hf_well_count_gdf = pd.concat([hf_well_count_gdf, nei_hf_well_count_df]).reset_index(drop=True)
+    # Well-Level Production Volumes
+    all_well_prod_gdf = pd.concat([all_well_prod_gdf, nei_all_well_prod_df]).reset_index(drop=True)
+    basin_220_prod_gdf = basin_220_prod_gdf.reset_index(drop=True)  # No IL/IN data to add
+    basin_395_prod_gdf = basin_395_prod_gdf.reset_index(drop=True)  # No IL/IN data to add
+    basin_430_prod_gdf = basin_430_prod_gdf.reset_index(drop=True)  # No IL/IN data to add
+    basin_other_prod_gdf = pd.concat([basin_other_prod_gdf, nei_basin_other_prod_df]).reset_index(drop=True)
+    # Water Production Volumes
+    water_prod_gdf = pd.concat([water_prod_gdf, nei_water_prod_df]).reset_index(drop=True)
+    # Well Completions
+    conv_well_comp_gdf = pd.concat([conv_well_comp_gdf, nei_conv_well_comp_df]).reset_index(drop=True)
+    hf_well_comp_gdf = pd.concat([hf_well_comp_gdf, nei_hf_well_comp_df]).reset_index(drop=True)
+    # Drilled Gas Wells
+    drilled_well_gdf = pd.concat([drilled_well_gdf, nei_drilled_well_df]).reset_index(drop=True)
+    # Offshore Well Counts and Production Volumes in State Waters in the Gulf of Mexico
+    state_gom_offshore_well_count_gdf = state_gom_offshore_well_count_gdf.reset_index(drop=True)  # No IL/IN data to add
+    state_gom_offshore_well_prod_gdf = state_gom_offshore_well_prod_gdf.reset_index(drop=True)  # No IL/IN data to add
+
+    # Output Proxy Parquet Files
+    all_well_count_gdf.to_parquet(all_well_count_output_path)
+    conv_well_count_gdf.to_parquet(conv_well_count_output_path)
+    hf_well_count_gdf.to_parquet(hf_well_count_output_path)
+    all_well_prod_gdf.to_parquet(all_well_prod_output_path)
+    basin_220_prod_gdf.to_parquet(basin_220_prod_output_path)
+    basin_395_prod_gdf.to_parquet(basin_395_prod_output_path)
+    basin_430_prod_gdf.to_parquet(basin_430_prod_output_path)
+    basin_other_prod_gdf.to_parquet(basin_other_prod_output_path)
+    water_prod_gdf.to_parquet(water_prod_output_path)
+    conv_well_comp_gdf.to_parquet(conv_well_comp_output_path)
+    hf_well_comp_gdf.to_parquet(hf_well_comp_output_path)
+    drilled_well_gdf.to_parquet(drilled_well_output_path)
+    state_gom_offshore_well_count_gdf.to_parquet(state_gom_offshore_well_count_output_path)
+    state_gom_offshore_well_prod_gdf.to_parquet(state_gom_offshore_well_prod_output_path)
+    return None
+
+
+
+

From 062cafb3e19c7fa34ca3cc4657ff6209ae9b91fe Mon Sep 17 00:00:00 2001
From: Hannah Lohman <68960449+haclohman@users.noreply.github.com>
Date: Wed, 4 Dec 2024 18:47:17 -0500
Subject: [PATCH 4/6] Updating industrial landfills proxy to rel_emi

---
 .../task_industrial_landfills_proxy.py | 11 +++++++++++
 1 file changed, 11 insertions(+)

diff --git a/gch4i/proxy_processing/task_industrial_landfills_proxy.py b/gch4i/proxy_processing/task_industrial_landfills_proxy.py
index 5e75ffa..0e9e88b 100644
--- a/gch4i/proxy_processing/task_industrial_landfills_proxy.py
+++ b/gch4i/proxy_processing/task_industrial_landfills_proxy.py
@@ -99,6 +99,9 @@ def task_get_reporting_industrial_landfills_pulp_paper_proxy_data(
         .loc[:, ["facility_name", "state_code", "geometry", "year", "ch4_kt"]]
     )
 
+    reporting_pulp_paper_gdf['rel_emi'] = reporting_pulp_paper_gdf.groupby(["state_code", "year"])['ch4_kt'].transform(lambda x: x / x.sum() if x.sum() > 0 else 0)
+    reporting_pulp_paper_gdf = reporting_pulp_paper_gdf.drop(columns='ch4_kt')
+
     reporting_pulp_paper_gdf.to_parquet(reporting_pulp_paper_proxy_output_path)
 
     return None
@@ -270,6 +273,9 @@ def task_get_nonreporting_industrial_landfills_pulp_paper_proxy_data(
         .loc[:, ["state_code", "geometry", "ch4_kt"]]
     )
 
+    nonreporting_pulp_paper_gdf['rel_emi'] = nonreporting_pulp_paper_gdf.groupby(["state_code"])['ch4_kt'].transform(lambda x: x / x.sum() if x.sum() > 0 else 0)
+    nonreporting_pulp_paper_gdf = nonreporting_pulp_paper_gdf.drop(columns='ch4_kt')
+
     nonreporting_pulp_paper_gdf.to_parquet(nonreporting_pulp_paper_proxy_output_path)
 
     return None
@@ -340,6 +346,9 @@ def task_get_reporting_industrial_landfills_food_beverage_proxy_data(
         .loc[:, ["facility_id", "facility_name", "state_code", "geometry", "year", "ch4_kt"]]
     )
 
+    reporting_food_beverage_gdf['rel_emi'] = reporting_food_beverage_gdf.groupby(["state_code", "year"])['ch4_kt'].transform(lambda x: x / x.sum() if x.sum() > 0 else 0)
+    reporting_food_beverage_gdf = reporting_food_beverage_gdf.drop(columns='ch4_kt')
+
     reporting_food_beverage_gdf.to_parquet(reporting_food_beverage_proxy_output_path)
 
     return None
@@ -606,6 +615,8 @@ def task_get_nonreporting_industrial_landfills_food_beverage_proxy_data(
                       "ghgrp_match", "FRS_match", "geo_match"])
         .loc[:, ["facility_id", "state_code", "geometry", "avg_waste_t"]]
     )
+    nonreporting_food_beverage_gdf['rel_emi'] = nonreporting_food_beverage_gdf.groupby(["state_code"])['avg_waste_t'].transform(lambda x: x / x.sum() if x.sum() > 0 else 0)
+    nonreporting_food_beverage_gdf = nonreporting_food_beverage_gdf.drop(columns='avg_waste_t')
 
     nonreporting_food_beverage_gdf.to_parquet(nonreporting_food_beverage_proxy_output_path)
 
     return None
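All of the rel_emi columns introduced in the patch above follow the same groupby/transform idiom used throughout the proxy code: each facility gets its share of its group's total, and an all-zero group maps to 0 rather than NaN. A minimal sketch of the pattern, with invented sample values:

    import pandas as pd

    df = pd.DataFrame({
        "state_code": ["GA", "GA", "AL"],
        "year": [2020, 2020, 2020],
        "ch4_kt": [2.0, 6.0, 0.0],
    })

    # facility share of the state-year total; a zero-sum group yields 0, not NaN,
    # so state-years with no reported emissions drop out of the gridding cleanly
    df["rel_emi"] = (
        df.groupby(["state_code", "year"])["ch4_kt"]
        .transform(lambda x: x / x.sum() if x.sum() > 0 else 0)
    )
    df = df.drop(columns="ch4_kt")
    # GA rows become 0.25 and 0.75; the lone AL row becomes 0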
From 0ff005b19f88b56008426e8cf2af4084c0f3c154 Mon Sep 17 00:00:00 2001
From: Hannah Lohman <68960449+haclohman@users.noreply.github.com>
Date: Wed, 4 Dec 2024 18:47:50 -0500
Subject: [PATCH 5/6] Final federal gom offshore proxy for oil and gas
 production

---
 .../federal_gom_offshore_proxy.py | 486 +++++++++++-------
 1 file changed, 293 insertions(+), 193 deletions(-)

diff --git a/gch4i/proxy_processing/federal_gom_offshore_proxy.py b/gch4i/proxy_processing/federal_gom_offshore_proxy.py
index 3eb351d..31468c8 100644
--- a/gch4i/proxy_processing/federal_gom_offshore_proxy.py
+++ b/gch4i/proxy_processing/federal_gom_offshore_proxy.py
@@ -22,6 +22,7 @@
     sector_data_dir_path,
     max_year,
     min_year,
+    years,
 )
 
 from gch4i.utils import us_state_to_abbrev
@@ -31,38 +32,53 @@
 @task(id="federal_gom_offshore_proxy")
 def task_get_federal_gom_offshore_proxy_data(
     state_path: Path = global_data_dir_path / "tl_2020_us_state.zip",
-    boem_data_directory_path: Path = sector_data_dir_path / "boem",
-    ng_output_path: Annotated[Path, Product] = proxy_data_dir_path
-    / "federal_gom_offshore_proxy.parquet",
-    oil_output_path: Annotated[Path, Product] = proxy_data_dir_path
-    / "oil_gom_fed_proxy.parquet",
+    GOADS_11_path: Path = sector_data_dir_path / "boem" / "2011_Gulfwide_Platform_Inventory.accdb",
+    GOADS_14_path: Path = sector_data_dir_path / "boem" / "2014_Gulfwide_Platform_Inventory.accdb",
+    GOADS_17_path: Path = sector_data_dir_path / "boem" / "2017_Gulfwide_Platform_Inventory.accdb",
+    ERG_GOADSEmissions_path: Path = sector_data_dir_path / "boem" / "BOEM GEI Emissions Data_EmissionSource_2020-03-11.xlsx",
+    ng_output_path: Annotated[Path, Product] = proxy_data_dir_path / "ng_federal_gom_offshore_proxy.parquet",
+    oil_output_path: Annotated[Path, Product] = proxy_data_dir_path / "oil_federal_gom_offshore_proxy.parquet",
 ):
     """
     # TODO:
     """
 
-    state_gdf = (
-        gpd.read_file(state_path)
-        .loc[:, ["NAME", "STATEFP", "STUSPS", "geometry"]]
-        .rename(columns=str.lower)
-        .rename(columns={"stusps": "state_code", "name": "state_name"})
-        .astype({"statefp": int})
-        # get only lower 48 + DC
-        .query("(statefp < 60) & (statefp != 2) & (statefp != 15)")
-        .reset_index(drop=True)
-        .to_crs(4326)
-    )
+    # Get and format BOEM GOM data for 2011, 2014, and 2017
 
-    # get and format boem gom data for 2011, 2014, 2017, and 2021
-    # NOTE: 2011 has tblPointER and tblPointEM but the rest of the years have one single table of data
-    gom_df = pd.DataFrame()
+    # GOADS data year assignments
+    # 2011 data: 2012
+    # 2014 data: 2013, 2014, 2015
+    # 2017 data: 2016-2022
+    # 2021 data: NOT USED BY GHGI TEAM YET - CHECK FOR V4
 
-    # 2011 GOADS Data
+    federal_gom_offshore_data_years = pd.DataFrame(
+        {'year': [2012, 2013, 2014, 2015, 2016, 2017, 2018, 2019, 2020, 2021, 2022],
+         'goads_data': [2011, 2014, 2014, 2014, 2017, 2017, 2017, 2017, 2017, 2017, 2017]
+         })
+
+    # Use ERG Preprocessed data to determine if oil or gas
+    ERG_complex_crosswalk = (pd.read_excel(
+        ERG_GOADSEmissions_path,
+        sheet_name = "Complex Emissions by Source",
+        usecols = "AJ:AM",
+        nrows = 11143)
+        .rename(columns={"Year.2": "year",
+                         "BOEM COMPLEX ID.2": "boem_complex_id",
+                         "Oil Gas Defn FINAL.1": "oil_gas_defn",
+                         "Major / Minor.1": "major_minor"})
+        .query("year == 2011 | year == 2014 | year == 2017")
+        .astype({"boem_complex_id": int})
+        .drop(columns="major_minor") # no longer separating major vs.
minor in v3 + .replace('', np.nan) + .dropna() + .reset_index(drop=True) + ) + # 2011 GOADS Data # Read In and Format 2011 BEOM Data - gom_file_name = f"2011_Gulfwide_Platform_Inventory.accdb" - gom_file_path = os.path.join(boem_data_directory_path, gom_file_name) - driver_str = r'Driver={Microsoft Access Driver (*.mdb, *.accdb)};DBQ='+gom_file_path+';''' + GOADS_11_inputfile = str(GOADS_11_path) + driver_str = r'Driver={Microsoft Access Driver (*.mdb, *.accdb)};DBQ='+GOADS_11_inputfile+';''' conn = pyodbc.connect(driver_str) GOADS_locations = pd.read_sql("SELECT * FROM tblPointER", conn) GOADS_emissions = pd.read_sql("SELECT * FROM tblPointEM", conn) @@ -70,7 +86,7 @@ def task_get_federal_gom_offshore_proxy_data( # Format Location Data GOADS_locations = GOADS_locations[["strStateFacilityIdentifier","strEmissionReleasePointID","dblXCoordinate","dblYCoordinate"]] - #Create platform-by-platform file + # Create platform-by-platform file GOADS_locations_Unique = pd.DataFrame({'strStateFacilityIdentifier':GOADS_locations['strStateFacilityIdentifier'].unique()}) GOADS_locations_Unique['lon'] = 0.0 GOADS_locations_Unique['lat'] = 0.0 @@ -82,192 +98,276 @@ def task_get_federal_gom_offshore_proxy_data( GOADS_locations_Unique.loc[iplatform,'lat',] = GOADS_locations['dblYCoordinate'][match_platform] GOADS_locations_Unique.loc[iplatform,'strEmissionReleasePointID'] = GOADS_locations['strEmissionReleasePointID'][match_platform][:3] - GOADS_locations_Unique.reset_index(inplace=True, drop=True) - #display(GOADS_locations_Unique) - - #print(GOADS_emissions.columns) - #Format Emissions Data (clean lease data string) - GOADS_emissions = GOADS_emissions[["strStateFacilityIdentifier","strPollutantCode","dblEmissionNumericValue","BOEM-MONTH", - "BOEM-LEASE_NUM","BOEM-COMPLEX_ID"]] - GOADS_emissions['BOEM-LEASE_NUM'] = GOADS_emissions['BOEM-LEASE_NUM'].str.replace('OCS','') - GOADS_emissions['BOEM-LEASE_NUM'] = GOADS_emissions['BOEM-LEASE_NUM'].str.replace('-','') - GOADS_emissions['BOEM-LEASE_NUM'] = GOADS_emissions['BOEM-LEASE_NUM'].str.replace(' ','') - GOADS_emissions['BOEM-LEASE_NUM'] = GOADS_emissions['BOEM-LEASE_NUM'].str.replace('G1477','G01477') - GOADS_emissions['BOEM-LEASE_NUM'] = GOADS_emissions['BOEM-LEASE_NUM'].str.replace('G73','00073') - GOADS_emissions['BOEM-LEASE_NUM'] = GOADS_emissions['BOEM-LEASE_NUM'].str.replace('G605','00605') - GOADS_emissions['BOEM-LEASE_NUM'] = GOADS_emissions['BOEM-LEASE_NUM'].str.replace('G72','00072') - GOADS_emissions['BOEM-LEASE_NUM'] = GOADS_emissions['BOEM-LEASE_NUM'].str.replace('G599','00599') - GOADS_emissions['BOEM-LEASE_NUM'] = GOADS_emissions['BOEM-LEASE_NUM'].str.replace('G7155','G07155') - GOADS_emissions['BOEM-LEASE_NUM'] = GOADS_emissions['BOEM-LEASE_NUM'].str.replace('G2357','G02357') - GOADS_emissions['BOEM-LEASE_NUM'] = GOADS_emissions['BOEM-LEASE_NUM'].str.replace('G4921','G04921') - GOADS_emissions['Emis_tg'] = 0.0 - GOADS_emissions['Emis_tg'] = 9.0718474E-7 * GOADS_emissions['dblEmissionNumericValue'] #convert short tons to Tg - GOADS_emissions = GOADS_emissions[GOADS_emissions['strPollutantCode'] == 'CH4'] - GOADS_emissions.reset_index(inplace=True, drop=True) - - #display(GOADS_emissions) + GOADS_locations_Unique = (GOADS_locations_Unique + .drop(columns='strEmissionReleasePointID') + .replace('', np.nan) + .dropna() + .reset_index(drop=True)) - # Use ERG Preprocessed data to determine if major or minor and oil or gas - ERG_complex_crosswalk = pd.read_excel(ERG_GOADSEmissions_inputfile, sheet_name = "Complex Emissions by Source", usecols = 
"AJ:AM", nrows = 11143) - #display(ERG_complex_crosswalk) + # Format Emissions Data (clean lease data string) + GOADS_emissions = GOADS_emissions[["strStateFacilityIdentifier","strPollutantCode", + "dblEmissionNumericValue","BOEM-MONTH", + "BOEM-COMPLEX_ID"]] + GOADS_emissions = (GOADS_emissions + .query("strPollutantCode == 'CH4'") + .assign(Emis_tg = 0.0) + .assign(Emis_tg = lambda df: 9.0718474E-7 * df['dblEmissionNumericValue']) #convert short tons to Tg + .rename(columns={"BOEM-COMPLEX_ID": "boem_complex_id"}) + .astype({"boem_complex_id": int}) + .drop(columns={"strPollutantCode", "dblEmissionNumericValue"}) + .replace('', np.nan) + .dropna() + .reset_index(drop=True) + ) - # add data to map array, for the closest year to 2011 - year_diff = [abs(x - 2011) for x in year_range] - iyear = year_diff.index(min(year_diff)) + # Select 2011 data from ERG complex crosswalk + ERG_complex_crosswalk_2011 = ERG_complex_crosswalk.copy().query('year == 2011').reset_index(drop=True) - #assign oil vs gas by lease/complex ID - GOADS_emissions['LEASE_TYPE'] ='' - GOADS_emissions['MAJOR_STRUC'] ='' - for istruc in np.arange(0,len(GOADS_emissions)): - imatch = np.where(np.logical_and(ERG_complex_crosswalk['BOEM COMPLEX ID.2']==int(GOADS_emissions['BOEM-COMPLEX_ID'][istruc]),\ - ERG_complex_crosswalk['Year.2'] == 2011)) - if np.size(imatch) >0: - imatch = imatch[0][0] - GOADS_emissions.loc[istruc,'LEASE_TYPE'] = ERG_complex_crosswalk['Oil Gas Defn FINAL.1'][imatch] - GOADS_emissions.loc[istruc,'MAJOR_STRUC'] = ERG_complex_crosswalk['Major / Minor.1'][imatch] - else: - print(istruc, GOADS_emissions['BOEM-COMPLEX_ID'][istruc]) + # Join locations, emissions, and complex types together + federal_gom_offshore_2011 = (GOADS_emissions + .set_index("boem_complex_id") + .join(ERG_complex_crosswalk_2011.set_index("boem_complex_id")) + .reset_index() + .set_index("strStateFacilityIdentifier") + .join(GOADS_locations_Unique.set_index("strStateFacilityIdentifier")) + .reset_index() + .astype({"BOEM-MONTH": str}) + .assign(month=lambda df: df['BOEM-MONTH'].astype(str).str.zfill(2)) + .assign(state_code='FO') + .drop(columns={'strStateFacilityIdentifier', 'BOEM-MONTH'}) + ) + federal_gom_offshore_2011_gdf = ( + gpd.GeoDataFrame( + federal_gom_offshore_2011, + geometry=gpd.points_from_xy( + federal_gom_offshore_2011["lon"], + federal_gom_offshore_2011["lat"], + crs=4326 + ) + ) + .drop(columns=["lat", "lon"]) + .loc[:, ["boem_complex_id", "year", "month", "state_code", "Emis_tg", "geometry", "oil_gas_defn"]] + ) - # for all gas platforms, match the platform to the emissions - if GOADS_emissions['LEASE_TYPE'][istruc] =='Gas': - match_platform = np.where(GOADS_locations_Unique.strStateFacilityIdentifier==GOADS_emissions['strStateFacilityIdentifier'][istruc])[0][0] - ilat = int((GOADS_locations_Unique['lat'][match_platform] - Lat_low)/Res01) - ilon = int((GOADS_locations_Unique['lon'][match_platform] - Lon_left)/Res01) - imonth = GOADS_emissions['BOEM-MONTH'][istruc]-1 #dict is 1-12, not 0-11 - if GOADS_emissions['MAJOR_STRUC'][istruc] =='Major': - Map_GOADSmajor_emissions[ilat,ilon,iyear,imonth] += GOADS_emissions['Emis_tg'][istruc] - else: - Map_GOADSminor_emissions[ilat,ilon,iyear,imonth] += GOADS_emissions['Emis_tg'][istruc] - - - # sum complexes and emissions for diagnostic - majcplx = GOADS_emissions[(GOADS_emissions['MAJOR_STRUC']=='Major')] - majcplx = majcplx[majcplx['LEASE_TYPE'] =='Gas'] - num_majcplx = majcplx['BOEM-COMPLEX_ID'].unique() - #print(np.shape(num_majcplx)) - mincplx = 
GOADS_emissions[GOADS_emissions['MAJOR_STRUC']=='Minor'] - mincplx = mincplx[mincplx['LEASE_TYPE'] =='Gas'] - num_mincplx = mincplx['BOEM-COMPLEX_ID'].unique() - #print(np.size(num_mincplx)) - del GOADS_emissions - print('Number of Major Gas Complexes: ',(np.size(num_majcplx))) - print('Emissions (Tg): ',np.sum(Map_GOADSmajor_emissions[:,:,iyear,:])) - print('Number of Minor Gas Complexes: ',(np.size(num_mincplx))) - print('Emissions (Tg): ',np.sum(Map_GOADSminor_emissions[:,:,iyear,:])) + # Separate out ng and oil + ng_federal_gom_offshore_2011_gdf = (federal_gom_offshore_2011_gdf + .query("oil_gas_defn == 'Gas'") + .assign(rel_emi=lambda df: df.groupby(["state_code", "year"])['Emis_tg'].transform(lambda x: x / x.sum() if x.sum() > 0 else 0)) + .drop(columns={'Emis_tg', 'oil_gas_defn'}) + .reset_index(drop=True) + ) + oil_federal_gom_offshore_2011_gdf = (federal_gom_offshore_2011_gdf + .query("oil_gas_defn == 'Oil'") + .assign(rel_emi=lambda df: df.groupby(["state_code", "year"])['Emis_tg'].transform(lambda x: x / x.sum() if x.sum() > 0 else 0)) + .drop(columns={'Emis_tg', 'oil_gas_defn'}) + .reset_index(drop=True) + ) + # 2014 GOADS Data + # Read In and Format 2014 BEOM Data + GOADS_14_inputfile = str(GOADS_14_path) + driver_str = r'Driver={Microsoft Access Driver (*.mdb, *.accdb)};DBQ='+GOADS_14_inputfile+';''' + conn = pyodbc.connect(driver_str) + GOADS_emissions = pd.read_sql("SELECT * FROM 2014_Gulfwide_Platform_20161102", conn) + conn.close() + # Format Emissions Data (clean lease data string) + GOADS_emissions = GOADS_emissions[["X_COORDINATE", "Y_COORDINATE", "POLLUTANT_CODE", + "EMISSIONS_VALUE", "MONTH", "COMPLEX_ID"]] + GOADS_emissions = (GOADS_emissions + .query("POLLUTANT_CODE == 'CH4'") + .assign(Emis_tg = 0.0) + .assign(Emis_tg = lambda df: 9.0718474E-7 * df['EMISSIONS_VALUE']) #convert short tons to Tg + .rename(columns={"COMPLEX_ID": "boem_complex_id"}) + .astype({"boem_complex_id": int}) + .drop(columns={"POLLUTANT_CODE", "EMISSIONS_VALUE"}) + .replace('', np.nan) + .dropna() + .reset_index(drop=True) + ) + # Select 2014 data from ERG complex crosswalk + ERG_complex_crosswalk_2014 = ERG_complex_crosswalk.copy().query('year == 2014').reset_index(drop=True) - gom_data_years = ['2011', '2014', '2017', '2021'] - for idatayear in gom_data_years: - gom_file_name = f"{idatayear}_Gulfwide_Platform_Inventory.accdb" - gom_file_path = os.path.join(boem_data_directory_path, gom_file_name) - driver_str = r'Driver={Microsoft Access Driver (*.mdb, *.accdb)};DBQ='+gom_file_path+';''' - conn = pyodbc.connect(driver_str) - GOADS_locations = pd.read_sql("SELECT * FROM tblPointER", conn) - GOADS_emissions = pd.read_sql("SELECT * FROM tblPointEM", conn) - conn.close() - - # Format Location Data - GOADS_locations = GOADS_locations[["strStateFacilityIdentifier","strEmissionReleasePointID","dblXCoordinate","dblYCoordinate"]] - #Create platform-by-platform file - GOADS_locations_Unique = pd.DataFrame({'strStateFacilityIdentifier':GOADS_locations['strStateFacilityIdentifier'].unique()}) - GOADS_locations_Unique['lon'] = 0.0 - GOADS_locations_Unique['lat'] = 0.0 - GOADS_locations_Unique['strEmissionReleasePointID'] = '' + # Join locations, emissions, and complex types together + federal_gom_offshore_2014 = (GOADS_emissions + .set_index("boem_complex_id") + .join(ERG_complex_crosswalk_2014.set_index("boem_complex_id")) + .reset_index() + .astype({"MONTH": str}) + .assign(state_code='FO') + .rename(columns={'X_COORDINATE': 'lon', 'Y_COORDINATE': 'lat', 'MONTH': 'month'}) + ) + + # Correct months to 
be numeric digits + month_to_mm_df = pd.DataFrame( + {'month': ['January', 'February', 'March', 'April', 'May', 'June', 'July', + 'August', 'September', 'October', 'November', 'December'], + 'mm': ['01', '02', '03', '04', '05', '06', '07', '08', '09', '10', '11', '12'] + }) + federal_gom_offshore_2014 = (federal_gom_offshore_2014 + .merge(month_to_mm_df, how='left') + .drop(columns='month') + .rename(columns={'mm': 'month'}) + ) - for iplatform in np.arange(len(GOADS_locations_Unique)): - match_platform = np.where(GOADS_locations['strStateFacilityIdentifier'] == GOADS_locations_Unique['strStateFacilityIdentifier'][iplatform])[0][0] - GOADS_locations_Unique.loc[iplatform,'lon',] = GOADS_locations['dblXCoordinate'][match_platform] - GOADS_locations_Unique.loc[iplatform,'lat',] = GOADS_locations['dblYCoordinate'][match_platform] - GOADS_locations_Unique.loc[iplatform,'strEmissionReleasePointID'] = GOADS_locations['strEmissionReleasePointID'][match_platform][:3] + federal_gom_offshore_2014_gdf = ( + gpd.GeoDataFrame( + federal_gom_offshore_2014, + geometry=gpd.points_from_xy( + federal_gom_offshore_2014["lon"], + federal_gom_offshore_2014["lat"], + crs=4326 + ) + ) + .drop(columns=["lat", "lon"]) + .loc[:, ["boem_complex_id", "year", "month", "state_code", "Emis_tg", "geometry", "oil_gas_defn"]] + ) - GOADS_locations_Unique.reset_index(inplace=True, drop=True) - #display(GOADS_locations_Unique) + # Separate out ng and oil + ng_federal_gom_offshore_2014_gdf = (federal_gom_offshore_2014_gdf + .query("oil_gas_defn == 'Gas'") + .assign(rel_emi=lambda df: df.groupby(["state_code", "year"])['Emis_tg'].transform(lambda x: x / x.sum() if x.sum() > 0 else 0)) + .drop(columns={'Emis_tg', 'oil_gas_defn'}) + .reset_index(drop=True) + ) + oil_federal_gom_offshore_2014_gdf = (federal_gom_offshore_2014_gdf + .query("oil_gas_defn == 'Oil'") + .assign(rel_emi=lambda df: df.groupby(["state_code", "year"])['Emis_tg'].transform(lambda x: x / x.sum() if x.sum() > 0 else 0)) + .drop(columns={'Emis_tg', 'oil_gas_defn'}) + .reset_index(drop=True) + ) - #print(GOADS_emissions.columns) - #Format Emissions Data (clean lease data string) - GOADS_emissions = GOADS_emissions[["strStateFacilityIdentifier","strPollutantCode","dblEmissionNumericValue","BOEM-MONTH", - "BOEM-LEASE_NUM","BOEM-COMPLEX_ID"]] - GOADS_emissions['BOEM-LEASE_NUM'] = GOADS_emissions['BOEM-LEASE_NUM'].str.replace('OCS','') - GOADS_emissions['BOEM-LEASE_NUM'] = GOADS_emissions['BOEM-LEASE_NUM'].str.replace('-','') - GOADS_emissions['BOEM-LEASE_NUM'] = GOADS_emissions['BOEM-LEASE_NUM'].str.replace(' ','') - GOADS_emissions['BOEM-LEASE_NUM'] = GOADS_emissions['BOEM-LEASE_NUM'].str.replace('G1477','G01477') - GOADS_emissions['BOEM-LEASE_NUM'] = GOADS_emissions['BOEM-LEASE_NUM'].str.replace('G73','00073') - GOADS_emissions['BOEM-LEASE_NUM'] = GOADS_emissions['BOEM-LEASE_NUM'].str.replace('G605','00605') - GOADS_emissions['BOEM-LEASE_NUM'] = GOADS_emissions['BOEM-LEASE_NUM'].str.replace('G72','00072') - GOADS_emissions['BOEM-LEASE_NUM'] = GOADS_emissions['BOEM-LEASE_NUM'].str.replace('G599','00599') - GOADS_emissions['BOEM-LEASE_NUM'] = GOADS_emissions['BOEM-LEASE_NUM'].str.replace('G7155','G07155') - GOADS_emissions['BOEM-LEASE_NUM'] = GOADS_emissions['BOEM-LEASE_NUM'].str.replace('G2357','G02357') - GOADS_emissions['BOEM-LEASE_NUM'] = GOADS_emissions['BOEM-LEASE_NUM'].str.replace('G4921','G04921') - GOADS_emissions['Emis_tg'] = 0.0 - GOADS_emissions['Emis_tg'] = 9.0718474E-7 * GOADS_emissions['dblEmissionNumericValue'] #convert short tons to Tg - 
GOADS_emissions = GOADS_emissions[GOADS_emissions['strPollutantCode'] == 'CH4'] - GOADS_emissions.reset_index(inplace=True, drop=True) + # 2017 GOADS Data + # Read In and Format 2017 BEOM Data + GOADS_17_inputfile = str(GOADS_17_path) + driver_str = r'Driver={Microsoft Access Driver (*.mdb, *.accdb)};DBQ='+GOADS_17_inputfile+';''' + conn = pyodbc.connect(driver_str) + GOADS_emissions = pd.read_sql("SELECT * FROM 2017_Gulfwide_Platform_20190705_CAP_GHG", conn) + conn.close() - #display(GOADS_emissions) + # Format Emissions Data (clean lease data string) + GOADS_emissions = GOADS_emissions[["X_COORDINATE", "Y_COORDINATE", "POLLUTANT_CODE", + "EMISSIONS_VALUE", "Month", "COMPLEX_ID"]] + GOADS_emissions = (GOADS_emissions + .query("POLLUTANT_CODE == 'CH4'") + .assign(Emis_tg = 0.0) + .assign(Emis_tg = lambda df: 9.0718474E-7 * df['EMISSIONS_VALUE']) #convert short tons to Tg + .rename(columns={"COMPLEX_ID": "boem_complex_id"}) + .astype({"boem_complex_id": int}) + .drop(columns={"POLLUTANT_CODE", "EMISSIONS_VALUE"}) + .replace('', np.nan) + .dropna() + .reset_index(drop=True) + ) - # Use ERG Preprocessed data to determine if major or minor and oil or gas - ERG_complex_crosswalk = pd.read_excel(ERG_GOADSEmissions_inputfile, sheet_name = "Complex Emissions by Source", usecols = "AJ:AM", nrows = 11143) + # Select 2017 data from ERG complex crosswalk + ERG_complex_crosswalk_2017 = ERG_complex_crosswalk.copy().query('year == 2017').reset_index(drop=True) - # add data to map array, for the closest year to 2011 - year_diff = [abs(x - 2011) for x in year_range] - iyear = year_diff.index(min(year_diff)) + # Join locations, emissions, and complex types together + federal_gom_offshore_2017 = (GOADS_emissions + .set_index("boem_complex_id") + .join(ERG_complex_crosswalk_2017.set_index("boem_complex_id")) + .reset_index() + .astype({"Month": str}) + .assign(state_code='FO') + .rename(columns={'X_COORDINATE': 'lon', 'Y_COORDINATE': 'lat', 'Month': 'month'}) + ) + + # Correct months to be numeric digits + month_to_mm_df = pd.DataFrame( + {'month': ['January', 'February', 'March', 'April', 'May', 'June', 'July', + 'August', 'September', 'October', 'November', 'December'], + 'mm': ['01', '02', '03', '04', '05', '06', '07', '08', '09', '10', '11', '12'] + }) + federal_gom_offshore_2017 = (federal_gom_offshore_2017 + .merge(month_to_mm_df, how='left') + .drop(columns='month') + .rename(columns={'mm': 'month'}) + ) - #assign oil vs gas by lease/complex ID - GOADS_emissions['LEASE_TYPE'] ='' - GOADS_emissions['MAJOR_STRUC'] ='' - for istruc in np.arange(0,len(GOADS_emissions)): - imatch = np.where(np.logical_and(ERG_complex_crosswalk['BOEM COMPLEX ID.2']==int(GOADS_emissions['BOEM-COMPLEX_ID'][istruc]),\ - ERG_complex_crosswalk['Year.2'] == 2011)) - if np.size(imatch) >0: - imatch = imatch[0][0] - GOADS_emissions.loc[istruc,'LEASE_TYPE'] = ERG_complex_crosswalk['Oil Gas Defn FINAL.1'][imatch] - GOADS_emissions.loc[istruc,'MAJOR_STRUC'] = ERG_complex_crosswalk['Major / Minor.1'][imatch] - else: - print(istruc, GOADS_emissions['BOEM-COMPLEX_ID'][istruc]) + federal_gom_offshore_2017_gdf = ( + gpd.GeoDataFrame( + federal_gom_offshore_2017, + geometry=gpd.points_from_xy( + federal_gom_offshore_2017["lon"], + federal_gom_offshore_2017["lat"], + crs=4326 + ) + ) + .drop(columns=["lat", "lon"]) + .loc[:, ["boem_complex_id", "year", "month", "state_code", "Emis_tg", "geometry", "oil_gas_defn"]] + ) - # for all gas platforms, match the platform to the emissions - if GOADS_emissions['LEASE_TYPE'][istruc] =='Gas': - 
match_platform = np.where(GOADS_locations_Unique.strStateFacilityIdentifier==GOADS_emissions['strStateFacilityIdentifier'][istruc])[0][0]
-            ilat = int((GOADS_locations_Unique['lat'][match_platform] - Lat_low)/Res01)
-            ilon = int((GOADS_locations_Unique['lon'][match_platform] - Lon_left)/Res01)
-            imonth = GOADS_emissions['BOEM-MONTH'][istruc]-1 #dict is 1-12, not 0-11
-            if GOADS_emissions['MAJOR_STRUC'][istruc] =='Major':
-                Map_GOADSmajor_emissions[ilat,ilon,iyear,imonth] += GOADS_emissions['Emis_tg'][istruc]
-            else:
-                Map_GOADSminor_emissions[ilat,ilon,iyear,imonth] += GOADS_emissions['Emis_tg'][istruc]
-
-
-    # sum complexes and emissions for diagnostic
-    majcplx = GOADS_emissions[(GOADS_emissions['MAJOR_STRUC']=='Major')]
-    majcplx = majcplx[majcplx['LEASE_TYPE'] =='Gas']
-    num_majcplx = majcplx['BOEM-COMPLEX_ID'].unique()
-    #print(np.shape(num_majcplx))
-    mincplx = GOADS_emissions[GOADS_emissions['MAJOR_STRUC']=='Minor']
-    mincplx = mincplx[mincplx['LEASE_TYPE'] =='Gas']
-    num_mincplx = mincplx['BOEM-COMPLEX_ID'].unique()
-    #print(np.size(num_mincplx))
-    del GOADS_emissions
-    print('Number of Major Gas Complexes: ',(np.size(num_majcplx)))
-    print('Emissions (Tg): ',np.sum(Map_GOADSmajor_emissions[:,:,iyear,:]))
-    print('Number of Minor Gas Complexes: ',(np.size(num_mincplx)))
-    print('Emissions (Tg): ',np.sum(Map_GOADSminor_emissions[:,:,iyear,:]))
+    # Separate out ng and oil
+    ng_federal_gom_offshore_2017_gdf = (federal_gom_offshore_2017_gdf
+                                        .query("oil_gas_defn == 'Gas'")
+                                        .assign(rel_emi=lambda df: df.groupby(["state_code", "year"])['Emis_tg'].transform(lambda x: x / x.sum() if x.sum() > 0 else 0))
+                                        .drop(columns={'Emis_tg', 'oil_gas_defn'})
+                                        .reset_index(drop=True)
+                                        )
+    oil_federal_gom_offshore_2017_gdf = (federal_gom_offshore_2017_gdf
+                                         .query("oil_gas_defn == 'Oil'")
+                                         .assign(rel_emi=lambda df: df.groupby(["state_code", "year"])['Emis_tg'].transform(lambda x: x / x.sum() if x.sum() > 0 else 0))
+                                         .drop(columns={'Emis_tg', 'oil_gas_defn'})
+                                         .reset_index(drop=True)
+                                         )
+
+    # Build complete proxy (2012-2022)
+    ng_federal_gom_offshore_gdf = gpd.GeoDataFrame()
+    oil_federal_gom_offshore_gdf = gpd.GeoDataFrame()
+    for iyear in years:
+        data_year = federal_gom_offshore_data_years[federal_gom_offshore_data_years['year'] == iyear]['goads_data'].values[0]
+        if data_year == 2011:
+            ng_temp_data = (ng_federal_gom_offshore_2011_gdf
+                            .copy()
+                            .assign(year = iyear)
+                            .assign(year_month=lambda df: df['year'].astype(str)+'_'+df['month'])
+                            )
+            oil_temp_data = (oil_federal_gom_offshore_2011_gdf
+                             .copy()
+                             .assign(year = iyear)
+                             .assign(year_month=lambda df: df['year'].astype(str)+'_'+df['month'])
+                             )
+        if data_year == 2014:
+            ng_temp_data = (ng_federal_gom_offshore_2014_gdf
+                            .copy()
+                            .assign(year = iyear)
+                            .assign(year_month=lambda df: df['year'].astype(str)+'_'+df['month'])
+                            )
+            oil_temp_data = (oil_federal_gom_offshore_2014_gdf
+                             .copy()
+                             .assign(year = iyear)
+                             .assign(year_month=lambda df: df['year'].astype(str)+'_'+df['month'])
+                             )
+        if data_year == 2017:
+            ng_temp_data = (ng_federal_gom_offshore_2017_gdf
+                            .copy()
+                            .assign(year = iyear)
+                            .assign(year_month=lambda df: df['year'].astype(str)+'_'+df['month'])
+                            )
+            oil_temp_data = (oil_federal_gom_offshore_2017_gdf
+                             .copy()
+                             .assign(year = iyear)
+                             .assign(year_month=lambda df: df['year'].astype(str)+'_'+df['month'])
+                             )
+        ng_federal_gom_offshore_gdf = pd.concat([ng_federal_gom_offshore_gdf, ng_temp_data])
+        oil_federal_gom_offshore_gdf = pd.concat([oil_federal_gom_offshore_gdf, oil_temp_data])
-    # Create proxy gdf
-    proxy_gdf = (
-        gpd.GeoDataFrame(
-            gb_stations_df,
-
geometry=gpd.points_from_xy( - gb_stations_df["lon"], - gb_stations_df["lat"], - crs=4326, - ), - ) - .drop(columns=["lat", "lon"]) - .loc[:, ["facility_name", "state_code", "geometry"]] - ) + ng_federal_gom_offshore_gdf = (ng_federal_gom_offshore_gdf + .loc[:, ["boem_complex_id", "year", "month", + "year_month", "state_code", "geometry", + "rel_emi"]] + .reset_index(drop=True) + ) + oil_federal_gom_offshore_gdf = (oil_federal_gom_offshore_gdf + .loc[:, ["boem_complex_id", "year", "month", + "year_month", "state_code", "geometry", + "rel_emi"]] + .reset_index(drop=True) + ) + + ng_federal_gom_offshore_gdf.to_parquet(ng_output_path) + oil_federal_gom_offshore_gdf.to_parquet(oil_output_path) - proxy_gdf.to_parquet(output_path) return None From e6a8aa3f484985638d369e0003f9447e76b2d2d9 Mon Sep 17 00:00:00 2001 From: Hannah Lohman <68960449+haclohman@users.noreply.github.com> Date: Fri, 20 Dec 2024 13:55:32 -0500 Subject: [PATCH 6/6] Oil and NG exploration and production proxies Complete code for oil and natural gas exploration and production proxies --- .../ng_oil_production_utils.py | 275 +++++ .../task_enverus_di_prism_data_processing.py | 305 +++++ .../task_ng_all_well_count_proxy.py | 175 +++ .../task_ng_all_well_prod_proxy.py | 176 +++ .../task_ng_basin_220_prod_proxy.py | 153 +++ .../task_ng_basin_395_prod_proxy.py | 153 +++ .../task_ng_basin_430_prod_proxy.py | 153 +++ .../task_ng_basin_other_prod_proxy.py | 177 +++ .../task_ng_conv_well_comp_proxy.py | 179 +++ .../task_ng_conv_well_count_proxy.py | 176 +++ .../task_ng_drilled_well_proxy.py | 180 +++ .../task_ng_hf_well_comp_proxy.py | 179 +++ .../task_ng_hf_well_count_proxy.py | 177 +++ ...task_ng_oil_federal_gom_offshore_proxy.py} | 4 +- .../task_ng_oil_state_gom_offshore_proxy.py | 278 +++++ .../task_ng_production_proxy.py | 1007 ----------------- .../task_ng_water_prod_proxy.py | 194 ++++ .../task_ng_well_blowout_proxy.py | 75 ++ .../task_oil_all_well_count_proxy.py | 176 +++ .../task_oil_all_well_prod_proxy.py | 177 +++ .../task_oil_basin_220_prod_proxy.py | 154 +++ .../task_oil_basin_360_prod_proxy.py | 154 +++ .../task_oil_basin_395_prod_proxy.py | 154 +++ .../task_oil_basin_430_prod_proxy.py | 154 +++ .../task_oil_basin_other_prod_proxy.py | 178 +++ .../task_oil_conv_well_comp_proxy.py | 180 +++ .../task_oil_conv_well_count_proxy.py | 177 +++ .../task_oil_drilled_well_proxy.py | 181 +++ .../task_oil_hf_well_comp_proxy.py | 180 +++ .../task_oil_hf_well_count_proxy.py | 177 +++ .../task_oil_water_prod_proxy.py | 195 ++++ .../task_oil_well_avg_proxy.py | 69 ++ 32 files changed, 5313 insertions(+), 1009 deletions(-) create mode 100644 gch4i/proxy_processing/ng_oil_production_utils.py create mode 100644 gch4i/proxy_processing/task_enverus_di_prism_data_processing.py create mode 100644 gch4i/proxy_processing/task_ng_all_well_count_proxy.py create mode 100644 gch4i/proxy_processing/task_ng_all_well_prod_proxy.py create mode 100644 gch4i/proxy_processing/task_ng_basin_220_prod_proxy.py create mode 100644 gch4i/proxy_processing/task_ng_basin_395_prod_proxy.py create mode 100644 gch4i/proxy_processing/task_ng_basin_430_prod_proxy.py create mode 100644 gch4i/proxy_processing/task_ng_basin_other_prod_proxy.py create mode 100644 gch4i/proxy_processing/task_ng_conv_well_comp_proxy.py create mode 100644 gch4i/proxy_processing/task_ng_conv_well_count_proxy.py create mode 100644 gch4i/proxy_processing/task_ng_drilled_well_proxy.py create mode 100644 gch4i/proxy_processing/task_ng_hf_well_comp_proxy.py create mode 100644 
gch4i/proxy_processing/task_ng_hf_well_count_proxy.py rename gch4i/proxy_processing/{federal_gom_offshore_proxy.py => task_ng_oil_federal_gom_offshore_proxy.py} (99%) create mode 100644 gch4i/proxy_processing/task_ng_oil_state_gom_offshore_proxy.py delete mode 100644 gch4i/proxy_processing/task_ng_production_proxy.py create mode 100644 gch4i/proxy_processing/task_ng_water_prod_proxy.py create mode 100644 gch4i/proxy_processing/task_ng_well_blowout_proxy.py create mode 100644 gch4i/proxy_processing/task_oil_all_well_count_proxy.py create mode 100644 gch4i/proxy_processing/task_oil_all_well_prod_proxy.py create mode 100644 gch4i/proxy_processing/task_oil_basin_220_prod_proxy.py create mode 100644 gch4i/proxy_processing/task_oil_basin_360_prod_proxy.py create mode 100644 gch4i/proxy_processing/task_oil_basin_395_prod_proxy.py create mode 100644 gch4i/proxy_processing/task_oil_basin_430_prod_proxy.py create mode 100644 gch4i/proxy_processing/task_oil_basin_other_prod_proxy.py create mode 100644 gch4i/proxy_processing/task_oil_conv_well_comp_proxy.py create mode 100644 gch4i/proxy_processing/task_oil_conv_well_count_proxy.py create mode 100644 gch4i/proxy_processing/task_oil_drilled_well_proxy.py create mode 100644 gch4i/proxy_processing/task_oil_hf_well_comp_proxy.py create mode 100644 gch4i/proxy_processing/task_oil_hf_well_count_proxy.py create mode 100644 gch4i/proxy_processing/task_oil_water_prod_proxy.py create mode 100644 gch4i/proxy_processing/task_oil_well_avg_proxy.py diff --git a/gch4i/proxy_processing/ng_oil_production_utils.py b/gch4i/proxy_processing/ng_oil_production_utils.py new file mode 100644 index 0000000..2b89ed8 --- /dev/null +++ b/gch4i/proxy_processing/ng_oil_production_utils.py @@ -0,0 +1,275 @@ +# %% +from pathlib import Path +import os +from typing import Annotated +from zipfile import ZipFile +import calendar +import datetime + +from pyarrow import parquet +import pandas as pd +import osgeo +import geopandas as gpd +import numpy as np +import seaborn as sns +import shapefile as shp +from pytask import Product, task, mark + +from gch4i.config import ( + V3_DATA_PATH, + proxy_data_dir_path, + global_data_dir_path, + sector_data_dir_path, + max_year, + min_year, + years, +) + +from gch4i.utils import us_state_to_abbrev + +# File Paths +state_path: Path = global_data_dir_path / "tl_2020_us_state.zip" +enverus_production_path: Path = sector_data_dir_path / "enverus/production" +intermediate_outputs_path: Path = enverus_production_path / "intermediate_outputs" +nei_path: Path = sector_data_dir_path / "nei_og" + +# State ANSI data +state_gdf = ( + gpd.read_file(state_path) + .loc[:, ["NAME", "STATEFP", "STUSPS", "geometry"]] + .rename(columns=str.lower) + .rename(columns={"stusps": "state_code", "name": "state_name"}) + .astype({"statefp": int}) + # get only lower 48 + DC + .query("(statefp < 60) & (statefp != 2) & (statefp != 15)") + .reset_index(drop=True) + .to_crs(4326) +) + + +# Function to calculate relative emissions for Enverus data +def calc_enverus_rel_emi(df): + df['rel_emi'] = df.groupby(["state_code", "year"])['proxy_data'].transform(lambda x: x / x.sum() if x.sum() > 0 else 0) + df = df.drop(columns='proxy_data') + return df + + +# function to format proxy data into geodataframes +def enverus_df_to_gdf(df): + gdf = ( + gpd.GeoDataFrame( + df, + geometry=gpd.points_from_xy( + df["longitude"], + df["latitude"], + crs=4326 + ) + ) + .drop(columns=["latitude", "longitude"]) + .loc[:, ["year", "year_month", "state_code", "rel_emi", "geometry"]] + ) + return gdf + + 
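A quick usage sketch of the two helpers above (illustrative only: the sample frame and its values are invented, and calc_enverus_rel_emi / enverus_df_to_gdf are assumed to be in scope as defined in this module):

    import pandas as pd

    sample = pd.DataFrame({
        "year": [2020, 2020, 2020],
        "year_month": ["2020-01", "2020-01", "2020-01"],
        "state_code": ["TX", "TX", "NM"],
        "latitude": [31.9, 32.1, 32.6],
        "longitude": [-102.1, -102.3, -103.4],
        "proxy_data": [50.0, 150.0, 75.0],  # e.g. monthly gas production (invented)
    })

    # normalize proxy_data to shares that sum to 1 within each state-year
    rel = calc_enverus_rel_emi(sample)  # TX rows -> 0.25 / 0.75, NM row -> 1.0
    # attach point geometries and keep the standard proxy schema
    gdf = enverus_df_to_gdf(rel)        # year, year_month, state_code, rel_emi, geometry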
+# NEI FIPS codes
+fips_codes_df = pd.DataFrame({'state_code': ['IL', 'IN', 'KS', 'OK', 'PA', 'WV'],
+                              'fips_code': ['17', '18', '20', '40', '42', '54']})
+
+# NEI data year assignments
+# All years use the data affiliated with their year except the following exceptions:
+    # 2012: use 2011 data
+    # 2013: use 2014 data
+    # 2015: use 2014 data
+    # 2016: use 2017 data
+nei_data_years = pd.DataFrame(
+    {'year': [2012, 2013, 2014, 2015, 2016, 2017, 2018, 2019, 2020, 2021, 2022],
+     'nei_data': [2011, 2014, 2014, 2014, 2017, 2017, 2018, 2019, 2020, 2021, 2022]
+     })
+
+# NEI text file and shapefile names:
+# Natural Gas Well Counts
+ng_well_count_file_names = pd.DataFrame({
+    'data_year': [2011, 2014, 2017, 2018, 2019, 2020, 2021, 2022],
+    'file_name': ['USA_698_NOFILL.txt', 'USA_698_NOFILL.txt', 'USA_698_NOFILL.txt',
+                  'GAS_WELLS', 'GAS_WELLS', 'GAS_WELL', '_698', 'GasWells'],
+    })
+# Natural Gas Well-Level Production Volumes
+ng_gas_prod_file_names = pd.DataFrame({
+    'data_year': [2011, 2014, 2017, 2018, 2019, 2020, 2021, 2022],
+    'file_name': ['USA_696_NOFILL.txt', 'USA_696_NOFILL.txt', 'USA_696_NOFILL.txt',
+                  'GAS_PRODUCTION', 'GAS_PRODUCTION', 'GAS_PRODUCTION', '_696', 'GasProduction'],
+    })
+# Natural Gas Water Production Volumes
+ng_water_prod_file_names = pd.DataFrame({
+    'data_year': [2011, 2014, 2017, 2018, 2019, 2020, 2021, 2022],
+    'file_name': ['USA_6832_NOFILL.txt', 'USA_6832_NOFILL.txt', 'USA_6832_NOFILL.txt',
+                  'PRODUCED_WATER_GAS', 'PRODUCED_WATER_GAS', 'PRODUCED_WATER_GAS', '_6832', 'ProducedWaterGasWells'],
+    })
+# Natural Gas Well Completions
+ng_comp_count_file_names = pd.DataFrame({
+    'data_year': [2011, 2014, 2017, 2018, 2019, 2020, 2021, 2022],
+    'file_name': ['USA_678_NOFILL.txt', 'USA_678_NOFILL.txt', 'USA_678_NOFILL.txt',
+                  'COMPLETIONS_GAS', 'COMPLETIONS_GAS', 'COMPLETIONS_GAS', '_678', 'GasWellCompletions'],
+    })
+# Drilled Natural Gas Wells
+ng_spud_count_file_names = pd.DataFrame({
+    'data_year': [2011, 2014, 2017, 2018, 2019, 2020, 2021, 2022],
+    'file_name': ['USA_671_NOFILL.txt', 'USA_671_NOFILL.txt', 'USA_671_NOFILL.txt',
+                  'SPUD_GAS', 'SPUD_GAS', 'SPUD_GAS', '_671', 'SpudCountGasWells'],
+    })
+# Oil Well Counts
+oil_well_count_file_names = pd.DataFrame({
+    'data_year': [2011, 2014, 2017, 2018, 2019, 2020, 2021, 2022],
+    'file_name': ['USA_695_NOFILL.txt', 'USA_695_NOFILL.txt', 'USA_695_NOFILL.txt',
+                  'OIL_WELLS', 'OIL_WELLS', 'OIL_WELL', '_695', 'OILWells'],
+    })
+# Oil Well-Level Production Volumes
+oil_oil_prod_file_names = pd.DataFrame({
+    'data_year': [2011, 2014, 2017, 2018, 2019, 2020, 2021, 2022],
+    'file_name': ['USA_694_NOFILL.txt', 'USA_694_NOFILL.txt', 'USA_694_NOFILL.txt',
+                  'OIL_PRODUCTION', 'OIL_PRODUCTION', 'OIL_PRODUCTION', '_694', 'OilProduction'],
+    })
+# Oil Water Production Volumes
+oil_water_prod_file_names = pd.DataFrame({
+    'data_year': [2011, 2014, 2017, 2018, 2019, 2020, 2021, 2022],
+    'file_name': ['USA_6833_NOFILL.txt', 'USA_6833_NOFILL.txt', 'USA_6833_NOFILL.txt',
+                  'PRODUCED_WATER_OIL', 'PRODUCED_WATER_OIL', 'PRODUCED_WATER_OIL', '_6833', 'ProducedWaterOilWells'],
+    })
+# Oil Well Completions
+oil_comp_count_file_names = pd.DataFrame({
+    'data_year': [2011, 2014, 2017, 2018, 2019, 2020, 2021, 2022],
+    'file_name': ['USA_685_NOFILL.txt', 'USA_685_NOFILL.txt', 'USA_685_NOFILL.txt',
+                  'COMPLETIONS_OIL', 'COMPLETIONS_OIL', 'COMPLETIONS_OIL', '_685', 'OilWellCompletions'],
+    })
+# Drilled Oil Wells
+oil_spud_count_file_names = pd.DataFrame({
+    'data_year': [2011, 2014, 2017, 2018, 2019, 2020, 2021, 2022],
+    'file_name': ['USA_681_NOFILL.txt', 'USA_681_NOFILL.txt', 'USA_681_NOFILL.txt',
+                  'SPUD_OIL', 'SPUD_OIL', 'SPUD_OIL', '_681', 'SpudCountOilWells'],
+    })
+
+
+# Function to get the specific file name for a given year
+def get_nei_file_name(nei_data_year, nei_file_names):
+    nei_file_name = nei_file_names[nei_file_names['data_year'] == nei_data_year]['file_name'].values[0]
+    return nei_file_name
+
+
+# Function to get raw NEI textfile and shapefile data for the specific proxy of interest
+def get_raw_NEI_data(ghgi_year, data_year, file_name):
+    if data_year <= 2017:
+        # NEI textfile data (data_year <= 2017: the 2011, 2014, and 2017 files)
+        nei_textfile_name = f"CONUS_SA_FILES_{data_year}/{file_name}"
+        nei_textfile_path = os.path.join(nei_path, nei_textfile_name)
+        data_temp = pd.read_csv(nei_textfile_path, sep='\t', skiprows = 25)
+        data_temp = data_temp.drop(["!"], axis=1)
+        data_temp.columns = ['Code','FIPS','COL','ROW','Frac','Abs','FIPS_Total','FIPS_Running_Sum']
+        data_temp = data_temp.astype({"FIPS": str})
+        # if water production data (gas: 6832, oil: 6833)
+        if file_name == 'USA_6832_NOFILL.txt' or file_name == 'USA_6833_NOFILL.txt':
+            if data_year < 2016:
+                data_temp = (data_temp
+                             # query states: IL, IN, KS, OK, PA, WV
+                             .query("FIPS.str.startswith('17') | FIPS.str.startswith('18') | FIPS.str.startswith('20') | FIPS.str.startswith('40') | FIPS.str.startswith('42') | FIPS.str.startswith('54')")
+                             .reset_index(drop=True)
+                             )
+                colmax = data_temp['COL'].max()
+                colmin = data_temp['COL'].min()
+                rowmax = data_temp['ROW'].max()
+                rowmin = data_temp['ROW'].min()
+            else:
+                data_temp = (data_temp
+                             # query states: IL, IN, KS, OK, PA
+                             .query("FIPS.str.startswith('17') | FIPS.str.startswith('18') | FIPS.str.startswith('20') | FIPS.str.startswith('40') | FIPS.str.startswith('42')")
+                             .reset_index(drop=True)
+                             )
+                colmax = data_temp['COL'].max()
+                colmin = data_temp['COL'].min()
+                rowmax = data_temp['ROW'].max()
+                rowmin = data_temp['ROW'].min()
+        # non-water production proxies (IL, IN)
+        else:
+            data_temp = (data_temp
+                         # query states: IL, IN
+                         .query("FIPS.str.startswith('17') | FIPS.str.startswith('18')")
+                         .reset_index(drop=True)
+                         )
+            colmax = data_temp['COL'].max()
+            colmin = data_temp['COL'].min()
+            rowmax = data_temp['ROW'].max()
+            rowmin = data_temp['ROW'].min()
+        # NEI reference grid shapefile with lat/lon locations
+        nei_reference_grid_path = os.path.join(nei_path, "NEI_Reference_Grid_LCC_to_WGS84_latlon.shp")
+        nei_reference_grid = (gpd.read_file(nei_reference_grid_path)
+                              .to_crs(4326))
+        nei_reference_grid = (nei_reference_grid
+                              .assign(cellid_column = nei_reference_grid.cellid.astype(str).str[0:4].astype(int))
+                              .assign(cellid_row = nei_reference_grid.cellid.astype(str).str[5:].astype(int))
+                              .query(f"cellid_column <= {colmax} & cellid_column >= {colmin}")
+                              .query(f"cellid_row <= {rowmax} & cellid_row >= {rowmin}")
+                              .reset_index(drop=True)
+                              )
+        # Match lat/lon locations from reference grid to nei data
+        for idx in np.arange(0, len(data_temp)):
+            # Add in lat/lon
+            icol = data_temp['COL'][idx]
+            irow = data_temp['ROW'][idx]
+            match = np.where((icol == nei_reference_grid.loc[:,'cellid_column']) & (irow == nei_reference_grid.loc[:,'cellid_row']))[0][0]
+            match = int(match)
+            # data_temp.loc[idx,'Lat'] = nei_reference_grid.loc[match, 'Latitude']
+            # data_temp.loc[idx,'Lon'] = nei_reference_grid.loc[match, 'Longitude']
+            data_temp.loc[idx,'geometry'] = nei_reference_grid.loc[match, 'geometry']
+            # Add in state_code
+            ifips = data_temp.loc[idx,'FIPS'][0:2]
+            data_temp.loc[idx,'state_code'] = fips_codes_df.loc[np.where(ifips == fips_codes_df.loc[:, 'fips_code'])[0][0],'state_code']
+        data_temp = data_temp[['state_code', 'Abs', 'geometry']]
+        data_temp = data_temp.rename(columns={'Abs':'activity_data'})
+
+    else:
+        # NEI shapefile data (data_year > 2017: 2018, 2019, 2020, 2021, 2022)
+        state_geometries = state_gdf[["state_code","geometry"]]
+        nei_file_name = f"CONUS_SA_FILES_{data_year}"
+        nei_file_path = os.path.join(nei_path, nei_file_name)
+        data_temp = gpd.read_file(nei_file_path, layer=file_name)
+        data_temp = data_temp.to_crs(4326)
+        data_temp = gpd.tools.sjoin(data_temp, state_gdf, how="left")
+
+        # water production data (IL, IN, KS, OK, PA)
+        if file_name == 'PRODUCED_WATER_GAS' or file_name == '_6832' or file_name == 'ProducedWaterGasWells':
+            states_to_query = ['IL', 'IN', 'KS', 'OK', 'PA']
+        # non-water production proxies (IL, IN)
+        else:
+            states_to_query = ['IL', 'IN']
+
+        # query relevant states
+        data_temp = data_temp.query('state_code.isin(@states_to_query)')
+
+        # grab activity data depending on column name (changes by year)
+        if data_year == 2018 or data_year == 2019 or data_year == 2020:
+            data_temp = data_temp[['state_code', 'ACTIVITY', 'geometry']]
+            data_temp = data_temp.rename(columns={'ACTIVITY':'activity_data'})
+        if data_year == 2021:
+            data_temp = data_temp[['state_code', 'GRID_AC', 'geometry']]
+            data_temp = data_temp.rename(columns={'GRID_AC':'activity_data'})
+        if data_year == 2022:
+            data_temp = data_temp[['state_code', 'GRID_ACTIV', 'geometry']]
+            data_temp = data_temp.rename(columns={'GRID_ACTIV':'activity_data'})
+
+    # convert activity data to relative emissions (idata / sum(state data))
+    data_temp['rel_emi'] = data_temp.groupby(["state_code"])['activity_data'].transform(lambda x: x / x.sum() if x.sum() > 0 else 0)
+    monthly_data_temp = data_temp.copy()
+    monthly_data_temp['rel_emi'] = monthly_data_temp['rel_emi'] * 1/12
+    monthly_data_temp = monthly_data_temp.drop(columns='activity_data')
+
+    # convert proxy data to monthly (assume 1/12 of annual proxy is assigned to each month)
+    nei_proxy_data = pd.DataFrame()
+    for imonth in range(1, 13):
+        imonth_str = f"{imonth:02}"  # convert to 2-digit months
+        data_temp_imonth = monthly_data_temp.copy()
+        data_temp_imonth = data_temp_imonth.assign(year_month=str(ghgi_year)+'-'+imonth_str)
+        nei_proxy_data = pd.concat([nei_proxy_data, data_temp_imonth])
+    nei_proxy_data = nei_proxy_data.assign(year=ghgi_year)
+    nei_proxy_data = (nei_proxy_data[['year', 'year_month', 'state_code', 'rel_emi', 'geometry']]
+                      .reset_index(drop=True)
+                      )
+    return nei_proxy_data
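A sketch of how a downstream proxy task might call the NEI helpers above (illustrative only: the import path simply mirrors this file's location and is an assumption, as are the concrete years and the presence of the NEI files on disk):

    from gch4i.proxy_processing.ng_oil_production_utils import (
        nei_data_years, ng_well_count_file_names,
        get_nei_file_name, get_raw_NEI_data,
    )

    # GHGI year 2016 is mapped to the 2017 NEI dataset by nei_data_years
    data_year = nei_data_years.loc[nei_data_years["year"] == 2016, "nei_data"].values[0]
    file_name = get_nei_file_name(data_year, ng_well_count_file_names)  # 'USA_698_NOFILL.txt'
    nei_well_counts = get_raw_NEI_data(ghgi_year=2016, data_year=data_year, file_name=file_name)
    # -> columns: year, year_month, state_code, rel_emi, geometry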
enverus_well_counts_path: Path = sector_data_dir_path / "enverus/production/temp_data_v2/Enverus DrillingInfo Processing - Well Counts_2021-03-17.xlsx",
+    ):
+    """
+    Data come from Enverus, from both Drilling Info (DI) and Prism. Two datasets
+    are used because Prism does not include all states; the remaining states, or
+    those with better DI coverage, are taken from DI.
+
+    DI: KS, MD, MI, MO, OK, TN
+
+    Prism: AK, AL, AR, AZ, CA, CAO (California Offshore), CO, FL, KY, LA, MS, MT, ND,
+    NE, NGOM (federal offshore waters in the Gulf of Mexico), NM, NV, NY, OH, OR, PA,
+    SD, TX, UT, VA, WV, WY
+
+    States with no Enverus Data: CT, DE, DC, GA, HI, ID, IL*, IN*, IA, ME, MA, MN, NH,
+    NJ, NC, RI, SC, VT, WA, WI, US territories. These states are assumed to have no oil
+    and gas production, with an exception for IL and IN.
+
+    *IL and IN do not report to Enverus but do have oil and gas production. Production
+    data for these two states is taken from the Energy Information Administration (EIA).
+
+    """
+
+    # Load in State ANSI data
+
+    state_gdf = (
+        gpd.read_file(state_path)
+        .loc[:, ["NAME", "STATEFP", "STUSPS", "geometry"]]
+        .rename(columns=str.lower)
+        .rename(columns={"stusps": "state_code", "name": "state_name"})
+        .astype({"statefp": int})
+        # get only lower 48 + DC
+        .query("(statefp < 60) & (statefp != 2) & (statefp != 15)")
+        .reset_index(drop=True)
+        .to_crs(4326)
+    )
+
+    # Well and Production Data (from Enverus)
+    # Read in and combine each year of Prism & DI monthly data (from Enverus)
+
+    # Read in and format the Prism and DI data:
+    # 1. Read data
+    # 2. Drop unused columns, rename columns to match between DI and Prism
+    # 3. Combine DI and Prism into one data array
+    # 4. Calculate annual cumulative production totals
+    # 5.
Save the data as a year-specific variable + + # Based on ERGs logic, active wells are determined based on their production levels and not producing status + Enverus_data_dict = {} + DI_data_dict = {} + Prism_data_dict = {} + for iyear in years: + #DI data + DI_file_name = f"didsk_monthly_{iyear}.csv" + DI_file_path = os.path.join(enverus_production_path, DI_file_name) + DI_data = (pd.read_csv( + DI_file_path, + usecols=['WELL_COUNT_ID','STATE','COUNTY','BASIN','AAPG_CODE_ERG', + 'LATITUDE','LONGITUDE','STATUS','COMPDATE', + 'SPUDDATE','FIRSTPRODDATE','HF','OFFSHORE','GOR', + 'GOR_QUAL','PROD_FLAG','PRODYEAR', + 'LIQ_01','GAS_01','WTR_01','LIQ_02','GAS_02','WTR_02', + 'LIQ_03','GAS_03','WTR_03','LIQ_04','GAS_04','WTR_04', + 'LIQ_05','GAS_05','WTR_05','LIQ_06','GAS_06','WTR_06', + 'LIQ_07','GAS_07','WTR_07','LIQ_08','GAS_08','WTR_08', + 'LIQ_09','GAS_09','WTR_09','LIQ_10','GAS_10','WTR_10', + 'LIQ_11','GAS_11','WTR_11','LIQ_12','GAS_12','WTR_12',], + dtype={7:'str'}) + .rename(columns={'WELL_COUNT_ID':'WELL_COUNT','STATE':'STATE_CODE', + 'STATUS':'PRODUCING_STATUS', + 'LIQ_01':'OILPROD_01','GAS_01':'GASPROD_01','WTR_01':'WATERPROD_01', + 'LIQ_02':'OILPROD_02','GAS_02':'GASPROD_02','WTR_02':'WATERPROD_02', + 'LIQ_03':'OILPROD_03','GAS_03':'GASPROD_03','WTR_03':'WATERPROD_03', + 'LIQ_04':'OILPROD_04','GAS_04':'GASPROD_04','WTR_04':'WATERPROD_04', + 'LIQ_05':'OILPROD_05','GAS_05':'GASPROD_05','WTR_05':'WATERPROD_05', + 'LIQ_06':'OILPROD_06','GAS_06':'GASPROD_06','WTR_06':'WATERPROD_06', + 'LIQ_07':'OILPROD_07','GAS_07':'GASPROD_07','WTR_07':'WATERPROD_07', + 'LIQ_08':'OILPROD_08','GAS_08':'GASPROD_08','WTR_08':'WATERPROD_08', + 'LIQ_09':'OILPROD_09','GAS_09':'GASPROD_09','WTR_09':'WATERPROD_09', + 'LIQ_10':'OILPROD_10','GAS_10':'GASPROD_10','WTR_10':'WATERPROD_10', + 'LIQ_11':'OILPROD_11','GAS_11':'GASPROD_11','WTR_11':'WATERPROD_11', + 'LIQ_12':'OILPROD_12','GAS_12':'GASPROD_12','WTR_12':'WATERPROD_12',}) + .assign(WELL_COUNT=1) # TODO: Check to see if this should actually be set to 1 + ) + # Format completion date (YYYY-MM) + for iwell in range(0,len(DI_data)): + comp_date = str(DI_data.loc[iwell, 'COMPDATE']) + if comp_date == 'NaN': + comp_year_month = 'NaN' + elif comp_date == 'nan': + comp_year_month = 'NaN' + else: # date format M/DD/YYYY + comp_month = f"{int(comp_date.split('/')[0]):02}" + comp_year = f"{int(comp_date.split('/')[2])}" + comp_year_month = str(comp_year)+'-'+str(comp_month) + DI_data.loc[iwell, 'comp_year_month'] = comp_year_month + # Format spud date (YYYY) + for iwell in range(0,len(DI_data)): + spud_date = str(DI_data.loc[iwell, 'SPUDDATE']) + if spud_date == 'NaN': + spud_year = 'NaN' + elif spud_date == 'nan': + spud_year = 'NaN' + else: # date format M/DD/YYYY + spud_year = f"{int(spud_date.split('/')[2])}" + spud_year = str(spud_year) + DI_data.loc[iwell, 'spud_year'] = spud_year + # Format first production date (YYYY) + for iwell in range(0,len(DI_data)): + first_prod_date = str(DI_data.loc[iwell, 'FIRSTPRODDATE']) + if first_prod_date == 'NaN': + first_prod_year = 'NaN' + elif first_prod_date == 'nan': + first_prod_year = 'NaN' + else: # date format M/DD/YYYY + first_prod_year = f"{int(first_prod_date.split('/')[2])}" + first_prod_year = str(first_prod_year) + DI_data.loc[iwell, 'first_prod_year'] = first_prod_year + DI_data_dict[f'{iyear}'] = DI_data + + # Prism Data + Prism_file_name = f"prism_monthly_{iyear}.csv" + Prism_file_path = os.path.join(enverus_production_path, Prism_file_name) + Prism_data = (pd.read_csv( + Prism_file_path, + 
usecols=['STATE','COUNTY','ENVBASIN','AAPG_CODE_ERG', + 'LATITUDE','LONGITUDE','ENVWELLSTATUS','COMPLETIONDATE', + 'SPUDDATE','FIRSTPRODDATE','HF','OFFSHORE','GOR', + 'GOR_QUAL','PROD_FLAG','PRODYEAR', + 'LIQUIDSPROD_BBL_01','GASPROD_MCF_01','WATERPROD_BBL_01', + 'LIQUIDSPROD_BBL_02','GASPROD_MCF_02','WATERPROD_BBL_02', + 'LIQUIDSPROD_BBL_03','GASPROD_MCF_03','WATERPROD_BBL_03', + 'LIQUIDSPROD_BBL_04','GASPROD_MCF_04','WATERPROD_BBL_04', + 'LIQUIDSPROD_BBL_05','GASPROD_MCF_05','WATERPROD_BBL_05', + 'LIQUIDSPROD_BBL_06','GASPROD_MCF_06','WATERPROD_BBL_06', + 'LIQUIDSPROD_BBL_07','GASPROD_MCF_07','WATERPROD_BBL_07', + 'LIQUIDSPROD_BBL_08','GASPROD_MCF_08','WATERPROD_BBL_08', + 'LIQUIDSPROD_BBL_09','GASPROD_MCF_09','WATERPROD_BBL_09', + 'LIQUIDSPROD_BBL_10','GASPROD_MCF_10','WATERPROD_BBL_10', + 'LIQUIDSPROD_BBL_11','GASPROD_MCF_11','WATERPROD_BBL_11', + 'LIQUIDSPROD_BBL_12','GASPROD_MCF_12','WATERPROD_BBL_12',], + dtype={7:'str'}) + .rename(columns={'STATE':'STATE_CODE', 'ENVBASIN':'BASIN', + 'ENVWELLSTATUS':'PRODUCING_STATUS', + 'COMPLETIONDATE':'COMPDATE', + 'LIQUIDSPROD_BBL_01':'OILPROD_01','GASPROD_MCF_01':'GASPROD_01','WATERPROD_BBL_01':'WATERPROD_01', + 'LIQUIDSPROD_BBL_02':'OILPROD_02','GASPROD_MCF_02':'GASPROD_02','WATERPROD_BBL_02':'WATERPROD_02', + 'LIQUIDSPROD_BBL_03':'OILPROD_03','GASPROD_MCF_03':'GASPROD_03','WATERPROD_BBL_03':'WATERPROD_03', + 'LIQUIDSPROD_BBL_04':'OILPROD_04','GASPROD_MCF_04':'GASPROD_04','WATERPROD_BBL_04':'WATERPROD_04', + 'LIQUIDSPROD_BBL_05':'OILPROD_05','GASPROD_MCF_05':'GASPROD_05','WATERPROD_BBL_05':'WATERPROD_05', + 'LIQUIDSPROD_BBL_06':'OILPROD_06','GASPROD_MCF_06':'GASPROD_06','WATERPROD_BBL_06':'WATERPROD_06', + 'LIQUIDSPROD_BBL_07':'OILPROD_07','GASPROD_MCF_07':'GASPROD_07','WATERPROD_BBL_07':'WATERPROD_07', + 'LIQUIDSPROD_BBL_08':'OILPROD_08','GASPROD_MCF_08':'GASPROD_08','WATERPROD_BBL_08':'WATERPROD_08', + 'LIQUIDSPROD_BBL_09':'OILPROD_09','GASPROD_MCF_09':'GASPROD_09','WATERPROD_BBL_09':'WATERPROD_09', + 'LIQUIDSPROD_BBL_10':'OILPROD_10','GASPROD_MCF_10':'GASPROD_10','WATERPROD_BBL_10':'WATERPROD_10', + 'LIQUIDSPROD_BBL_11':'OILPROD_11','GASPROD_MCF_11':'GASPROD_11','WATERPROD_BBL_11':'WATERPROD_11', + 'LIQUIDSPROD_BBL_12':'OILPROD_12','GASPROD_MCF_12':'GASPROD_12','WATERPROD_BBL_12':'WATERPROD_12',}) + .assign(WELL_COUNT=1) + ) + # Format completion date (YYYY-MM) + for iwell in range(0,len(Prism_data)): + comp_date = str(Prism_data.loc[iwell, 'COMPDATE']) + if comp_date == 'NaN': + comp_year_month = 'NaN' + elif comp_date == 'nan': + comp_year_month = 'NaN' + else: # date format YYYY-MM-DD + comp_month = f"{int(comp_date.split('-')[1]):02}" + comp_year = f"{int(comp_date.split('-')[0])}" + comp_year_month = str(comp_year)+'-'+str(comp_month) + Prism_data.loc[iwell, 'comp_year_month'] = comp_year_month + # Format spud date (YYYY) + for iwell in range(0,len(Prism_data)): + spud_date = str(Prism_data.loc[iwell, 'SPUDDATE']) + if spud_date == 'NaN': + spud_year = 'NaN' + elif spud_date == 'nan': + spud_year = 'NaN' + else: # date format YYYY-MM-DD + spud_year = f"{int(spud_date.split('-')[0])}" + spud_year = str(spud_year) + Prism_data.loc[iwell, 'spud_year'] = spud_year + # Format first production date (YYYY) + for iwell in range(0,len(Prism_data)): + first_prod_date = str(Prism_data.loc[iwell, 'FIRSTPRODDATE']) + if first_prod_date == 'NaN': + first_prod_year = 'NaN' + elif first_prod_date == 'nan': + first_prod_year = 'NaN' + else: # date format YYYY-MM-DD + first_prod_year = f"{int(first_prod_date.split('-')[0])}" + first_prod_year = 
str(first_prod_year)
+                Prism_data.loc[iwell, 'first_prod_year'] = first_prod_year
+        Prism_data_dict[f'{iyear}'] = Prism_data
+
+        # Combine into one array with common column names, replace NaNs with zeros,
+        # and sum annual production
+        Enverus_data = pd.concat([DI_data, Prism_data], ignore_index=True)
+        Enverus_data.loc[:, Enverus_data.columns.str.contains('GASPROD_')] = Enverus_data.loc[:, Enverus_data.columns.str.contains('GASPROD_')].fillna(0)
+        Enverus_data.loc[:, Enverus_data.columns.str.contains('OILPROD_')] = Enverus_data.loc[:, Enverus_data.columns.str.contains('OILPROD_')].fillna(0)
+        Enverus_data.loc[:, Enverus_data.columns.str.contains('WATERPROD_')] = Enverus_data.loc[:, Enverus_data.columns.str.contains('WATERPROD_')].fillna(0)
+
+        # Calculate cumulative annual production totals for gas, oil, and water
+        Enverus_data['CUM_GAS'] = Enverus_data.loc[:, Enverus_data.columns.str.contains('GASPROD_')].sum(axis=1)
+        Enverus_data['CUM_OIL'] = Enverus_data.loc[:, Enverus_data.columns.str.contains('OILPROD_')].sum(axis=1)
+        Enverus_data['CUM_WATER'] = Enverus_data.loc[:, Enverus_data.columns.str.contains('WATERPROD_')].sum(axis=1)
+
+        # Save out the data for that year
+        Enverus_data_dict[f'{iyear}'] = Enverus_data
+
+        del Prism_data
+        del DI_data  # save memory space
+
+    # Correct Enverus Data for Select States
+
+    # 1) Read in the coverage table from the ERG state well counts file
+    # (specifies the first year with bad data and which years need to be corrected;
+    # all years including and after the first bad year of data need to be corrected)
+
+    ERG_StateWellCounts_LastGoodDataYear = (pd.read_excel(
+        enverus_well_counts_path,
+        sheet_name="2021 - Coverage",
+        usecols={"State", "Last Good Year"},
+        skiprows=2,
+        nrows=40)
+    )
+
+    # 2) Loop through each state and year in Enverus to determine whether the data
+    # for that particular year needs to be corrected. At the moment, the only
+    # correction ERG makes to the data is to use the prior year of data if there is
+    # no new Enverus data reported for that state. If a particular state is not
+    # included for any years in the Enverus dataset, then a row of zeros is added to
+    # the Enverus table for that year.
+
+    for istate in np.arange(0, len(state_gdf)):
+        correctdata = 0
+        istate_code = state_gdf['state_code'][istate]
+        lastgoodyear = ERG_StateWellCounts_LastGoodDataYear['Last Good Year'][ERG_StateWellCounts_LastGoodDataYear['State'] == istate_code].values
+        if lastgoodyear == max_year:
+            lastgoodyear = max_year + 5  # if state isn't included in correction list, don't correct any data
+
+        for iyear in years:
+            enverus_data_temp = Enverus_data_dict[f'{iyear}'].copy()
+            state_list = np.unique(enverus_data_temp['STATE_CODE'])
+            if istate_code in state_list:
+                inlist = 1
+            else:
+                inlist = 0
+            if inlist == 1 or correctdata == 1:
+                # The state is included in the Enverus data, or had data for at
+                # least one good year. In the first year, correctdata will be zero,
+                # but inlist will also be zero if there is no Enverus data.
+                # Check whether corrections are necessary for the given year/state:
+                if iyear == lastgoodyear:
+                    print(istate_code, iyear, 'last good year')
+                    # This is the last year of good data.
Do not correct the data but save + # but so that this data can be used for all following years for that state + temp_data = enverus_data_temp[enverus_data_temp['STATE_CODE'] == istate_code] + correctdata=1 + elif iyear > lastgoodyear: + print(istate_code,iyear) + # correct data for all years equal to and after the first bad year (remove old data first if necessary) + if inlist == 1: + enverus_data_temp = enverus_data_temp[enverus_data_temp['STATE_CODE'] != istate_code] + enverus_data_temp = pd.concat([enverus_data_temp,temp_data],ignore_index=True) + print(istate_code +' data for ' +str(iyear) +' were corrected with '+str(lastgoodyear)+' data') + else: + no_corrections =1 + + if inlist==0 and correctdata==0: + # if there is no Enverus data for a given state, and there was no good data, add a row with default values + print(istate_code +' has no Enverus data in the year ' +str(iyear)) + + # save that year of Enverus data + enverus_data_temp.reset_index(drop=True,inplace=True) + Enverus_data_dict[f'{iyear}'] = enverus_data_temp.copy() + tempoutput_filename = f'formatted_raw_enverus_tempoutput_{iyear}.csv' + tempoutput_filepath = os.path.join(intermediate_outputs_path, tempoutput_filename) + enverus_data_temp.to_csv(tempoutput_filepath, index=False) + + return None diff --git a/gch4i/proxy_processing/task_ng_all_well_count_proxy.py b/gch4i/proxy_processing/task_ng_all_well_count_proxy.py new file mode 100644 index 0000000..c44f9fd --- /dev/null +++ b/gch4i/proxy_processing/task_ng_all_well_count_proxy.py @@ -0,0 +1,175 @@ +# %% +from pathlib import Path +import os +from typing import Annotated +from zipfile import ZipFile +import calendar +import datetime + +from pyarrow import parquet +import pandas as pd +import osgeo +import geopandas as gpd +import numpy as np +import seaborn as sns +import shapefile as shp +from pytask import Product, task, mark + +from gch4i.config import ( + V3_DATA_PATH, + proxy_data_dir_path, + global_data_dir_path, + sector_data_dir_path, + max_year, + min_year, + years, +) + +from gch4i.utils import us_state_to_abbrev +from gch4i.proxy_processing.ng_oil_production_utils import ( + calc_enverus_rel_emi, + enverus_df_to_gdf, + nei_data_years, + get_nei_file_name, + ng_well_count_file_names, + get_raw_NEI_data, +) + +# %% +@mark.persist +@task(id="ng_all_well_count_proxy") +def task_get_ng_all_well_count_proxy_data( + state_path: Path = global_data_dir_path / "tl_2020_us_state.zip", + enverus_production_path: Path = sector_data_dir_path / "enverus/production", + intermediate_outputs_path: Path = enverus_production_path / "intermediate_outputs", + nei_path: Path = sector_data_dir_path / "nei_og", + all_well_count_output_path: Annotated[Path, Product] = proxy_data_dir_path / "ng_all_well_count_proxy.parquet", + ): + """ + Data come from Enverus, both Drilling Info and Prism + The reason 2 datasets are used is because Prism does not include all states + So remaining states, or those with more DI coverage are taken from DI + + DI: KS, MD, MI, MO, OK, TN + + Prism: AK, AL, AR, AZ, CA, CAO (California Offshore), CO, FL, KY, LA, MS, MT, ND, + NE, NGOM (federal offshore waters in the Gulf of Mexico), NM, NV, NY, OH, OR, PA, + SD, TX, UT, VA, WV, WY + + States with no Enverus Data: CT, DE, DC, GA, HI, ID, IL*, IN*, IA, ME, MA, MN, NH, + NJ, NC, RI, SC, VT, WA, WI, US territories. These states are assumed to have no oil + and gas production with an exception for IL and IN. + + *IL and IN do not report to Enverus, but do have oil and gas production. 
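(Editorial sketch: the DI/Prism/EIA routing described in this docstring can be summarized as a small lookup. The state sets below are copied from the docstring; the helper function and its name are hypothetical, not part of this patch.)

    # Hypothetical helper illustrating which upstream dataset covers each state.
    DI_STATES = {"KS", "MD", "MI", "MO", "OK", "TN"}
    PRISM_STATES = {
        "AK", "AL", "AR", "AZ", "CA", "CAO", "CO", "FL", "KY", "LA", "MS", "MT",
        "ND", "NE", "NGOM", "NM", "NV", "NY", "OH", "OR", "PA", "SD", "TX", "UT",
        "VA", "WV", "WY",
    }
    EIA_STATES = {"IL", "IN"}  # no Enverus coverage; production taken from EIA

    def enverus_source_for(state_code: str) -> str:
        """Return which upstream dataset supplies wells for a state code (sketch)."""
        if state_code in DI_STATES:
            return "DI"
        if state_code in PRISM_STATES:
            return "Prism"
        if state_code in EIA_STATES:
            return "EIA"
        return "none"  # assumed to have no oil and gas production

    assert enverus_source_for("OK") == "DI"
    assert enverus_source_for("TX") == "Prism"
    assert enverus_source_for("IL") == "EIA"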
Production
+    data for these two states is taken from the Energy Information Administration (EIA).
+
+    """
+
+    # Load in State ANSI data
+    state_gdf = (
+        gpd.read_file(state_path)
+        .loc[:, ["NAME", "STATEFP", "STUSPS", "geometry"]]
+        .rename(columns=str.lower)
+        .rename(columns={"stusps": "state_code", "name": "state_name"})
+        .astype({"statefp": int})
+        # get only lower 48 + DC
+        .query("(statefp < 60) & (statefp != 2) & (statefp != 15)")
+        .reset_index(drop=True)
+        .to_crs(4326)
+    )
+
+    # Make annual gridded arrays (maps) of well data (a well is counted every month
+    # if there is any production that year). Includes NA gas wells and production
+    # onshore in the CONUS region. Source emissions are related to the presence of
+    # a well and its production status (no emission if no production).
+    # Details: ERG does not include a well in the national count if there is no
+    # (cumulative) oil or gas production from that well. Wells are not considered
+    # active for a given year if there is no production data that year. This may
+    # cause wells that are completed but not yet producing to be dropped from the
+    # national count. ERG has developed their own logic to determine whether a well
+    # is an HF well or not, and that result is included in the HF variable in this
+    # dataset; this method does not rely on the Enverus well 'Producing Status'.
+    # Well type (e.g., non-associated gas well) is determined based on the annual
+    # production GOR at that well (CUM_GAS / CUM_OIL), but the presence of a well is
+    # only included in maps in months where monthly gas prod > 0.
+
+    # Proxy Data Dataframes:
+    all_well_count_df = pd.DataFrame()  # Active gas well (conventional + HF) counts in a given month
+
+    ## Enverus DI and Prism Data:
+    # Read in and query formatted and corrected Enverus data to create dictionaries
+    # of proxy data (Enverus data is from task_enverus_di_prism_data_processing.py)
+    for iyear in years:
+        enverus_file_name_iyear = f"formatted_raw_enverus_tempoutput_{iyear}.csv"
+        enverus_file_path_iyear = os.path.join(intermediate_outputs_path, enverus_file_name_iyear)
+        ng_data_temp = (pd.read_csv(enverus_file_path_iyear, dtype={3: 'str', 'spud_year': str, 'first_prod_year': str})
+                        .query("STATE_CODE.isin(@state_gdf['state_code'])")
+                        .query("OFFSHORE == 'N'")
+                        .query("CUM_GAS > 0")
+                        .assign(gas_to_oil_ratio=lambda df: df['CUM_GAS'] / df['CUM_OIL'])
+                        .assign(year=str(iyear))
+                        .replace(np.inf, 0)
+                        .astype({"spud_year": str, "first_prod_year": str})
+                        .query("gas_to_oil_ratio > 100 | GOR_QUAL == 'Gas only'")
+                        )
+
+        # Include wells in the map only for months where there is gas production
+        # (emissions occur only when production is occurring)
+        for imonth in range(1, 13):
+            imonth_str = f"{imonth:02}"  # convert to 2-digit months
+            year_month_str = str(iyear) + '-' + imonth_str
+            gas_prod_str = 'GASPROD_' + imonth_str
+            # Onshore data for imonth
+            ng_data_imonth_temp = (ng_data_temp
+                                   .query(f"{gas_prod_str} > 0")
+                                   .assign(year_month=year_month_str)
+                                   )
+            ng_data_imonth_temp = (ng_data_imonth_temp[[
+                'year', 'year_month', 'STATE_CODE', 'AAPG_CODE_ERG', 'LATITUDE', 'LONGITUDE',
+                'HF', 'WELL_COUNT', gas_prod_str,
+                'comp_year_month', 'spud_year', 'first_prod_year']]
+                )
+            # All Gas Well Count
+            all_well_count_imonth = (ng_data_imonth_temp[['year', 'year_month', 'STATE_CODE', 'LATITUDE', 'LONGITUDE', 'WELL_COUNT']]
+                                     .rename(columns=lambda x: str(x).lower())
+                                     .rename(columns={"well_count": "proxy_data"})
+                                     .reset_index(drop=True)
+                                     )
+            all_well_count_df = pd.concat([all_well_count_df, all_well_count_imonth])
+
+        # Delete unused temp data
+        del
ng_data_temp + del ng_data_imonth_temp + del all_well_count_imonth + + # Calculate relative emissions and convert to a geodataframe + all_well_count_df = calc_enverus_rel_emi(all_well_count_df) + all_well_count_df = enverus_df_to_gdf(all_well_count_df) + + # NEI Data: + nei_df = pd.DataFrame() + + for iyear in years: + nei_data_year = nei_data_years[nei_data_years['year'] == iyear]['nei_data'].values[0] + # Well Count + ifile_name = get_nei_file_name(nei_data_year, ng_well_count_file_names) + nei_iyear = get_raw_NEI_data(iyear, nei_data_year, ifile_name) + nei_df = pd.concat([nei_df, nei_iyear]) + + # Convert NEI Data to GDF and polygon to centroid point + nei_df = gpd.GeoDataFrame(nei_df, crs=4326) + nei_df = nei_df.to_crs(3857) # projected CRS for centroid calculation + nei_df.loc[:, 'geometry'] = nei_df.loc[:, 'geometry'].centroid + nei_df = nei_df.to_crs(4326) + + # Add NEI Data to Enverus Data + all_well_count_df = pd.concat([all_well_count_df, nei_df]).reset_index(drop=True) + + # Delete unused temp data + del nei_iyear + del nei_df + + # Check that relative emissions sum to 1.0 each state/year combination + sums = all_well_count_df.groupby(["state_code", "year"])["rel_emi"].sum() # get sums to check normalization + assert np.isclose(sums, 1.0, atol=1e-8).all(), f"Relative emissions do not sum to 1 for each year and state; {sums}" # assert that the sums are close to 1 + + # Output Proxy Parquet Files + all_well_count_df = all_well_count_df.astype({'year':str}) + all_well_count_df.to_parquet(all_well_count_output_path) + + return None diff --git a/gch4i/proxy_processing/task_ng_all_well_prod_proxy.py b/gch4i/proxy_processing/task_ng_all_well_prod_proxy.py new file mode 100644 index 0000000..58273a9 --- /dev/null +++ b/gch4i/proxy_processing/task_ng_all_well_prod_proxy.py @@ -0,0 +1,176 @@ +# %% +from pathlib import Path +import os +from typing import Annotated +from zipfile import ZipFile +import calendar +import datetime + +from pyarrow import parquet +import pandas as pd +import osgeo +import geopandas as gpd +import numpy as np +import seaborn as sns +import shapefile as shp +from pytask import Product, task, mark + +from gch4i.config import ( + V3_DATA_PATH, + proxy_data_dir_path, + global_data_dir_path, + sector_data_dir_path, + max_year, + min_year, + years, +) + +from gch4i.utils import us_state_to_abbrev +from gch4i.proxy_processing.ng_oil_production_utils import ( + calc_enverus_rel_emi, + enverus_df_to_gdf, + nei_data_years, + get_nei_file_name, + ng_gas_prod_file_names, + get_raw_NEI_data, +) + +# %% +@mark.persist +@task(id="ng_all_well_prod_proxy") +def task_get_ng_all_well_prod_proxy_data( + state_path: Path = global_data_dir_path / "tl_2020_us_state.zip", + enverus_production_path: Path = sector_data_dir_path / "enverus/production", + intermediate_outputs_path: Path = enverus_production_path / "intermediate_outputs", + nei_path: Path = sector_data_dir_path / "nei_og", + all_well_prod_output_path: Annotated[Path, Product] = proxy_data_dir_path / "ng_all_well_prod_proxy.parquet", + ): + """ + Data come from Enverus, both Drilling Info and Prism + The reason 2 datasets are used is because Prism does not include all states + So remaining states, or those with more DI coverage are taken from DI + + DI: KS, MD, MI, MO, OK, TN + + Prism: AK, AL, AR, AZ, CA, CAO (California Offshore), CO, FL, KY, LA, MS, MT, ND, + NE, NGOM (federal offshore waters in the Gulf of Mexico), NM, NV, NY, OH, OR, PA, + SD, TX, UT, VA, WV, WY + + States with no Enverus Data: CT, DE, DC, GA, HI, ID, 
IL*, IN*, IA, ME, MA, MN, NH, + NJ, NC, RI, SC, VT, WA, WI, US territories. These states are assumed to have no oil + and gas production with an exception for IL and IN. + + *IL and IN do not report to Enverus, but do have oil and gas production. Production + data is taken from the Energy Information Administration (EIA). + + """ + + # Load in State ANSI data + state_gdf = ( + gpd.read_file(state_path) + .loc[:, ["NAME", "STATEFP", "STUSPS", "geometry"]] + .rename(columns=str.lower) + .rename(columns={"stusps": "state_code", "name": "state_name"}) + .astype({"statefp": int}) + # get only lower 48 + DC + .query("(statefp < 60) & (statefp != 2) & (statefp != 15)") + .reset_index(drop=True) + .to_crs(4326) + ) + + # Make Annual gridded arrays (maps) of well data (a well will be counted every month if there is any production that year) + # Includes NA Gas Wells and Production onshore in the CONUS region + # source emissions are related to the presence of a well and its production status (no emission if no production) + # Details: ERG does not include a well in the national count if there is no (cummulative) oil or gas production from that well. + # Wells are not considered active for a given year if there is no production data that year + # This may cause wells that are coadmpleted but not yet producing to be dropped from the national count. + # ERG has developed their own logic to determine if a well is an HF well or not and that result is included in the + # HF variable in this dataset. This method does not rely on the Enverus well 'Producing Status' + # Well Type (e.g., non-associated gas well) is determined based on annual production GOR at that well (CUM OIL/ CUM GAS), + # but the presence of a well will only be included in maps in months where monthly gas prod > 0 + + # Proxy Data Dataframes: + all_well_prod_df = pd.DataFrame() # Active gas well (conventional + HF) gas production in a given month + + ## Enverus DI and Prism Data: + # Read in and query formatted and corrrected Enverus data to create dictionaries of + # proxy data (Enverus data is from task_enverus_di_prism_data_processing.py) + for iyear in years: + enverus_file_name_iyear = f"formatted_raw_enverus_tempoutput_{iyear}.csv" + enverus_file_path_iyear = os.path.join(intermediate_outputs_path, enverus_file_name_iyear) + ng_data_temp = (pd.read_csv(enverus_file_path_iyear, dtype={3:'str', 'spud_year': str, 'first_prod_year': str}) + .query("STATE_CODE.isin(@state_gdf['state_code'])") + .query("OFFSHORE == 'N'") + .query("CUM_GAS > 0") + .assign(gas_to_oil_ratio=lambda df: df['CUM_GAS']/df['CUM_OIL']) + .assign(year=str(iyear)) + .replace(np.inf, 0) + .astype({"spud_year": str, "first_prod_year": str}) + .query("gas_to_oil_ratio > 100 | GOR_QUAL == 'Gas only'") + ) + + # Include wells in map only for months where there is gas production (emissions ~ when production is occuring) + for imonth in range(1,13): + imonth_str = f"{imonth:02}" # convert to 2-digit months + year_month_str = str(iyear)+'-'+imonth_str + gas_prod_str = 'GASPROD_'+imonth_str + # Onshore data for imonth + ng_data_imonth_temp = (ng_data_temp + .query(f"{gas_prod_str} > 0") + .assign(year_month=str(iyear)+'-'+imonth_str) + ) + ng_data_imonth_temp = (ng_data_imonth_temp[[ + 'year', 'year_month','STATE_CODE','AAPG_CODE_ERG','LATITUDE','LONGITUDE', + 'HF','WELL_COUNT',gas_prod_str, + 'comp_year_month','spud_year','first_prod_year']] + ) + # All Gas Well Gas Production + all_well_prod_imonth = 
(ng_data_imonth_temp[['year','year_month','STATE_CODE','LATITUDE','LONGITUDE',gas_prod_str]] + .assign(proxy_data=lambda df: df[gas_prod_str]) + .drop(columns=[gas_prod_str]) + .rename(columns=lambda x: str(x).lower()) + .reset_index(drop=True) + ) + all_well_prod_df = pd.concat([all_well_prod_df,all_well_prod_imonth]) + + # Delete unused temp data + del ng_data_temp + del ng_data_imonth_temp + del all_well_prod_imonth + + # Calculate relative emissions and convert to a geodataframe + all_well_prod_df = calc_enverus_rel_emi(all_well_prod_df) + all_well_prod_df = enverus_df_to_gdf(all_well_prod_df) + + # NEI Data: + nei_df = pd.DataFrame() + + for iyear in years: + nei_data_year = nei_data_years[nei_data_years['year'] == iyear]['nei_data'].values[0] + # Gas Production + ifile_name = get_nei_file_name(nei_data_year, ng_gas_prod_file_names) + nei_iyear = get_raw_NEI_data(iyear, nei_data_year, ifile_name) + nei_df = pd.concat([nei_df, nei_iyear]) + + # Convert NEI Data to GDF and polygon to centroid point + nei_df = gpd.GeoDataFrame(nei_df, crs=4326) + nei_df = nei_df.to_crs(3857) # projected CRS for centroid calculation + nei_df.loc[:, 'geometry'] = nei_df.loc[:, 'geometry'].centroid + nei_df = nei_df.to_crs(4326) + + # Add NEI Data to Enverus Data + all_well_prod_df = pd.concat([all_well_prod_df, nei_df]).reset_index(drop=True) + + # Delete unused temp data + del nei_iyear + del nei_df + + # Check that relative emissions sum to 1.0 each state/year combination + sums = all_well_prod_df.groupby(["state_code", "year"])["rel_emi"].sum() # get sums to check normalization + assert np.isclose(sums, 1.0, atol=1e-8).all(), f"Relative emissions do not sum to 1 for each year and state; {sums}" # assert that the sums are close to 1 + + # Output Proxy Parquet Files + all_well_prod_df = all_well_prod_df.astype({'year':str}) + all_well_prod_df.to_parquet(all_well_prod_output_path) + + return None diff --git a/gch4i/proxy_processing/task_ng_basin_220_prod_proxy.py b/gch4i/proxy_processing/task_ng_basin_220_prod_proxy.py new file mode 100644 index 0000000..fc4020c --- /dev/null +++ b/gch4i/proxy_processing/task_ng_basin_220_prod_proxy.py @@ -0,0 +1,153 @@ +# %% +from pathlib import Path +import os +from typing import Annotated +from zipfile import ZipFile +import calendar +import datetime + +from pyarrow import parquet +import pandas as pd +import osgeo +import geopandas as gpd +import numpy as np +import seaborn as sns +import shapefile as shp +from pytask import Product, task, mark + +from gch4i.config import ( + V3_DATA_PATH, + proxy_data_dir_path, + global_data_dir_path, + sector_data_dir_path, + max_year, + min_year, + years, +) + +from gch4i.utils import us_state_to_abbrev +from gch4i.proxy_processing.ng_oil_production_utils import ( + calc_enverus_rel_emi, + enverus_df_to_gdf, +) + +# %% +@mark.persist +@task(id="ng_basin_220_prod_proxy") +def task_get_ng_basin_220_prod_proxy_data( + state_path: Path = global_data_dir_path / "tl_2020_us_state.zip", + enverus_production_path: Path = sector_data_dir_path / "enverus/production", + intermediate_outputs_path: Path = enverus_production_path / "intermediate_outputs", + basin_220_prod_output_path: Annotated[Path, Product] = proxy_data_dir_path / "ng_basin_220_prod_proxy.parquet", + ): + """ + Data come from Enverus, both Drilling Info and Prism + The reason 2 datasets are used is because Prism does not include all states + So remaining states, or those with more DI coverage are taken from DI + + DI: KS, MD, MI, MO, OK, TN + + Prism: AK, AL, AR, AZ, CA, CAO 
(California Offshore), CO, FL, KY, LA, MS, MT, ND, + NE, NGOM (federal offshore waters in the Gulf of Mexico), NM, NV, NY, OH, OR, PA, + SD, TX, UT, VA, WV, WY + + States with no Enverus Data: CT, DE, DC, GA, HI, ID, IL*, IN*, IA, ME, MA, MN, NH, + NJ, NC, RI, SC, VT, WA, WI, US territories. These states are assumed to have no oil + and gas production with an exception for IL and IN. + + *IL and IN do not report to Enverus, but do have oil and gas production. Production + data is taken from the Energy Information Administration (EIA). + + """ + + # Load in State ANSI data + state_gdf = ( + gpd.read_file(state_path) + .loc[:, ["NAME", "STATEFP", "STUSPS", "geometry"]] + .rename(columns=str.lower) + .rename(columns={"stusps": "state_code", "name": "state_name"}) + .astype({"statefp": int}) + # get only lower 48 + DC + .query("(statefp < 60) & (statefp != 2) & (statefp != 15)") + .reset_index(drop=True) + .to_crs(4326) + ) + + # Make Annual gridded arrays (maps) of well data (a well will be counted every month if there is any production that year) + # Includes NA Gas Wells and Production onshore in the CONUS region + # source emissions are related to the presence of a well and its production status (no emission if no production) + # Details: ERG does not include a well in the national count if there is no (cummulative) oil or gas production from that well. + # Wells are not considered active for a given year if there is no production data that year + # This may cause wells that are coadmpleted but not yet producing to be dropped from the national count. + # ERG has developed their own logic to determine if a well is an HF well or not and that result is included in the + # HF variable in this dataset. This method does not rely on the Enverus well 'Producing Status' + # Well Type (e.g., non-associated gas well) is determined based on annual production GOR at that well (CUM OIL/ CUM GAS), + # but the presence of a well will only be included in maps in months where monthly gas prod > 0 + + # Proxy Data Dataframes: + basin_220_prod_df = pd.DataFrame() # Gas well gas production in Basin 220 in a given month + + ## Enverus DI and Prism Data: + # Read in and query formatted and corrrected Enverus data to create dictionaries of + # proxy data (Enverus data is from task_enverus_di_prism_data_processing.py) + for iyear in years: + enverus_file_name_iyear = f"formatted_raw_enverus_tempoutput_{iyear}.csv" + enverus_file_path_iyear = os.path.join(intermediate_outputs_path, enverus_file_name_iyear) + ng_data_temp = (pd.read_csv(enverus_file_path_iyear, dtype={3:'str', 'spud_year': str, 'first_prod_year': str}) + .query("STATE_CODE.isin(@state_gdf['state_code'])") + .query("OFFSHORE == 'N'") + .query("CUM_GAS > 0") + .assign(gas_to_oil_ratio=lambda df: df['CUM_GAS']/df['CUM_OIL']) + .assign(year=str(iyear)) + .replace(np.inf, 0) + .astype({"spud_year": str, "first_prod_year": str}) + .query("gas_to_oil_ratio > 100 | GOR_QUAL == 'Gas only'") + ) + + # Include wells in map only for months where there is gas production (emissions ~ when production is occuring) + for imonth in range(1,13): + imonth_str = f"{imonth:02}" # convert to 2-digit months + year_month_str = str(iyear)+'-'+imonth_str + gas_prod_str = 'GASPROD_'+imonth_str + # Onshore data for imonth + ng_data_imonth_temp = (ng_data_temp + .query(f"{gas_prod_str} > 0") + .assign(year_month=str(iyear)+'-'+imonth_str) + ) + ng_data_imonth_temp = (ng_data_imonth_temp[[ + 'year', 'year_month','STATE_CODE','AAPG_CODE_ERG','LATITUDE','LONGITUDE', + 
'HF','WELL_COUNT',gas_prod_str, + 'comp_year_month','spud_year','first_prod_year']] + ) + # Basin 220 Gas Well Gas Production + basin_220_prod_imonth = (ng_data_imonth_temp[['year','year_month','STATE_CODE','LATITUDE','LONGITUDE','AAPG_CODE_ERG',gas_prod_str]] + .query("AAPG_CODE_ERG == '220'") + .assign(proxy_data=lambda df: df[gas_prod_str]) + .drop(columns=[gas_prod_str, 'AAPG_CODE_ERG']) + .rename(columns=lambda x: str(x).lower()) + .reset_index(drop=True) + ) + basin_220_prod_df = pd.concat([basin_220_prod_df,basin_220_prod_imonth]) + + # Delete unused temp data + del ng_data_temp + del ng_data_imonth_temp + del basin_220_prod_imonth + + # Calculate relative emissions and convert to a geodataframe + basin_220_prod_df = calc_enverus_rel_emi(basin_220_prod_df) + basin_220_prod_df = enverus_df_to_gdf(basin_220_prod_df) + + # NEI Data: + # No addition of NEI data because IL and IN are not in this basin. We are adding + # them to the "other" basin. + + # Check that relative emissions sum to 1.0 each state/year combination + sums = basin_220_prod_df.groupby(["state_code", "year"])["rel_emi"].sum() # get sums to check normalization + assert np.isclose(sums, 1.0, atol=1e-8).all(), f"Relative emissions do not sum to 1 for each year and state; {sums}" # assert that the sums are close to 1 + + # Output Proxy Parquet Files + basin_220_prod_df = basin_220_prod_df.astype({'year':str}) + basin_220_prod_df.to_parquet(basin_220_prod_output_path) + + return None diff --git a/gch4i/proxy_processing/task_ng_basin_395_prod_proxy.py b/gch4i/proxy_processing/task_ng_basin_395_prod_proxy.py new file mode 100644 index 0000000..67a9c39 --- /dev/null +++ b/gch4i/proxy_processing/task_ng_basin_395_prod_proxy.py @@ -0,0 +1,153 @@ +# %% +from pathlib import Path +import os +from typing import Annotated +from zipfile import ZipFile +import calendar +import datetime + +from pyarrow import parquet +import pandas as pd +import osgeo +import geopandas as gpd +import numpy as np +import seaborn as sns +import shapefile as shp +from pytask import Product, task, mark + +from gch4i.config import ( + V3_DATA_PATH, + proxy_data_dir_path, + global_data_dir_path, + sector_data_dir_path, + max_year, + min_year, + years, +) + +from gch4i.utils import us_state_to_abbrev +from gch4i.proxy_processing.ng_oil_production_utils import ( + calc_enverus_rel_emi, + enverus_df_to_gdf, +) + +# %% +@mark.persist +@task(id="ng_basin_395_prod_proxy") +def task_get_ng_basin_395_prod_proxy_data( + state_path: Path = global_data_dir_path / "tl_2020_us_state.zip", + enverus_production_path: Path = sector_data_dir_path / "enverus/production", + intermediate_outputs_path: Path = enverus_production_path / "intermediate_outputs", + basin_395_prod_output_path: Annotated[Path, Product] = proxy_data_dir_path / "ng_basin_395_prod_proxy.parquet", + ): + """ + Data come from Enverus, both Drilling Info and Prism + The reason 2 datasets are used is because Prism does not include all states + So remaining states, or those with more DI coverage are taken from DI + + DI: KS, MD, MI, MO, OK, TN + + Prism: AK, AL, AR, AZ, CA, CAO (California Offshore), CO, FL, KY, LA, MS, MT, ND, + NE, NGOM (federal offshore waters in the Gulf of Mexico), NM, NV, NY, OH, OR, PA, + SD, TX, UT, VA, WV, WY + + States with no Enverus Data: CT, DE, DC, GA, HI, ID, IL*, IN*, IA, ME, MA, MN, NH, + NJ, NC, RI, SC, VT, WA, WI, US territories. These states are assumed to have no oil + and gas production with an exception for IL and IN. 
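(Editorial sketch: the basin proxies in this patch split monthly well rows on `AAPG_CODE_ERG`, with codes 220, 395, and 430 each getting their own file and everything else falling to the "other" proxy. The toy rows below are invented for illustration; the code is compared as a string, matching the queries in these files.)

    import pandas as pd

    # Toy monthly well rows; AAPG_CODE_ERG is read from Enverus as a string.
    wells = pd.DataFrame({
        "AAPG_CODE_ERG": ["220", "395", "430", "160A", "220"],
        "GASPROD_01": [10.0, 5.0, 2.0, 8.0, 30.0],
    })

    basin_220 = wells.query("AAPG_CODE_ERG == '220'")
    basin_other = wells.query(
        "AAPG_CODE_ERG != '220' & AAPG_CODE_ERG != '395' & AAPG_CODE_ERG != '430'"
    )

    assert len(basin_220) == 2 and len(basin_other) == 1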
+ + *IL and IN do not report to Enverus, but do have oil and gas production. Production + data is taken from the Energy Information Administration (EIA). + + """ + + # Load in State ANSI data + state_gdf = ( + gpd.read_file(state_path) + .loc[:, ["NAME", "STATEFP", "STUSPS", "geometry"]] + .rename(columns=str.lower) + .rename(columns={"stusps": "state_code", "name": "state_name"}) + .astype({"statefp": int}) + # get only lower 48 + DC + .query("(statefp < 60) & (statefp != 2) & (statefp != 15)") + .reset_index(drop=True) + .to_crs(4326) + ) + + # Make Annual gridded arrays (maps) of well data (a well will be counted every month if there is any production that year) + # Includes NA Gas Wells and Production onshore in the CONUS region + # source emissions are related to the presence of a well and its production status (no emission if no production) + # Details: ERG does not include a well in the national count if there is no (cummulative) oil or gas production from that well. + # Wells are not considered active for a given year if there is no production data that year + # This may cause wells that are coadmpleted but not yet producing to be dropped from the national count. + # ERG has developed their own logic to determine if a well is an HF well or not and that result is included in the + # HF variable in this dataset. This method does not rely on the Enverus well 'Producing Status' + # Well Type (e.g., non-associated gas well) is determined based on annual production GOR at that well (CUM OIL/ CUM GAS), + # but the presence of a well will only be included in maps in months where monthly gas prod > 0 + + # Proxy Data Dataframes: + basin_395_prod_df = pd.DataFrame() # Gas well gas production in Basin 395 in a given month + + ## Enverus DI and Prism Data: + # Read in and query formatted and corrrected Enverus data to create dictionaries of + # proxy data (Enverus data is from task_enverus_di_prism_data_processing.py) + for iyear in years: + enverus_file_name_iyear = f"formatted_raw_enverus_tempoutput_{iyear}.csv" + enverus_file_path_iyear = os.path.join(intermediate_outputs_path, enverus_file_name_iyear) + ng_data_temp = (pd.read_csv(enverus_file_path_iyear, dtype={3:'str', 'spud_year': str, 'first_prod_year': str}) + .query("STATE_CODE.isin(@state_gdf['state_code'])") + .query("OFFSHORE == 'N'") + .query("CUM_GAS > 0") + .assign(gas_to_oil_ratio=lambda df: df['CUM_GAS']/df['CUM_OIL']) + .assign(year=str(iyear)) + .replace(np.inf, 0) + .astype({"spud_year": str, "first_prod_year": str}) + .query("gas_to_oil_ratio > 100 | GOR_QUAL == 'Gas only'") + ) + + # Include wells in map only for months where there is gas production (emissions ~ when production is occuring) + for imonth in range(1,13): + imonth_str = f"{imonth:02}" # convert to 2-digit months + year_month_str = str(iyear)+'-'+imonth_str + gas_prod_str = 'GASPROD_'+imonth_str + # Onshore data for imonth + ng_data_imonth_temp = (ng_data_temp + .query(f"{gas_prod_str} > 0") + .assign(year_month=str(iyear)+'-'+imonth_str) + ) + ng_data_imonth_temp = (ng_data_imonth_temp[[ + 'year', 'year_month','STATE_CODE','AAPG_CODE_ERG','LATITUDE','LONGITUDE', + 'HF','WELL_COUNT',gas_prod_str, + 'comp_year_month','spud_year','first_prod_year']] + ) + # Basin 395 Gas Well Gas Production + basin_395_prod_imonth = (ng_data_imonth_temp[['year','year_month','STATE_CODE','LATITUDE','LONGITUDE','AAPG_CODE_ERG',gas_prod_str]] + .query("AAPG_CODE_ERG == '395'") + .assign(proxy_data=lambda df: df[gas_prod_str]) + .drop(columns=[gas_prod_str, 'AAPG_CODE_ERG']) + 
.rename(columns=lambda x: str(x).lower()) + .reset_index(drop=True) + ) + basin_395_prod_df = pd.concat([basin_395_prod_df,basin_395_prod_imonth]) + + # Delete unused temp data + del ng_data_temp + del ng_data_imonth_temp + del basin_395_prod_imonth + + # Calculate relative emissions and convert to a geodataframe + basin_395_prod_df = calc_enverus_rel_emi(basin_395_prod_df) + basin_395_prod_df = enverus_df_to_gdf(basin_395_prod_df) + + # NEI Data: + # No addition of NEI data because IL and IN are not in this basin. We are adding + # them to the "other" basin. + + # Check that relative emissions sum to 1.0 each state/year combination + sums = basin_395_prod_df.groupby(["state_code", "year"])["rel_emi"].sum() # get sums to check normalization + assert np.isclose(sums, 1.0, atol=1e-8).all(), f"Relative emissions do not sum to 1 for each year and state; {sums}" # assert that the sums are close to 1 + + # Output Proxy Parquet Files + basin_395_prod_df = basin_395_prod_df.astype({'year':str}) + basin_395_prod_df.to_parquet(basin_395_prod_output_path) + + return None diff --git a/gch4i/proxy_processing/task_ng_basin_430_prod_proxy.py b/gch4i/proxy_processing/task_ng_basin_430_prod_proxy.py new file mode 100644 index 0000000..f8d082a --- /dev/null +++ b/gch4i/proxy_processing/task_ng_basin_430_prod_proxy.py @@ -0,0 +1,153 @@ +# %% +from pathlib import Path +import os +from typing import Annotated +from zipfile import ZipFile +import calendar +import datetime + +from pyarrow import parquet +import pandas as pd +import osgeo +import geopandas as gpd +import numpy as np +import seaborn as sns +import shapefile as shp +from pytask import Product, task, mark + +from gch4i.config import ( + V3_DATA_PATH, + proxy_data_dir_path, + global_data_dir_path, + sector_data_dir_path, + max_year, + min_year, + years, +) + +from gch4i.utils import us_state_to_abbrev +from gch4i.proxy_processing.ng_oil_production_utils import ( + calc_enverus_rel_emi, + enverus_df_to_gdf, +) + +# %% +@mark.persist +@task(id="ng_basin_430_prod_proxy") +def task_get_ng_basin_430_prod_proxy_data( + state_path: Path = global_data_dir_path / "tl_2020_us_state.zip", + enverus_production_path: Path = sector_data_dir_path / "enverus/production", + intermediate_outputs_path: Path = enverus_production_path / "intermediate_outputs", + basin_430_prod_output_path: Annotated[Path, Product] = proxy_data_dir_path / "ng_basin_430_prod_proxy.parquet", + ): + """ + Data come from Enverus, both Drilling Info and Prism + The reason 2 datasets are used is because Prism does not include all states + So remaining states, or those with more DI coverage are taken from DI + + DI: KS, MD, MI, MO, OK, TN + + Prism: AK, AL, AR, AZ, CA, CAO (California Offshore), CO, FL, KY, LA, MS, MT, ND, + NE, NGOM (federal offshore waters in the Gulf of Mexico), NM, NV, NY, OH, OR, PA, + SD, TX, UT, VA, WV, WY + + States with no Enverus Data: CT, DE, DC, GA, HI, ID, IL*, IN*, IA, ME, MA, MN, NH, + NJ, NC, RI, SC, VT, WA, WI, US territories. These states are assumed to have no oil + and gas production with an exception for IL and IN. + + *IL and IN do not report to Enverus, but do have oil and gas production. Production + data is taken from the Energy Information Administration (EIA). 
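(Editorial sketch: `calc_enverus_rel_emi` is imported from `ng_oil_production_utils` and its body is not shown in this patch. The sketch below illustrates the state/year normalization it is assumed to perform, consistent with the assertion each task checks before writing its parquet file: `rel_emi` sums to 1.0 over all monthly rows within a state/year.)

    import numpy as np
    import pandas as pd

    def calc_rel_emi_sketch(df: pd.DataFrame) -> pd.DataFrame:
        """Normalize proxy_data to state/year relative emissions (sketch only)."""
        df = df.copy()
        df["rel_emi"] = df.groupby(["state_code", "year"])["proxy_data"].transform(
            lambda x: x / x.sum() if x.sum() > 0 else 0
        )
        return df.drop(columns="proxy_data")

    demo = pd.DataFrame({
        "state_code": ["TX", "TX", "OK"],
        "year": ["2020", "2020", "2020"],
        "proxy_data": [3.0, 1.0, 5.0],
    })
    out = calc_rel_emi_sketch(demo)
    sums = out.groupby(["state_code", "year"])["rel_emi"].sum()
    assert np.isclose(sums, 1.0, atol=1e-8).all()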
+ + """ + + # Load in State ANSI data + state_gdf = ( + gpd.read_file(state_path) + .loc[:, ["NAME", "STATEFP", "STUSPS", "geometry"]] + .rename(columns=str.lower) + .rename(columns={"stusps": "state_code", "name": "state_name"}) + .astype({"statefp": int}) + # get only lower 48 + DC + .query("(statefp < 60) & (statefp != 2) & (statefp != 15)") + .reset_index(drop=True) + .to_crs(4326) + ) + + # Make Annual gridded arrays (maps) of well data (a well will be counted every month if there is any production that year) + # Includes NA Gas Wells and Production onshore in the CONUS region + # source emissions are related to the presence of a well and its production status (no emission if no production) + # Details: ERG does not include a well in the national count if there is no (cummulative) oil or gas production from that well. + # Wells are not considered active for a given year if there is no production data that year + # This may cause wells that are coadmpleted but not yet producing to be dropped from the national count. + # ERG has developed their own logic to determine if a well is an HF well or not and that result is included in the + # HF variable in this dataset. This method does not rely on the Enverus well 'Producing Status' + # Well Type (e.g., non-associated gas well) is determined based on annual production GOR at that well (CUM OIL/ CUM GAS), + # but the presence of a well will only be included in maps in months where monthly gas prod > 0 + + # Proxy Data Dataframes: + basin_430_prod_df = pd.DataFrame() # Gas well gas production in Basin 430 in a given month + + ## Enverus DI and Prism Data: + # Read in and query formatted and corrrected Enverus data to create dictionaries of + # proxy data (Enverus data is from task_enverus_di_prism_data_processing.py) + for iyear in years: + enverus_file_name_iyear = f"formatted_raw_enverus_tempoutput_{iyear}.csv" + enverus_file_path_iyear = os.path.join(intermediate_outputs_path, enverus_file_name_iyear) + ng_data_temp = (pd.read_csv(enverus_file_path_iyear, dtype={3:'str', 'spud_year': str, 'first_prod_year': str}) + .query("STATE_CODE.isin(@state_gdf['state_code'])") + .query("OFFSHORE == 'N'") + .query("CUM_GAS > 0") + .assign(gas_to_oil_ratio=lambda df: df['CUM_GAS']/df['CUM_OIL']) + .assign(year=str(iyear)) + .replace(np.inf, 0) + .astype({"spud_year": str, "first_prod_year": str}) + .query("gas_to_oil_ratio > 100 | GOR_QUAL == 'Gas only'") + ) + + # Include wells in map only for months where there is gas production (emissions ~ when production is occuring) + for imonth in range(1,13): + imonth_str = f"{imonth:02}" # convert to 2-digit months + year_month_str = str(iyear)+'-'+imonth_str + gas_prod_str = 'GASPROD_'+imonth_str + # Onshore data for imonth + ng_data_imonth_temp = (ng_data_temp + .query(f"{gas_prod_str} > 0") + .assign(year_month=str(iyear)+'-'+imonth_str) + ) + ng_data_imonth_temp = (ng_data_imonth_temp[[ + 'year', 'year_month','STATE_CODE','AAPG_CODE_ERG','LATITUDE','LONGITUDE', + 'HF','WELL_COUNT',gas_prod_str, + 'comp_year_month','spud_year','first_prod_year']] + ) + # Basin 430 Gas Well Gas Production + basin_430_prod_imonth = (ng_data_imonth_temp[['year','year_month','STATE_CODE','LATITUDE','LONGITUDE','AAPG_CODE_ERG',gas_prod_str]] + .query("AAPG_CODE_ERG == '430'") + .assign(proxy_data=lambda df: df[gas_prod_str]) + .drop(columns=[gas_prod_str, 'AAPG_CODE_ERG']) + .rename(columns=lambda x: str(x).lower()) + .reset_index(drop=True) + ) + basin_430_prod_df = pd.concat([basin_430_prod_df,basin_430_prod_imonth]) + + # 
Delete unused temp data + del ng_data_temp + del ng_data_imonth_temp + del basin_430_prod_imonth + + # Calculate relative emissions and convert to a geodataframe + basin_430_prod_df = calc_enverus_rel_emi(basin_430_prod_df) + basin_430_prod_df = enverus_df_to_gdf(basin_430_prod_df) + + # NEI Data: + # No addition of NEI data because IL and IN are not in this basin. We are adding + # them to the "other" basin. + + # Check that relative emissions sum to 1.0 each state/year combination + sums = basin_430_prod_df.groupby(["state_code", "year"])["rel_emi"].sum() # get sums to check normalization + assert np.isclose(sums, 1.0, atol=1e-8).all(), f"Relative emissions do not sum to 1 for each year and state; {sums}" # assert that the sums are close to 1 + + # Output Proxy Parquet Files + basin_430_prod_df = basin_430_prod_df.astype({'year':str}) + basin_430_prod_df.to_parquet(basin_430_prod_output_path) + + return None diff --git a/gch4i/proxy_processing/task_ng_basin_other_prod_proxy.py b/gch4i/proxy_processing/task_ng_basin_other_prod_proxy.py new file mode 100644 index 0000000..2bb122d --- /dev/null +++ b/gch4i/proxy_processing/task_ng_basin_other_prod_proxy.py @@ -0,0 +1,177 @@ +# %% +from pathlib import Path +import os +from typing import Annotated +from zipfile import ZipFile +import calendar +import datetime + +from pyarrow import parquet +import pandas as pd +import osgeo +import geopandas as gpd +import numpy as np +import seaborn as sns +import shapefile as shp +from pytask import Product, task, mark + +from gch4i.config import ( + V3_DATA_PATH, + proxy_data_dir_path, + global_data_dir_path, + sector_data_dir_path, + max_year, + min_year, + years, +) + +from gch4i.utils import us_state_to_abbrev +from gch4i.proxy_processing.ng_oil_production_utils import ( + calc_enverus_rel_emi, + enverus_df_to_gdf, + nei_data_years, + get_nei_file_name, + ng_gas_prod_file_names, + get_raw_NEI_data, +) + +# %% +@mark.persist +@task(id="ng_basin_other_prod_proxy") +def task_get_ng_basin_other_prod_proxy_data( + state_path: Path = global_data_dir_path / "tl_2020_us_state.zip", + enverus_production_path: Path = sector_data_dir_path / "enverus/production", + intermediate_outputs_path: Path = enverus_production_path / "intermediate_outputs", + nei_path: Path = sector_data_dir_path / "nei_og", + basin_other_prod_output_path: Annotated[Path, Product] = proxy_data_dir_path / "ng_basin_other_prod_proxy.parquet", + ): + """ + Data come from Enverus, both Drilling Info and Prism + The reason 2 datasets are used is because Prism does not include all states + So remaining states, or those with more DI coverage are taken from DI + + DI: KS, MD, MI, MO, OK, TN + + Prism: AK, AL, AR, AZ, CA, CAO (California Offshore), CO, FL, KY, LA, MS, MT, ND, + NE, NGOM (federal offshore waters in the Gulf of Mexico), NM, NV, NY, OH, OR, PA, + SD, TX, UT, VA, WV, WY + + States with no Enverus Data: CT, DE, DC, GA, HI, ID, IL*, IN*, IA, ME, MA, MN, NH, + NJ, NC, RI, SC, VT, WA, WI, US territories. These states are assumed to have no oil + and gas production with an exception for IL and IN. + + *IL and IN do not report to Enverus, but do have oil and gas production. Production + data is taken from the Energy Information Administration (EIA). 
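(Editorial sketch: where NEI data are merged in, these tasks convert the NEI grid-cell polygons to centroid points by reprojecting to a projected CRS first, since centroids computed directly in lat/lon degrees are inaccurate. A self-contained sketch of that step, with one toy cell standing in for the real NEI reference grid geometry:)

    import geopandas as gpd
    from shapely.geometry import Polygon

    # One toy NEI grid cell; real cells come from the NEI reference grid shapefile.
    nei = gpd.GeoDataFrame(
        {"state_code": ["IL"], "rel_emi": [1.0]},
        geometry=[Polygon([(-89.0, 40.0), (-88.9, 40.0), (-88.9, 40.1), (-89.0, 40.1)])],
        crs=4326,
    )

    # Project before taking centroids, then return to lat/lon for output.
    nei_pts = nei.to_crs(3857)
    nei_pts["geometry"] = nei_pts.geometry.centroid
    nei_pts = nei_pts.to_crs(4326)

    assert (nei_pts.geometry.geom_type == "Point").all()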
+ + """ + + # Load in State ANSI data + state_gdf = ( + gpd.read_file(state_path) + .loc[:, ["NAME", "STATEFP", "STUSPS", "geometry"]] + .rename(columns=str.lower) + .rename(columns={"stusps": "state_code", "name": "state_name"}) + .astype({"statefp": int}) + # get only lower 48 + DC + .query("(statefp < 60) & (statefp != 2) & (statefp != 15)") + .reset_index(drop=True) + .to_crs(4326) + ) + + # Make Annual gridded arrays (maps) of well data (a well will be counted every month if there is any production that year) + # Includes NA Gas Wells and Production onshore in the CONUS region + # source emissions are related to the presence of a well and its production status (no emission if no production) + # Details: ERG does not include a well in the national count if there is no (cummulative) oil or gas production from that well. + # Wells are not considered active for a given year if there is no production data that year + # This may cause wells that are coadmpleted but not yet producing to be dropped from the national count. + # ERG has developed their own logic to determine if a well is an HF well or not and that result is included in the + # HF variable in this dataset. This method does not rely on the Enverus well 'Producing Status' + # Well Type (e.g., non-associated gas well) is determined based on annual production GOR at that well (CUM OIL/ CUM GAS), + # but the presence of a well will only be included in maps in months where monthly gas prod > 0 + + # Proxy Data Dataframes: + basin_other_prod_df = pd.DataFrame() # Gas well gas production in Other Basins in a given month + + ## Enverus DI and Prism Data: + # Read in and query formatted and corrrected Enverus data to create dictionaries of + # proxy data (Enverus data is from task_enverus_di_prism_data_processing.py) + for iyear in years: + enverus_file_name_iyear = f"formatted_raw_enverus_tempoutput_{iyear}.csv" + enverus_file_path_iyear = os.path.join(intermediate_outputs_path, enverus_file_name_iyear) + ng_data_temp = (pd.read_csv(enverus_file_path_iyear, dtype={3:'str', 'spud_year': str, 'first_prod_year': str}) + .query("STATE_CODE.isin(@state_gdf['state_code'])") + .query("OFFSHORE == 'N'") + .query("CUM_GAS > 0") + .assign(gas_to_oil_ratio=lambda df: df['CUM_GAS']/df['CUM_OIL']) + .assign(year=str(iyear)) + .replace(np.inf, 0) + .astype({"spud_year": str, "first_prod_year": str}) + .query("gas_to_oil_ratio > 100 | GOR_QUAL == 'Gas only'") + ) + + # Include wells in map only for months where there is gas production (emissions ~ when production is occuring) + for imonth in range(1,13): + imonth_str = f"{imonth:02}" # convert to 2-digit months + year_month_str = str(iyear)+'-'+imonth_str + gas_prod_str = 'GASPROD_'+imonth_str + # Onshore data for imonth + ng_data_imonth_temp = (ng_data_temp + .query(f"{gas_prod_str} > 0") + .assign(year_month=str(iyear)+'-'+imonth_str) + ) + ng_data_imonth_temp = (ng_data_imonth_temp[[ + 'year', 'year_month','STATE_CODE','AAPG_CODE_ERG','LATITUDE','LONGITUDE', + 'HF','WELL_COUNT',gas_prod_str, + 'comp_year_month','spud_year','first_prod_year']] + ) + # "Other" Basin Gas Production + basin_other_prod_imonth = (ng_data_imonth_temp[['year','year_month','STATE_CODE','LATITUDE','LONGITUDE','AAPG_CODE_ERG',gas_prod_str]] + .query("AAPG_CODE_ERG != '220' & AAPG_CODE_ERG != '395' & AAPG_CODE_ERG != '430'") + .assign(proxy_data=lambda df: df[gas_prod_str]) + .drop(columns=[gas_prod_str, 'AAPG_CODE_ERG']) + .rename(columns=lambda x: str(x).lower()) + .reset_index(drop=True) + ) + basin_other_prod_df = 
pd.concat([basin_other_prod_df,basin_other_prod_imonth]) + + # Delete unused temp data + del ng_data_temp + del ng_data_imonth_temp + del basin_other_prod_imonth + + # Calculate relative emissions and convert to a geodataframe + basin_other_prod_df = calc_enverus_rel_emi(basin_other_prod_df) + basin_other_prod_df = enverus_df_to_gdf(basin_other_prod_df) + + # NEI Data: + nei_df = pd.DataFrame() + + for iyear in years: + nei_data_year = nei_data_years[nei_data_years['year'] == iyear]['nei_data'].values[0] + # Gas Production + ifile_name = get_nei_file_name(nei_data_year, ng_gas_prod_file_names) + nei_iyear = get_raw_NEI_data(iyear, nei_data_year, ifile_name) + nei_df = pd.concat([nei_df, nei_iyear]) + + # Convert NEI Data to GDF and polygon to centroid point + nei_df = gpd.GeoDataFrame(nei_df, crs=4326) + nei_df = nei_df.to_crs(3857) # projected CRS for centroid calculation + nei_df.loc[:, 'geometry'] = nei_df.loc[:, 'geometry'].centroid + nei_df = nei_df.to_crs(4326) + + # Add NEI Data to Enverus Data + basin_other_prod_df = pd.concat([basin_other_prod_df, nei_df]).reset_index(drop=True) + + # Delete unused temp data + del nei_iyear + del nei_df + + # Check that relative emissions sum to 1.0 each state/year combination + sums = basin_other_prod_df.groupby(["state_code", "year"])["rel_emi"].sum() # get sums to check normalization + assert np.isclose(sums, 1.0, atol=1e-8).all(), f"Relative emissions do not sum to 1 for each year and state; {sums}" # assert that the sums are close to 1 + + # Output Proxy Parquet Files + basin_other_prod_df = basin_other_prod_df.astype({'year':str}) + basin_other_prod_df.to_parquet(basin_other_prod_output_path) + + return None diff --git a/gch4i/proxy_processing/task_ng_conv_well_comp_proxy.py b/gch4i/proxy_processing/task_ng_conv_well_comp_proxy.py new file mode 100644 index 0000000..78f41e5 --- /dev/null +++ b/gch4i/proxy_processing/task_ng_conv_well_comp_proxy.py @@ -0,0 +1,179 @@ +# %% +from pathlib import Path +import os +from typing import Annotated +from zipfile import ZipFile +import calendar +import datetime + +from pyarrow import parquet +import pandas as pd +import osgeo +import geopandas as gpd +import numpy as np +import seaborn as sns +import shapefile as shp +from pytask import Product, task, mark + +from gch4i.config import ( + V3_DATA_PATH, + proxy_data_dir_path, + global_data_dir_path, + sector_data_dir_path, + max_year, + min_year, + years, +) + +from gch4i.utils import us_state_to_abbrev +from gch4i.proxy_processing.ng_oil_production_utils import ( + calc_enverus_rel_emi, + enverus_df_to_gdf, + nei_data_years, + get_nei_file_name, + ng_comp_count_file_names, + get_raw_NEI_data, +) + +# %% +@mark.persist +@task(id="ng_conv_well_comp_proxy") +def task_get_ng_conv_well_comp_proxy_data( + state_path: Path = global_data_dir_path / "tl_2020_us_state.zip", + enverus_production_path: Path = sector_data_dir_path / "enverus/production", + intermediate_outputs_path: Path = enverus_production_path / "intermediate_outputs", + nei_path: Path = sector_data_dir_path / "nei_og", + conv_well_comp_output_path: Annotated[Path, Product] = proxy_data_dir_path / "ng_conv_well_comp_proxy.parquet", + ): + """ + Data come from Enverus, both Drilling Info and Prism + The reason 2 datasets are used is because Prism does not include all states + So remaining states, or those with more DI coverage are taken from DI + + DI: KS, MD, MI, MO, OK, TN + + Prism: AK, AL, AR, AZ, CA, CAO (California Offshore), CO, FL, KY, LA, MS, MT, ND, + NE, NGOM (federal offshore waters in 
the Gulf of Mexico), NM, NV, NY, OH, OR, PA, + SD, TX, UT, VA, WV, WY + + States with no Enverus Data: CT, DE, DC, GA, HI, ID, IL*, IN*, IA, ME, MA, MN, NH, + NJ, NC, RI, SC, VT, WA, WI, US territories. These states are assumed to have no oil + and gas production with an exception for IL and IN. + + *IL and IN do not report to Enverus, but do have oil and gas production. Production + data is taken from the Energy Information Administration (EIA). + + """ + + # Load in State ANSI data + state_gdf = ( + gpd.read_file(state_path) + .loc[:, ["NAME", "STATEFP", "STUSPS", "geometry"]] + .rename(columns=str.lower) + .rename(columns={"stusps": "state_code", "name": "state_name"}) + .astype({"statefp": int}) + # get only lower 48 + DC + .query("(statefp < 60) & (statefp != 2) & (statefp != 15)") + .reset_index(drop=True) + .to_crs(4326) + ) + + # Make Annual gridded arrays (maps) of well data (a well will be counted every month if there is any production that year) + # Includes NA Gas Wells and Production onshore in the CONUS region + # source emissions are related to the presence of a well and its production status (no emission if no production) + # Details: ERG does not include a well in the national count if there is no (cummulative) oil or gas production from that well. + # Wells are not considered active for a given year if there is no production data that year + # This may cause wells that are coadmpleted but not yet producing to be dropped from the national count. + # ERG has developed their own logic to determine if a well is an HF well or not and that result is included in the + # HF variable in this dataset. This method does not rely on the Enverus well 'Producing Status' + # Well Type (e.g., non-associated gas well) is determined based on annual production GOR at that well (CUM OIL/ CUM GAS), + # but the presence of a well will only be included in maps in months where monthly gas prod > 0 + + # Proxy Data Dataframes: + conv_well_comp_df = pd.DataFrame() # Conventional well completions + + ## Enverus DI and Prism Data: + # Read in and query formatted and corrrected Enverus data to create dictionaries of + # proxy data (Enverus data is from task_enverus_di_prism_data_processing.py) + for iyear in years: + enverus_file_name_iyear = f"formatted_raw_enverus_tempoutput_{iyear}.csv" + enverus_file_path_iyear = os.path.join(intermediate_outputs_path, enverus_file_name_iyear) + ng_data_temp = (pd.read_csv(enverus_file_path_iyear, dtype={3:'str', 'spud_year': str, 'first_prod_year': str, 'comp_year_month': str}) + .query("STATE_CODE.isin(@state_gdf['state_code'])") + .query("OFFSHORE == 'N'") + .query("CUM_GAS > 0") + .assign(gas_to_oil_ratio=lambda df: df['CUM_GAS']/df['CUM_OIL']) + .assign(year=str(iyear)) + .replace(np.inf, 0) + .astype({"spud_year": str, "first_prod_year": str, "comp_year_month": str}) + .query("gas_to_oil_ratio > 100 | GOR_QUAL == 'Gas only'") + ) + + # Include wells in map only for months where there is gas production (emissions ~ when production is occuring) + for imonth in range(1,13): + imonth_str = f"{imonth:02}" # convert to 2-digit months + year_month_str = str(iyear)+'-'+imonth_str + gas_prod_str = 'GASPROD_'+imonth_str + # Onshore data for imonth + ng_data_imonth_temp = (ng_data_temp + .query(f"{gas_prod_str} > 0") + .assign(year_month=str(iyear)+'-'+imonth_str) + ) + ng_data_imonth_temp = (ng_data_imonth_temp[[ + 'year', 'year_month','STATE_CODE','AAPG_CODE_ERG','LATITUDE','LONGITUDE', + 'HF','WELL_COUNT',gas_prod_str, + 
'comp_year_month','spud_year','first_prod_year']] + ) + # Conventional Well Completions + conv_well_comp_imonth = (ng_data_imonth_temp[['year','year_month','STATE_CODE','LATITUDE','LONGITUDE','WELL_COUNT','HF','comp_year_month']] + .query("HF != 'Y'") + .drop(columns=["HF"]) + .rename(columns=lambda x: str(x).lower()) + .rename(columns={"well_count":"proxy_data"}) + .query(f"comp_year_month == '{year_month_str}'") + .drop(columns=["comp_year_month"]) + .reset_index(drop=True) + ) + conv_well_comp_df = pd.concat([conv_well_comp_df,conv_well_comp_imonth]) + + # Delete unused temp data + del ng_data_temp + del ng_data_imonth_temp + del conv_well_comp_imonth + + # Calculate relative emissions and convert to a geodataframe + conv_well_comp_df = calc_enverus_rel_emi(conv_well_comp_df ) + conv_well_comp_df = enverus_df_to_gdf(conv_well_comp_df ) + + # NEI Data: + nei_df = pd.DataFrame() + + for iyear in years: + nei_data_year = nei_data_years[nei_data_years['year'] == iyear]['nei_data'].values[0] + # Well Count + ifile_name = get_nei_file_name(nei_data_year, ng_comp_count_file_names) + nei_iyear = get_raw_NEI_data(iyear, nei_data_year, ifile_name) + nei_df = pd.concat([nei_df, nei_iyear]) + + # Convert NEI Data to GDF and polygon to centroid point + nei_df = gpd.GeoDataFrame(nei_df, crs=4326) + nei_df = nei_df.to_crs(3857) # projected CRS for centroid calculation + nei_df.loc[:, 'geometry'] = nei_df.loc[:, 'geometry'].centroid + nei_df = nei_df.to_crs(4326) + + # Add NEI Data to Enverus Data + conv_well_comp_df = pd.concat([conv_well_comp_df, nei_df]).reset_index(drop=True) + + # Delete unused temp data + del nei_iyear + del nei_df + + # Check that relative emissions sum to 1.0 each state/year combination + sums = conv_well_comp_df.groupby(["state_code", "year"])["rel_emi"].sum() # get sums to check normalization + assert np.isclose(sums, 1.0, atol=1e-8).all(), f"Relative emissions do not sum to 1 for each year and state; {sums}" # assert that the sums are close to 1 + + # Output Proxy Parquet Files + conv_well_comp_df = conv_well_comp_df.astype({'year':str}) + conv_well_comp_df.to_parquet(conv_well_comp_output_path) + + return None diff --git a/gch4i/proxy_processing/task_ng_conv_well_count_proxy.py b/gch4i/proxy_processing/task_ng_conv_well_count_proxy.py new file mode 100644 index 0000000..bc8c4d4 --- /dev/null +++ b/gch4i/proxy_processing/task_ng_conv_well_count_proxy.py @@ -0,0 +1,176 @@ +# %% +from pathlib import Path +import os +from typing import Annotated +from zipfile import ZipFile +import calendar +import datetime + +from pyarrow import parquet +import pandas as pd +import osgeo +import geopandas as gpd +import numpy as np +import seaborn as sns +import shapefile as shp +from pytask import Product, task, mark + +from gch4i.config import ( + V3_DATA_PATH, + proxy_data_dir_path, + global_data_dir_path, + sector_data_dir_path, + max_year, + min_year, + years, +) + +from gch4i.utils import us_state_to_abbrev +from gch4i.proxy_processing.ng_oil_production_utils import ( + calc_enverus_rel_emi, + enverus_df_to_gdf, + nei_data_years, + get_nei_file_name, + ng_well_count_file_names, + get_raw_NEI_data, +) + +# %% +@mark.persist +@task(id="ng_conv_well_count_proxy") +def task_get_ng_conv_well_count_proxy_data( + state_path: Path = global_data_dir_path / "tl_2020_us_state.zip", + enverus_production_path: Path = sector_data_dir_path / "enverus/production", + intermediate_outputs_path: Path = enverus_production_path / "intermediate_outputs", + nei_path: Path = sector_data_dir_path / "nei_og", + 
conv_well_count_output_path: Annotated[Path, Product] = proxy_data_dir_path / "ng_conv_well_count_proxy.parquet", + ): + """ + Data come from Enverus, both Drilling Info and Prism + The reason 2 datasets are used is because Prism does not include all states + So remaining states, or those with more DI coverage are taken from DI + + DI: KS, MD, MI, MO, OK, TN + + Prism: AK, AL, AR, AZ, CA, CAO (California Offshore), CO, FL, KY, LA, MS, MT, ND, + NE, NGOM (federal offshore waters in the Gulf of Mexico), NM, NV, NY, OH, OR, PA, + SD, TX, UT, VA, WV, WY + + States with no Enverus Data: CT, DE, DC, GA, HI, ID, IL*, IN*, IA, ME, MA, MN, NH, + NJ, NC, RI, SC, VT, WA, WI, US territories. These states are assumed to have no oil + and gas production with an exception for IL and IN. + + *IL and IN do not report to Enverus, but do have oil and gas production. Production + data is taken from the Energy Information Administration (EIA). + + """ + + # Load in State ANSI data + state_gdf = ( + gpd.read_file(state_path) + .loc[:, ["NAME", "STATEFP", "STUSPS", "geometry"]] + .rename(columns=str.lower) + .rename(columns={"stusps": "state_code", "name": "state_name"}) + .astype({"statefp": int}) + # get only lower 48 + DC + .query("(statefp < 60) & (statefp != 2) & (statefp != 15)") + .reset_index(drop=True) + .to_crs(4326) + ) + + # Make Annual gridded arrays (maps) of well data (a well will be counted every month if there is any production that year) + # Includes NA Gas Wells and Production onshore in the CONUS region + # source emissions are related to the presence of a well and its production status (no emission if no production) + # Details: ERG does not include a well in the national count if there is no (cummulative) oil or gas production from that well. + # Wells are not considered active for a given year if there is no production data that year + # This may cause wells that are coadmpleted but not yet producing to be dropped from the national count. + # ERG has developed their own logic to determine if a well is an HF well or not and that result is included in the + # HF variable in this dataset. 
This method does not rely on the Enverus well 'Producing Status' + # Well Type (e.g., non-associated gas well) is determined based on annual production GOR at that well (CUM OIL/ CUM GAS), + # but the presence of a well will only be included in maps in months where monthly gas prod > 0 + + # Proxy Data Dataframes: + conv_well_count_df = pd.DataFrame() # Active conventional gas well counts in a given month + + # Read in and query formatted and corrrected Enverus data to create dictionaries of + # proxy data (Enverus data is from task_enverus_di_prism_data_processing.py) + for iyear in years: + enverus_file_name_iyear = f"formatted_raw_enverus_tempoutput_{iyear}.csv" + enverus_file_path_iyear = os.path.join(intermediate_outputs_path, enverus_file_name_iyear) + ng_data_temp = (pd.read_csv(enverus_file_path_iyear, dtype={3:'str', 'spud_year': str, 'first_prod_year': str}) + .query("STATE_CODE.isin(@state_gdf['state_code'])") + .query("OFFSHORE == 'N'") + .query("CUM_GAS > 0") + .assign(gas_to_oil_ratio=lambda df: df['CUM_GAS']/df['CUM_OIL']) + .assign(year=str(iyear)) + .replace(np.inf, 0) + .astype({"spud_year": str, "first_prod_year": str}) + .query("gas_to_oil_ratio > 100 | GOR_QUAL == 'Gas only'") + ) + + # Include wells in map only for months where there is gas production (emissions ~ when production is occuring) + for imonth in range(1,13): + imonth_str = f"{imonth:02}" # convert to 2-digit months + year_month_str = str(iyear)+'-'+imonth_str + gas_prod_str = 'GASPROD_'+imonth_str + # Onshore data for imonth + ng_data_imonth_temp = (ng_data_temp + .query(f"{gas_prod_str} > 0") + .assign(year_month=str(iyear)+'-'+imonth_str) + ) + ng_data_imonth_temp = (ng_data_imonth_temp[[ + 'year', 'year_month','STATE_CODE','AAPG_CODE_ERG','LATITUDE','LONGITUDE', + 'HF','WELL_COUNT',gas_prod_str, + 'comp_year_month','spud_year','first_prod_year']] + ) + # Conventional Gas Well Count + conv_well_count_imonth = (ng_data_imonth_temp[['year','year_month','STATE_CODE','LATITUDE','LONGITUDE','WELL_COUNT','HF']] + .query("HF != 'Y'") + .drop(columns=["HF"]) + .rename(columns=lambda x: str(x).lower()) + .rename(columns={"well_count":"proxy_data"}) + .reset_index(drop=True) + ) + conv_well_count_df = pd.concat([conv_well_count_df,conv_well_count_imonth]) + + # Delete unused temp data + del ng_data_temp + del ng_data_imonth_temp + del conv_well_count_imonth + + # Calculate relative emissions and convert to a geodataframe + conv_well_count_df = calc_enverus_rel_emi(conv_well_count_df) + conv_well_count_df = enverus_df_to_gdf(conv_well_count_df) + + # NEI Data: + nei_df = pd.DataFrame() + + for iyear in years: + nei_data_year = nei_data_years[nei_data_years['year'] == iyear]['nei_data'].values[0] + # Well Count + ifile_name = get_nei_file_name(nei_data_year, ng_well_count_file_names) + nei_iyear = get_raw_NEI_data(iyear, nei_data_year, ifile_name) + nei_df = pd.concat([nei_df, nei_iyear]) + + # Convert NEI Data to GDF and polygon to centroid point + nei_df = gpd.GeoDataFrame(nei_df, crs=4326) + nei_df = nei_df.to_crs(3857) # projected CRS for centroid calculation + nei_df.loc[:, 'geometry'] = nei_df.loc[:, 'geometry'].centroid + nei_df = nei_df.to_crs(4326) + + # Add NEI Data to Enverus Data + conv_well_count_df = pd.concat([conv_well_count_df, nei_df]).reset_index(drop=True) + + # Delete unused temp data + del nei_iyear + del nei_df + + # Check that relative emissions sum to 1.0 each state/year combination + sums = conv_well_count_df.groupby(["state_code", "year"])["rel_emi"].sum() # get sums to check 
normalization + assert np.isclose(sums, 1.0, atol=1e-8).all(), f"Relative emissions do not sum to 1 for each year and state; {sums}" # assert that the sums are close to 1 + + # Output Proxy Parquet Files + conv_well_count_df = conv_well_count_df.astype({'year':str}) + conv_well_count_df.to_parquet(conv_well_count_output_path) + + return None diff --git a/gch4i/proxy_processing/task_ng_drilled_well_proxy.py b/gch4i/proxy_processing/task_ng_drilled_well_proxy.py new file mode 100644 index 0000000..c85236c --- /dev/null +++ b/gch4i/proxy_processing/task_ng_drilled_well_proxy.py @@ -0,0 +1,180 @@ +# %% +from pathlib import Path +import os +from typing import Annotated +from zipfile import ZipFile +import calendar +import datetime + +from pyarrow import parquet +import pandas as pd +import osgeo +import geopandas as gpd +import numpy as np +import seaborn as sns +import shapefile as shp +from pytask import Product, task, mark + +from gch4i.config import ( + V3_DATA_PATH, + proxy_data_dir_path, + global_data_dir_path, + sector_data_dir_path, + max_year, + min_year, + years, +) + +from gch4i.utils import us_state_to_abbrev +from gch4i.proxy_processing.ng_oil_production_utils import ( + calc_enverus_rel_emi, + enverus_df_to_gdf, + nei_data_years, + get_nei_file_name, + ng_spud_count_file_names, + get_raw_NEI_data, +) + +# %% +@mark.persist +@task(id="ng_drilled_well_proxy") +def task_get_ng_drilled_well_proxy_data( + state_path: Path = global_data_dir_path / "tl_2020_us_state.zip", + enverus_production_path: Path = sector_data_dir_path / "enverus/production", + intermediate_outputs_path: Path = enverus_production_path / "intermediate_outputs", + nei_path: Path = sector_data_dir_path / "nei_og", + drilled_well_output_path: Annotated[Path, Product] = proxy_data_dir_path / "ng_drilled_well_proxy.parquet", + ): + """ + Data come from Enverus, both Drilling Info and Prism + The reason 2 datasets are used is because Prism does not include all states + So remaining states, or those with more DI coverage are taken from DI + + DI: KS, MD, MI, MO, OK, TN + + Prism: AK, AL, AR, AZ, CA, CAO (California Offshore), CO, FL, KY, LA, MS, MT, ND, + NE, NGOM (federal offshore waters in the Gulf of Mexico), NM, NV, NY, OH, OR, PA, + SD, TX, UT, VA, WV, WY + + States with no Enverus Data: CT, DE, DC, GA, HI, ID, IL*, IN*, IA, ME, MA, MN, NH, + NJ, NC, RI, SC, VT, WA, WI, US territories. These states are assumed to have no oil + and gas production with an exception for IL and IN. + + *IL and IN do not report to Enverus, but do have oil and gas production. Production + data is taken from the Energy Information Administration (EIA). + + """ + + # Load in State ANSI data + state_gdf = ( + gpd.read_file(state_path) + .loc[:, ["NAME", "STATEFP", "STUSPS", "geometry"]] + .rename(columns=str.lower) + .rename(columns={"stusps": "state_code", "name": "state_name"}) + .astype({"statefp": int}) + # get only lower 48 + DC + .query("(statefp < 60) & (statefp != 2) & (statefp != 15)") + .reset_index(drop=True) + .to_crs(4326) + ) + + # Make Annual gridded arrays (maps) of well data (a well will be counted every month if there is any production that year) + # Includes NA Gas Wells and Production onshore in the CONUS region + # source emissions are related to the presence of a well and its production status (no emission if no production) + # Details: ERG does not include a well in the national count if there is no (cummulative) oil or gas production from that well. 
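+    #   (Hypothetical illustration: a well that reports zero gas production for every
+    #   month of the year has CUM_GAS = 0, so the "CUM_GAS > 0" filter below drops it
+    #   and it contributes neither a well count nor production to the proxy maps.)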
+    # Wells are not considered active for a given year if there is no production data that year.
+    # This may cause wells that are completed but not yet producing to be dropped from the national count.
+    # ERG has developed their own logic to determine whether a well is an HF well, and that result is included in the
+    # HF variable in this dataset. This method does not rely on the Enverus well 'Producing Status'.
+    # Well type (e.g., non-associated gas well) is determined based on the annual production GOR at that well (CUM_GAS / CUM_OIL),
+    # but the presence of a well is only included in maps for months where monthly gas production > 0.
+
+    # Proxy Data Dataframes:
+    drilled_well_df = pd.DataFrame()  # Gas wells drilled
+
+    ## Enverus DI and Prism Data:
+    # Read in and query formatted and corrected Enverus data to create dataframes of
+    # proxy data (Enverus data is from task_enverus_di_prism_data_processing.py)
+    for iyear in years:
+        enverus_file_name_iyear = f"formatted_raw_enverus_tempoutput_{iyear}.csv"
+        enverus_file_path_iyear = os.path.join(intermediate_outputs_path, enverus_file_name_iyear)
+        ng_data_temp = (pd.read_csv(enverus_file_path_iyear, dtype={3: 'str', 'spud_year': str, 'first_prod_year': str})
+                        .query("STATE_CODE.isin(@state_gdf['state_code'])")
+                        .query("OFFSHORE == 'N'")
+                        .query("CUM_GAS > 0")
+                        .assign(gas_to_oil_ratio=lambda df: df['CUM_GAS'] / df['CUM_OIL'])
+                        .assign(year=str(iyear))
+                        .replace(np.inf, 0)
+                        .astype({"spud_year": str, "first_prod_year": str})
+                        .query("gas_to_oil_ratio > 100 | GOR_QUAL == 'Gas only'")
+                        )
+
+        # Include wells in the map only for months with gas production (emissions ~ when production is occurring)
+        for imonth in range(1, 13):
+            imonth_str = f"{imonth:02}"  # convert to 2-digit months
+            year_month_str = str(iyear) + '-' + imonth_str
+            gas_prod_str = 'GASPROD_' + imonth_str
+            # Onshore data for imonth
+            ng_data_imonth_temp = (ng_data_temp
+                                   .query(f"{gas_prod_str} > 0")
+                                   .assign(year_month=year_month_str)
+                                   )
+            ng_data_imonth_temp = (ng_data_imonth_temp[[
+                'year', 'year_month', 'STATE_CODE', 'AAPG_CODE_ERG', 'LATITUDE', 'LONGITUDE',
+                'HF', 'WELL_COUNT', gas_prod_str,
+                'comp_year_month', 'spud_year', 'first_prod_year']]
+                )
+            # Drilled gas wells
+            drilled_well_imonth = (ng_data_imonth_temp[['year', 'year_month', 'STATE_CODE', 'LATITUDE', 'LONGITUDE', 'WELL_COUNT', 'HF', 'spud_year', 'first_prod_year']]
+                                   .rename(columns=lambda x: str(x).lower())
+                                   .rename(columns={"well_count": "proxy_data"})
+                                   # wells with a spud date or first production date in the current year
+                                   .query(f"spud_year == '{iyear}' | first_prod_year == '{iyear}'")
+                                   # keep wells with spud_year == iyear or, if there is no spud date, first_prod_year == iyear
+                                   .query(f"spud_year == '{iyear}' | spud_year == 'NaN' | spud_year == 'nan'")
+                                   .drop(columns=['hf', 'spud_year', 'first_prod_year'])
+                                   .reset_index(drop=True)
+                                   )
+            drilled_well_df = pd.concat([drilled_well_df, drilled_well_imonth])
+
+    # Delete unused temp data
+    del ng_data_temp
+    del ng_data_imonth_temp
+    del drilled_well_imonth
+
+    # Calculate relative emissions and convert to a geodataframe
+    drilled_well_df = calc_enverus_rel_emi(drilled_well_df)
+    drilled_well_df = enverus_df_to_gdf(drilled_well_df)
+
+    # NEI Data:
+    nei_df = pd.DataFrame()
+
+    for iyear in years:
+        nei_data_year = nei_data_years[nei_data_years['year'] == iyear]['nei_data'].values[0]
+        # Spud (drilled well) count
+        ifile_name = get_nei_file_name(nei_data_year, ng_spud_count_file_names)
+        nei_iyear = get_raw_NEI_data(iyear, nei_data_year, ifile_name)
+        nei_df = pd.concat([nei_df, 
nei_iyear]) + + # Convert NEI Data to GDF and polygon to centroid point + nei_df = gpd.GeoDataFrame(nei_df, crs=4326) + nei_df = nei_df.to_crs(3857) # projected CRS for centroid calculation + nei_df.loc[:, 'geometry'] = nei_df.loc[:, 'geometry'].centroid + nei_df = nei_df.to_crs(4326) + + # Add NEI Data to Enverus Data + drilled_well_df = pd.concat([drilled_well_df, nei_df]).reset_index(drop=True) + + # Delete unused temp data + del nei_iyear + del nei_df + + # Check that relative emissions sum to 1.0 each state/year combination + sums = drilled_well_df.groupby(["state_code", "year"])["rel_emi"].sum() # get sums to check normalization + assert np.isclose(sums, 1.0, atol=1e-8).all(), f"Relative emissions do not sum to 1 for each year and state; {sums}" # assert that the sums are close to 1 + + # Output Proxy Parquet Files + drilled_well_df = drilled_well_df.astype({'year':str}) + drilled_well_df.to_parquet(drilled_well_output_path) + + return None diff --git a/gch4i/proxy_processing/task_ng_hf_well_comp_proxy.py b/gch4i/proxy_processing/task_ng_hf_well_comp_proxy.py new file mode 100644 index 0000000..7580554 --- /dev/null +++ b/gch4i/proxy_processing/task_ng_hf_well_comp_proxy.py @@ -0,0 +1,179 @@ +# %% +from pathlib import Path +import os +from typing import Annotated +from zipfile import ZipFile +import calendar +import datetime + +from pyarrow import parquet +import pandas as pd +import osgeo +import geopandas as gpd +import numpy as np +import seaborn as sns +import shapefile as shp +from pytask import Product, task, mark + +from gch4i.config import ( + V3_DATA_PATH, + proxy_data_dir_path, + global_data_dir_path, + sector_data_dir_path, + max_year, + min_year, + years, +) + +from gch4i.utils import us_state_to_abbrev +from gch4i.proxy_processing.ng_oil_production_utils import ( + calc_enverus_rel_emi, + enverus_df_to_gdf, + nei_data_years, + get_nei_file_name, + ng_comp_count_file_names, + get_raw_NEI_data, +) + +# %% +@mark.persist +@task(id="ng_hf_well_comp_proxy") +def task_get_ng_hf_well_comp_proxy_data( + state_path: Path = global_data_dir_path / "tl_2020_us_state.zip", + enverus_production_path: Path = sector_data_dir_path / "enverus/production", + intermediate_outputs_path: Path = enverus_production_path / "intermediate_outputs", + nei_path: Path = sector_data_dir_path / "nei_og", + hf_well_comp_output_path: Annotated[Path, Product] = proxy_data_dir_path / "ng_hf_well_comp_proxy.parquet", + ): + """ + Data come from Enverus, both Drilling Info and Prism + The reason 2 datasets are used is because Prism does not include all states + So remaining states, or those with more DI coverage are taken from DI + + DI: KS, MD, MI, MO, OK, TN + + Prism: AK, AL, AR, AZ, CA, CAO (California Offshore), CO, FL, KY, LA, MS, MT, ND, + NE, NGOM (federal offshore waters in the Gulf of Mexico), NM, NV, NY, OH, OR, PA, + SD, TX, UT, VA, WV, WY + + States with no Enverus Data: CT, DE, DC, GA, HI, ID, IL*, IN*, IA, ME, MA, MN, NH, + NJ, NC, RI, SC, VT, WA, WI, US territories. These states are assumed to have no oil + and gas production with an exception for IL and IN. + + *IL and IN do not report to Enverus, but do have oil and gas production. Production + data is taken from the Energy Information Administration (EIA). 
+ + """ + + # Load in State ANSI data + state_gdf = ( + gpd.read_file(state_path) + .loc[:, ["NAME", "STATEFP", "STUSPS", "geometry"]] + .rename(columns=str.lower) + .rename(columns={"stusps": "state_code", "name": "state_name"}) + .astype({"statefp": int}) + # get only lower 48 + DC + .query("(statefp < 60) & (statefp != 2) & (statefp != 15)") + .reset_index(drop=True) + .to_crs(4326) + ) + + # Make Annual gridded arrays (maps) of well data (a well will be counted every month if there is any production that year) + # Includes NA Gas Wells and Production onshore in the CONUS region + # source emissions are related to the presence of a well and its production status (no emission if no production) + # Details: ERG does not include a well in the national count if there is no (cummulative) oil or gas production from that well. + # Wells are not considered active for a given year if there is no production data that year + # This may cause wells that are coadmpleted but not yet producing to be dropped from the national count. + # ERG has developed their own logic to determine if a well is an HF well or not and that result is included in the + # HF variable in this dataset. This method does not rely on the Enverus well 'Producing Status' + # Well Type (e.g., non-associated gas well) is determined based on annual production GOR at that well (CUM OIL/ CUM GAS), + # but the presence of a well will only be included in maps in months where monthly gas prod > 0 + + # Proxy Data Dataframes: + hf_well_comp_df = pd.DataFrame() # HF well completions + + ## Enverus DI and Prism Data: + # Read in and query formatted and corrrected Enverus data to create dictionaries of + # proxy data (Enverus data is from task_enverus_di_prism_data_processing.py) + for iyear in years: + enverus_file_name_iyear = f"formatted_raw_enverus_tempoutput_{iyear}.csv" + enverus_file_path_iyear = os.path.join(intermediate_outputs_path, enverus_file_name_iyear) + ng_data_temp = (pd.read_csv(enverus_file_path_iyear, dtype={3:'str', 'spud_year': str, 'first_prod_year': str, 'comp_year_month': str}) + .query("STATE_CODE.isin(@state_gdf['state_code'])") + .query("OFFSHORE == 'N'") + .query("CUM_GAS > 0") + .assign(gas_to_oil_ratio=lambda df: df['CUM_GAS']/df['CUM_OIL']) + .assign(year=str(iyear)) + .replace(np.inf, 0) + .astype({"spud_year": str, "first_prod_year": str, "comp_year_month": str}) + .query("gas_to_oil_ratio > 100 | GOR_QUAL == 'Gas only'") + ) + + # Include wells in map only for months where there is gas production (emissions ~ when production is occuring) + for imonth in range(1,13): + imonth_str = f"{imonth:02}" # convert to 2-digit months + year_month_str = str(iyear)+'-'+imonth_str + gas_prod_str = 'GASPROD_'+imonth_str + # Onshore data for imonth + ng_data_imonth_temp = (ng_data_temp + .query(f"{gas_prod_str} > 0") + .assign(year_month=str(iyear)+'-'+imonth_str) + ) + ng_data_imonth_temp = (ng_data_imonth_temp[[ + 'year', 'year_month','STATE_CODE','AAPG_CODE_ERG','LATITUDE','LONGITUDE', + 'HF','WELL_COUNT',gas_prod_str, + 'comp_year_month','spud_year','first_prod_year']] + ) + # HF Well Completions + hf_well_comp_imonth = (ng_data_imonth_temp[['year','year_month','STATE_CODE','LATITUDE','LONGITUDE','WELL_COUNT','HF','comp_year_month']] + .query("HF == 'Y'") + .drop(columns=["HF"]) + .rename(columns=lambda x: str(x).lower()) + .rename(columns={"well_count":"proxy_data"}) + .query(f"comp_year_month == '{year_month_str}'") + .drop(columns=["comp_year_month"]) + .reset_index(drop=True) + ) + hf_well_comp_df = 
pd.concat([hf_well_comp_df,hf_well_comp_imonth]) + + # Delete unused temp data + del ng_data_temp + del ng_data_imonth_temp + del hf_well_comp_imonth + + # Calculate relative emissions and convert to a geodataframe + hf_well_comp_df = calc_enverus_rel_emi(hf_well_comp_df ) + hf_well_comp_df = enverus_df_to_gdf(hf_well_comp_df ) + + # NEI Data: + nei_df = pd.DataFrame() + + for iyear in years: + nei_data_year = nei_data_years[nei_data_years['year'] == iyear]['nei_data'].values[0] + # Well Count + ifile_name = get_nei_file_name(nei_data_year, ng_comp_count_file_names) + nei_iyear = get_raw_NEI_data(iyear, nei_data_year, ifile_name) + nei_df = pd.concat([nei_df, nei_iyear]) + + # Convert NEI Data to GDF and polygon to centroid point + nei_df = gpd.GeoDataFrame(nei_df, crs=4326) + nei_df = nei_df.to_crs(3857) # projected CRS for centroid calculation + nei_df.loc[:, 'geometry'] = nei_df.loc[:, 'geometry'].centroid + nei_df = nei_df.to_crs(4326) + + # Add NEI Data to Enverus Data + hf_well_comp_df = pd.concat([hf_well_comp_df, nei_df]).reset_index(drop=True) + + # Delete unused temp data + del nei_iyear + del nei_df + + # Check that relative emissions sum to 1.0 each state/year combination + sums = hf_well_comp_df.groupby(["state_code", "year"])["rel_emi"].sum() # get sums to check normalization + assert np.isclose(sums, 1.0, atol=1e-8).all(), f"Relative emissions do not sum to 1 for each year and state; {sums}" # assert that the sums are close to 1 + + # Output Proxy Parquet Files + hf_well_comp_df = hf_well_comp_df.astype({'year':str}) + hf_well_comp_df.to_parquet(hf_well_comp_output_path) + + return None diff --git a/gch4i/proxy_processing/task_ng_hf_well_count_proxy.py b/gch4i/proxy_processing/task_ng_hf_well_count_proxy.py new file mode 100644 index 0000000..a29d6b8 --- /dev/null +++ b/gch4i/proxy_processing/task_ng_hf_well_count_proxy.py @@ -0,0 +1,177 @@ +# %% +from pathlib import Path +import os +from typing import Annotated +from zipfile import ZipFile +import calendar +import datetime + +from pyarrow import parquet +import pandas as pd +import osgeo +import geopandas as gpd +import numpy as np +import seaborn as sns +import shapefile as shp +from pytask import Product, task, mark + +from gch4i.config import ( + V3_DATA_PATH, + proxy_data_dir_path, + global_data_dir_path, + sector_data_dir_path, + max_year, + min_year, + years, +) + +from gch4i.utils import us_state_to_abbrev +from gch4i.proxy_processing.ng_oil_production_utils import ( + calc_enverus_rel_emi, + enverus_df_to_gdf, + nei_data_years, + get_nei_file_name, + ng_well_count_file_names, + get_raw_NEI_data, +) + +# %% +@mark.persist +@task(id="ng_hf_well_count_proxy") +def task_get_ng_hf_well_count_proxy_data( + state_path: Path = global_data_dir_path / "tl_2020_us_state.zip", + enverus_production_path: Path = sector_data_dir_path / "enverus/production", + intermediate_outputs_path: Path = enverus_production_path / "intermediate_outputs", + nei_path: Path = sector_data_dir_path / "nei_og", + hf_well_count_output_path: Annotated[Path, Product] = proxy_data_dir_path / "ng_hf_well_count_proxy.parquet", + ): + """ + Data come from Enverus, both Drilling Info and Prism + The reason 2 datasets are used is because Prism does not include all states + So remaining states, or those with more DI coverage are taken from DI + + DI: KS, MD, MI, MO, OK, TN + + Prism: AK, AL, AR, AZ, CA, CAO (California Offshore), CO, FL, KY, LA, MS, MT, ND, + NE, NGOM (federal offshore waters in the Gulf of Mexico), NM, NV, NY, OH, OR, PA, + SD, TX, UT, VA, 
WV, WY + + States with no Enverus Data: CT, DE, DC, GA, HI, ID, IL*, IN*, IA, ME, MA, MN, NH, + NJ, NC, RI, SC, VT, WA, WI, US territories. These states are assumed to have no oil + and gas production with an exception for IL and IN. + + *IL and IN do not report to Enverus, but do have oil and gas production. Production + data is taken from the Energy Information Administration (EIA). + + """ + + # Load in State ANSI data + state_gdf = ( + gpd.read_file(state_path) + .loc[:, ["NAME", "STATEFP", "STUSPS", "geometry"]] + .rename(columns=str.lower) + .rename(columns={"stusps": "state_code", "name": "state_name"}) + .astype({"statefp": int}) + # get only lower 48 + DC + .query("(statefp < 60) & (statefp != 2) & (statefp != 15)") + .reset_index(drop=True) + .to_crs(4326) + ) + + # Make Annual gridded arrays (maps) of well data (a well will be counted every month if there is any production that year) + # Includes NA Gas Wells and Production onshore in the CONUS region + # source emissions are related to the presence of a well and its production status (no emission if no production) + # Details: ERG does not include a well in the national count if there is no (cummulative) oil or gas production from that well. + # Wells are not considered active for a given year if there is no production data that year + # This may cause wells that are coadmpleted but not yet producing to be dropped from the national count. + # ERG has developed their own logic to determine if a well is an HF well or not and that result is included in the + # HF variable in this dataset. This method does not rely on the Enverus well 'Producing Status' + # Well Type (e.g., non-associated gas well) is determined based on annual production GOR at that well (CUM OIL/ CUM GAS), + # but the presence of a well will only be included in maps in months where monthly gas prod > 0 + + # Proxy Data Dataframes: + hf_well_count_df = pd.DataFrame() # Active HF gas well counts in a given month + + # Read in and query formatted and corrrected Enverus data to create dictionaries of + # proxy data (Enverus data is from task_enverus_di_prism_data_processing.py) + for iyear in years: + enverus_file_name_iyear = f"formatted_raw_enverus_tempoutput_{iyear}.csv" + enverus_file_path_iyear = os.path.join(intermediate_outputs_path, enverus_file_name_iyear) + ng_data_temp = (pd.read_csv(enverus_file_path_iyear, dtype={3:'str', 'spud_year': str, 'first_prod_year': str}) + .query("STATE_CODE.isin(@state_gdf['state_code'])") + .query("OFFSHORE == 'N'") + .query("CUM_GAS > 0") + .assign(gas_to_oil_ratio=lambda df: df['CUM_GAS']/df['CUM_OIL']) + .assign(year=str(iyear)) + .replace(np.inf, 0) + .astype({"spud_year": str, "first_prod_year": str}) + .query("gas_to_oil_ratio > 100 | GOR_QUAL == 'Gas only'") + ) + + # Include wells in map only for months where there is gas production (emissions ~ when production is occuring) + for imonth in range(1,13): + imonth_str = f"{imonth:02}" # convert to 2-digit months + year_month_str = str(iyear)+'-'+imonth_str + gas_prod_str = 'GASPROD_'+imonth_str + # Onshore data for imonth + ng_data_imonth_temp = (ng_data_temp + .query(f"{gas_prod_str} > 0") + .assign(year_month=str(iyear)+'-'+imonth_str) + ) + ng_data_imonth_temp = (ng_data_imonth_temp[[ + 'year', 'year_month','STATE_CODE','AAPG_CODE_ERG','LATITUDE','LONGITUDE', + 'HF','WELL_COUNT',gas_prod_str, + 'comp_year_month','spud_year','first_prod_year']] + ) + # HF Gas Well Count + hf_well_count_imonth = 
(ng_data_imonth_temp[['year','year_month','STATE_CODE','LATITUDE','LONGITUDE','WELL_COUNT','HF']] + .query("HF == 'Y'") + .drop(columns=["HF"]) + .rename(columns=lambda x: str(x).lower()) + .rename(columns={"well_count":"proxy_data"}) + .reset_index(drop=True) + ) + hf_well_count_df = pd.concat([hf_well_count_df,hf_well_count_imonth]) + + # Delete unused temp data + del ng_data_temp + del ng_data_imonth_temp + del hf_well_count_imonth + + # Calculate relative emissions and convert to a geodataframe + hf_well_count_df = calc_enverus_rel_emi(hf_well_count_df) + hf_well_count_df = enverus_df_to_gdf(hf_well_count_df) + + # NEI Data: + nei_df = pd.DataFrame() + + for iyear in years: + nei_data_year = nei_data_years[nei_data_years['year'] == iyear]['nei_data'].values[0] + # Well Count + ifile_name = get_nei_file_name(nei_data_year, ng_well_count_file_names) + nei_iyear = get_raw_NEI_data(iyear, nei_data_year, ifile_name) + nei_df = pd.concat([nei_df, nei_iyear]) + + # Convert NEI Data to GDF and polygon to centroid point + nei_df = gpd.GeoDataFrame(nei_df, crs=4326) + nei_df = nei_df.to_crs(3857) # projected CRS for centroid calculation + nei_df.loc[:, 'geometry'] = nei_df.loc[:, 'geometry'].centroid + nei_df = nei_df.to_crs(4326) + + # Add NEI Data to Enverus Data + hf_well_count_df = pd.concat([hf_well_count_df, nei_df]).reset_index(drop=True) + + # Delete unused temp data + del nei_iyear + del nei_df + + # Check that relative emissions sum to 1.0 each state/year combination + sums = hf_well_count_df.groupby(["state_code", "year"])["rel_emi"].sum() # get sums to check normalization + assert np.isclose(sums, 1.0, atol=1e-8).all(), f"Relative emissions do not sum to 1 for each year and state; {sums}" # assert that the sums are close to 1 + + # Output Proxy Parquet Files + hf_well_count_df = hf_well_count_df.astype({'year':str}) + hf_well_count_df.to_parquet(hf_well_count_output_path) + + return None + diff --git a/gch4i/proxy_processing/federal_gom_offshore_proxy.py b/gch4i/proxy_processing/task_ng_oil_federal_gom_offshore_proxy.py similarity index 99% rename from gch4i/proxy_processing/federal_gom_offshore_proxy.py rename to gch4i/proxy_processing/task_ng_oil_federal_gom_offshore_proxy.py index 31468c8..0a34c88 100644 --- a/gch4i/proxy_processing/federal_gom_offshore_proxy.py +++ b/gch4i/proxy_processing/task_ng_oil_federal_gom_offshore_proxy.py @@ -29,8 +29,8 @@ # %% @mark.persist -@task(id="federal_gom_offshore_proxy") -def task_get_federal_gom_offshore_proxy_data( +@task(id="ng_oil_federal_gom_offshore_proxy") +def task_get_ng_oil_federal_gom_offshore_proxy_data( state_path: Path = global_data_dir_path / "tl_2020_us_state.zip", GOADS_11_path: Path = sector_data_dir_path / "boem" / "2011_Gulfwide_Platform_Inventory.accdb", GOADS_14_path: Path = sector_data_dir_path / "boem" / "2014_Gulfwide_Platform_Inventory.accdb", diff --git a/gch4i/proxy_processing/task_ng_oil_state_gom_offshore_proxy.py b/gch4i/proxy_processing/task_ng_oil_state_gom_offshore_proxy.py new file mode 100644 index 0000000..ddf0923 --- /dev/null +++ b/gch4i/proxy_processing/task_ng_oil_state_gom_offshore_proxy.py @@ -0,0 +1,278 @@ +# %% +from pathlib import Path +import os +from typing import Annotated + +from pyarrow import parquet +import pandas as pd +import geopandas as gpd +import numpy as np +from pytask import Product, task, mark + +from gch4i.config import ( + global_data_dir_path, + sector_data_dir_path, + proxy_data_dir_path, + min_year, + max_year, + years, +) +from gch4i.proxy_processing.ng_oil_production_utils 
import (
+    calc_enverus_rel_emi,
+    enverus_df_to_gdf,
+)
+
+# %%
+@mark.persist
+@task(id="ng_oil_state_gom_offshore_proxy")
+def task_get_ng_oil_state_gom_offshore_proxy_data(
+    state_path: Path = global_data_dir_path / "tl_2020_us_state.zip",
+    enverus_production_path: Path = sector_data_dir_path / "enverus/production",
+    enverus_well_counts_path: Path = sector_data_dir_path / "enverus/production/temp_data_v2/Enverus DrillingInfo Processing - Well Counts_2021-03-17.xlsx",
+    intermediate_outputs_path: Path = enverus_production_path / "intermediate_outputs",
+    oil_state_gom_offshore_output_path: Annotated[Path, Product] = proxy_data_dir_path / "oil_state_gom_offshore_well_count_proxy.parquet",
+    oil_pac_fed_state_output_path: Annotated[Path, Product] = proxy_data_dir_path / "oil_pac_fed_state_proxy.parquet",
+    ng_state_gom_offshore_output_path: Annotated[Path, Product] = proxy_data_dir_path / "ng_state_gom_offshore_well_count_proxy.parquet",
+    ):
+    """
+    Data come from Enverus Prism only. Drilling Info data are not used for the
+    offshore well data because DI was only used for KS, MD, MI, MO, OK, and TN,
+    none of which are in the offshore region of the U.S.
+
+    States to produce offshore data: AL, CA, CAO (California Offshore), FL, LA, MS, TX.
+    Note that there is no Enverus Prism data for FL and MS.
+
+    """
+
+    # Well and Production Data (from Enverus)
+    # Read in and format Prism data
+    # 1. Read data
+    # 2. Drop unused columns, rename columns
+    # 3. Calculate cumulative annual production totals
+    # 4. Save the data as a year-specific variable
+
+    # Based on ERG's logic, active wells are determined by their production levels, not their producing status
+    Enverus_data_dict = {}
+
+    for iyear in years:
+        # Prism Data
+        Prism_file_name = f"prism_monthly_wells_offshore_{iyear}.xlsx"
+        Prism_file_path = os.path.join(enverus_production_path, Prism_file_name)
+        Prism_data = (pd.read_excel(
+            Prism_file_path,
+            usecols={'STATE', 'LATITUDE', 'LONGITUDE', 'OFFSHORE', 'GOR_QUAL',
+                     'LIQUIDSPROD_BBL_01', 'GASPROD_MCF_01', 'WATERPROD_BBL_01',
+                     'LIQUIDSPROD_BBL_02', 'GASPROD_MCF_02', 'WATERPROD_BBL_02',
+                     'LIQUIDSPROD_BBL_03', 'GASPROD_MCF_03', 'WATERPROD_BBL_03',
+                     'LIQUIDSPROD_BBL_04', 'GASPROD_MCF_04', 'WATERPROD_BBL_04',
+                     'LIQUIDSPROD_BBL_05', 'GASPROD_MCF_05', 'WATERPROD_BBL_05',
+                     'LIQUIDSPROD_BBL_06', 'GASPROD_MCF_06', 'WATERPROD_BBL_06',
+                     'LIQUIDSPROD_BBL_07', 'GASPROD_MCF_07', 'WATERPROD_BBL_07',
+                     'LIQUIDSPROD_BBL_08', 'GASPROD_MCF_08', 'WATERPROD_BBL_08',
+                     'LIQUIDSPROD_BBL_09', 'GASPROD_MCF_09', 'WATERPROD_BBL_09',
+                     'LIQUIDSPROD_BBL_10', 'GASPROD_MCF_10', 'WATERPROD_BBL_10',
+                     'LIQUIDSPROD_BBL_11', 'GASPROD_MCF_11', 'WATERPROD_BBL_11',
+                     'LIQUIDSPROD_BBL_12', 'GASPROD_MCF_12', 'WATERPROD_BBL_12',
+                     })
+            .rename(columns={'STATE': 'STATE_CODE',
+                             'LIQUIDSPROD_BBL_01': 'OILPROD_01', 'GASPROD_MCF_01': 'GASPROD_01', 'WATERPROD_BBL_01': 'WATERPROD_01',
+                             'LIQUIDSPROD_BBL_02': 'OILPROD_02', 'GASPROD_MCF_02': 'GASPROD_02', 'WATERPROD_BBL_02': 'WATERPROD_02',
+                             'LIQUIDSPROD_BBL_03': 'OILPROD_03', 'GASPROD_MCF_03': 'GASPROD_03', 'WATERPROD_BBL_03': 'WATERPROD_03',
+                             'LIQUIDSPROD_BBL_04': 'OILPROD_04', 'GASPROD_MCF_04': 'GASPROD_04', 'WATERPROD_BBL_04': 'WATERPROD_04',
+                             'LIQUIDSPROD_BBL_05': 'OILPROD_05', 'GASPROD_MCF_05': 'GASPROD_05', 'WATERPROD_BBL_05': 'WATERPROD_05',
+                             'LIQUIDSPROD_BBL_06': 'OILPROD_06', 'GASPROD_MCF_06': 'GASPROD_06', 'WATERPROD_BBL_06': 'WATERPROD_06',
+                             'LIQUIDSPROD_BBL_07': 'OILPROD_07', 'GASPROD_MCF_07': 'GASPROD_07', 'WATERPROD_BBL_07': 'WATERPROD_07',
+                             'LIQUIDSPROD_BBL_08': 'OILPROD_08', 'GASPROD_MCF_08': 'GASPROD_08', 'WATERPROD_BBL_08': 'WATERPROD_08',
+                             'LIQUIDSPROD_BBL_09': 'OILPROD_09', 'GASPROD_MCF_09': 'GASPROD_09', 'WATERPROD_BBL_09': 'WATERPROD_09',
+                             'LIQUIDSPROD_BBL_10': 'OILPROD_10', 'GASPROD_MCF_10': 'GASPROD_10', 'WATERPROD_BBL_10': 'WATERPROD_10',
+                             'LIQUIDSPROD_BBL_11': 'OILPROD_11', 'GASPROD_MCF_11': 'GASPROD_11', 'WATERPROD_BBL_11': 'WATERPROD_11',
+                             'LIQUIDSPROD_BBL_12': 'OILPROD_12', 'GASPROD_MCF_12': 'GASPROD_12', 'WATERPROD_BBL_12': 'WATERPROD_12',
+                             })
+            .assign(WELL_COUNT=1)
+            .query("OFFSHORE == 'Y'")
+            )
+
+        # Replace NaNs with zeros before summing annual production
+        Prism_data.loc[:, Prism_data.columns.str.contains('GASPROD_')] = Prism_data.loc[:, Prism_data.columns.str.contains('GASPROD_')].fillna(0)
+        Prism_data.loc[:, Prism_data.columns.str.contains('OILPROD_')] = Prism_data.loc[:, Prism_data.columns.str.contains('OILPROD_')].fillna(0)
+        Prism_data.loc[:, Prism_data.columns.str.contains('WATERPROD_')] = Prism_data.loc[:, Prism_data.columns.str.contains('WATERPROD_')].fillna(0)
+
+        # Calculate cumulative annual production totals for gas, oil, and water
+        Prism_data['CUM_GAS'] = Prism_data.loc[:, Prism_data.columns.str.contains('GASPROD_')].sum(axis=1)
+        Prism_data['CUM_OIL'] = Prism_data.loc[:, Prism_data.columns.str.contains('OILPROD_')].sum(axis=1)
+        Prism_data['CUM_WATER'] = Prism_data.loc[:, Prism_data.columns.str.contains('WATERPROD_')].sum(axis=1)
+
+        # Save out the data for that year
+        Enverus_data_dict[f'{iyear}'] = Prism_data
+
+        del Prism_data
+
+    # Correct Enverus Data for Select States
+
+    # 1) Read in the coverage table from the ERG state well counts file
+    # (specifies the first year with bad data and which years need to be corrected;
+    # all years including and after the first bad year of data need to be corrected)
+
+    ERG_StateWellCounts_LastGoodDataYear = (pd.read_excel(
+        enverus_well_counts_path,
+        sheet_name="2021 - Coverage",
+        usecols={"State", "Last Good Year"},
+        skiprows=2,
+        nrows=40)
+        )
+
+    # 2) Loop through each state and year in the Enverus data to determine whether the data
+    # for that particular year need to be corrected. At the moment, the only correction ERG
+    # makes is to reuse the prior year of data when no new Enverus data were reported for
+    # that state. If a particular state is not included in the Enverus dataset for any year,
+    # then a row of zeros is added to the Enverus table for that year.
+
+    offshore_states = ['AL', 'CAO', 'FL', 'LA', 'MS', 'TX']
+
+    for istate_code in offshore_states:
+        correctdata = 0
+        lastgoodyear = ERG_StateWellCounts_LastGoodDataYear['Last Good Year'][ERG_StateWellCounts_LastGoodDataYear['State'] == istate_code].values
+        if lastgoodyear == max_year:
+            lastgoodyear = max_year + 5  # if the state isn't in the correction list, don't correct any data
+
+        for iyear in years:
+            enverus_data_temp = Enverus_data_dict[f'{iyear}'].copy()
+            state_list = np.unique(enverus_data_temp['STATE_CODE'])
+            inlist = 1 if istate_code in state_list else 0
+            if inlist == 1 or correctdata == 1:  # the state is in the Enverus data, or had at least one good year of data
+                # (in the first year, correctdata is zero, and inlist is also zero if there is no Enverus data)
+                # check whether corrections are necessary for the given year/state
+                if iyear == lastgoodyear:
+                    print(istate_code, iyear, 'last good year')
+                    # This is the last year of good data. Do not correct the data, but save it
+                    # so that it can be used for all following years for that state
+                    temp_data = enverus_data_temp[enverus_data_temp['STATE_CODE'] == istate_code]
+                    correctdata = 1
+                elif iyear > lastgoodyear:
+                    print(istate_code, iyear)
+                    # correct data for all years after the last good year (remove old data first if necessary)
+                    if inlist == 1:
+                        enverus_data_temp = enverus_data_temp[enverus_data_temp['STATE_CODE'] != istate_code]
+                    enverus_data_temp = pd.concat([enverus_data_temp, temp_data], ignore_index=True)
+                    print(istate_code + ' data for ' + str(iyear) + ' were corrected with ' + str(lastgoodyear) + ' data')
+                else:
+                    pass  # iyear is before the last good year; no correction needed
+
+            if inlist == 0 and correctdata == 0:
+                # there is no Enverus data for this state and no good prior year to carry forward; flag it
+                print(istate_code + ' has no Enverus data in the year ' + str(iyear))
+
+            # save that year of Enverus data
+            enverus_data_temp.reset_index(drop=True, inplace=True)
+            Enverus_data_dict[f'{iyear}'] = enverus_data_temp.copy()
+            tempoutput_filename = f'formatted_raw_enverus_offshore_tempoutput_{iyear}.csv'
+            tempoutput_filepath = os.path.join(intermediate_outputs_path, tempoutput_filename)
+            enverus_data_temp.to_csv(tempoutput_filepath, index=False)
+
+    # create proxy dataframes
+    ng_state_gom_offshore_df = pd.DataFrame()
+    oil_state_gom_offshore_df = pd.DataFrame()
+    oil_pac_fed_state_df = pd.DataFrame()
+
+    # ng proxy
+    for iyear in years:
+        ng_data_temp = (Enverus_data_dict[f'{iyear}']
+                        .query("STATE_CODE.isin(@offshore_states)")
+                        .query("CUM_GAS > 0")
+                        .assign(gas_to_oil_ratio=lambda df: df['CUM_GAS'] / df['CUM_OIL'])
+                        .assign(year=str(iyear))
+                        .replace(np.inf, 0)
+                        .query("gas_to_oil_ratio > 100 | GOR_QUAL == 'Gas only'")
+                        )
+        # Include wells in the map only for months with gas production (emissions ~ when production is occurring)
+        for imonth in range(1, 13):
+            imonth_str = f"{imonth:02}"  # convert to 2-digit months
+            year_month_str = str(iyear) + '-' + imonth_str
+            gas_prod_str = 'GASPROD_' + imonth_str
+            # Offshore data for imonth
+            ng_data_imonth_temp = (ng_data_temp
+                                   .query(f"{gas_prod_str} > 0")
+                                   .assign(year_month=year_month_str)
+                                   )
+            ng_data_imonth_temp = (ng_data_imonth_temp[[
+                'year', 'year_month', 'STATE_CODE', 'LATITUDE', 'LONGITUDE',
+                'WELL_COUNT', gas_prod_str]]
+                )
+            # State GOM offshore gas well count
+            ng_state_gom_offshore_imonth = (ng_data_imonth_temp[['year', 'year_month', 'STATE_CODE', 'LATITUDE', 'LONGITUDE', 'WELL_COUNT']]
+                                            .rename(columns=lambda x: str(x).lower())
+                                            .rename(columns={"well_count": "proxy_data"})
+                                            .reset_index(drop=True)
+                                            )
+            ng_state_gom_offshore_df = pd.concat([ng_state_gom_offshore_df, ng_state_gom_offshore_imonth])
+
+    # oil proxies
+    for iyear in years:
+        oil_data_temp = (Enverus_data_dict[f'{iyear}']
+                         .query("STATE_CODE.isin(@offshore_states)")
+                         .query("CUM_OIL > 0")
+                         .assign(gas_to_oil_ratio=lambda df: df['CUM_GAS'] / df['CUM_OIL'])
+                         .assign(year=str(iyear))
+                         .replace(np.inf, 0)
+                         .query("gas_to_oil_ratio <= 100")
+                         .query("GOR_QUAL == 'Liq only' | GOR_QUAL == 'Liq+Gas'")
+                         )
+        # Include wells in the map only for months with oil production (emissions ~ when production is occurring)
+        for imonth in range(1, 13):
+            imonth_str = f"{imonth:02}"  # convert to 2-digit months
+            year_month_str = str(iyear) + '-' + imonth_str
+            oil_prod_str = 'OILPROD_' + imonth_str
+            # Offshore data for imonth
+            oil_data_imonth_temp = (oil_data_temp
+                                    .query(f"{oil_prod_str} > 0")
+                                    .assign(year_month=year_month_str)
+                                    )
oil_data_imonth_temp = (oil_data_imonth_temp[[ + 'year', 'year_month', 'STATE_CODE', 'LATITUDE', 'LONGITUDE', + 'WELL_COUNT', oil_prod_str,]] + ) + # State GOM Offshore Oil Well Count + oil_state_gom_offshore_imonth = (oil_data_imonth_temp[['year', 'year_month', 'STATE_CODE', 'LATITUDE', 'LONGITUDE', 'WELL_COUNT']] + .rename(columns=lambda x: str(x).lower()) + .rename(columns={"well_count": "proxy_data"}) + .query("state_code != 'CAO'") + .reset_index(drop=True) + ) + oil_state_gom_offshore_df = pd.concat([oil_state_gom_offshore_df, oil_state_gom_offshore_imonth]) + # Pacific Federal State Offshore Oil Well Count + oil_pac_fed_state_imonth = (oil_data_imonth_temp[['year', 'year_month', 'STATE_CODE', 'LATITUDE', 'LONGITUDE', 'WELL_COUNT']] + .rename(columns=lambda x: str(x).lower()) + .rename(columns={"well_count": "proxy_data"}) + .query("state_code == 'CAO'") + .reset_index(drop=True) + ) + oil_pac_fed_state_df = pd.concat([oil_pac_fed_state_df, oil_pac_fed_state_imonth]) + + # Calculate relative emissions and convert to a geodataframe + ng_state_gom_offshore_df = calc_enverus_rel_emi(ng_state_gom_offshore_df) + ng_state_gom_offshore_df = enverus_df_to_gdf(ng_state_gom_offshore_df) + oil_state_gom_offshore_df = calc_enverus_rel_emi(oil_state_gom_offshore_df) + oil_state_gom_offshore_df = enverus_df_to_gdf(oil_state_gom_offshore_df) + oil_pac_fed_state_df = calc_enverus_rel_emi(oil_pac_fed_state_df) + oil_pac_fed_state_df = enverus_df_to_gdf(oil_pac_fed_state_df) + + # Check that relative emissions sum to 1.0 each state/year combination + sums = ng_state_gom_offshore_df.groupby(["state_code", "year"])["rel_emi"].sum() # get sums to check normalization + assert np.isclose(sums, 1.0, atol=1e-8).all(), f"Relative emissions do not sum to 1 for each year and state; {sums}" # assert that the sums are close to 1 + sums = oil_state_gom_offshore_df.groupby(["state_code", "year"])["rel_emi"].sum() # get sums to check normalization + assert np.isclose(sums, 1.0, atol=1e-8).all(), f"Relative emissions do not sum to 1 for each year and state; {sums}" # assert that the sums are close to 1 + sums = oil_pac_fed_state_df.groupby(["state_code", "year"])["rel_emi"].sum() # get sums to check normalization + assert np.isclose(sums, 1.0, atol=1e-8).all(), f"Relative emissions do not sum to 1 for each year and state; {sums}" # assert that the sums are close to 1 + + # Output Proxy Parquet Files + ng_state_gom_offshore_df = ng_state_gom_offshore_df.astype({'year':str}) + ng_state_gom_offshore_df.to_parquet(ng_state_gom_offshore_output_path) + oil_state_gom_offshore_df = oil_state_gom_offshore_df.astype({'year':str}) + oil_state_gom_offshore_df.to_parquet(oil_state_gom_offshore_output_path) + oil_pac_fed_state_df = oil_pac_fed_state_df.astype({'year':str}) + oil_pac_fed_state_df.to_parquet(oil_pac_fed_state_output_path) + + return None diff --git a/gch4i/proxy_processing/task_ng_production_proxy.py b/gch4i/proxy_processing/task_ng_production_proxy.py deleted file mode 100644 index 09fff7d..0000000 --- a/gch4i/proxy_processing/task_ng_production_proxy.py +++ /dev/null @@ -1,1007 +0,0 @@ -# %% -from pathlib import Path -import os -from typing import Annotated -from zipfile import ZipFile -import calendar -import datetime - -from pyarrow import parquet -import pandas as pd -import osgeo -import geopandas as gpd -import numpy as np -import seaborn as sns -import shapefile as shp -from pytask import Product, task, mark - -from gch4i.config import ( - V3_DATA_PATH, - proxy_data_dir_path, - global_data_dir_path, - 
sector_data_dir_path, - max_year, - min_year, - years, -) - -from gch4i.utils import us_state_to_abbrev - -# %% -@mark.persist -@task(id="ng_production_proxy") -def task_get_ng_production_proxy_data( - state_path: Path = global_data_dir_path / "tl_2020_us_state.zip", - nems_region_dict_path: Path = sector_data_dir_path / "enverus/NEMS_Region_Dictionary.xlsx", - enverus_production_path: Path = sector_data_dir_path / "enverus/production", - enverus_well_counts_path: Path = sector_data_dir_path / "enverus/production/temp_data_v2/Enverus DrillingInfo Processing - Well Counts_2021-03-17.xlsx", - nei_path: Path = sector_data_dir_path / "nei_og", - all_well_count_output_path: Annotated[Path, Product] = proxy_data_dir_path / "ng_all_well_count_proxy.parquet", - conv_well_count_output_path: Annotated[Path, Product] = proxy_data_dir_path / "ng_conv_well_count_proxy.parquet", - hf_well_count_output_path: Annotated[Path, Product] = proxy_data_dir_path / "ng_hf_well_count_proxy.parquet", - all_well_prod_output_path: Annotated[Path, Product] = proxy_data_dir_path / "ng_all_well_prod_proxy.parquet", - basin_220_prod_output_path: Annotated[Path, Product] = proxy_data_dir_path / "ng_basin_220_prod_proxy.parquet", - basin_395_prod_output_path: Annotated[Path, Product] = proxy_data_dir_path / "ng_basin_395_prod_proxy.parquet", - basin_430_prod_output_path: Annotated[Path, Product] = proxy_data_dir_path / "ng_basin_430_prod_proxy.parquet", - basin_other_prod_output_path: Annotated[Path, Product] = proxy_data_dir_path / "ng_basin_other_prod_proxy.parquet", - water_prod_output_path: Annotated[Path, Product] = proxy_data_dir_path / "ng_water_prod_proxy.parquet", - conv_well_comp_output_path: Annotated[Path, Product] = proxy_data_dir_path / "ng_conv_well_comp_proxy.parquet", - hf_well_comp_output_path: Annotated[Path, Product] = proxy_data_dir_path / "ng_hf_well_comp_proxy.parquet", - drilled_well_output_path: Annotated[Path, Product] = proxy_data_dir_path / "ng_drilled_well_proxy.parquet", - state_gom_offshore_well_count_output_path: Annotated[Path, Product] = proxy_data_dir_path / "ng_state_gom_offshore_well_count_proxy.parquet", - state_gom_offshore_well_prod_output_path: Annotated[Path, Product] = proxy_data_dir_path / "ng_state_gom_offshore_well_prod_proxy.parquet", - ): - """ - Data come from Enverus, both Drilling Info and Prism - The reason 2 datasets are used is because Prism does not include all states - So remaining states, or those with more DI coverage are taken from DI - - DI: KS, MD, MI, MO, OK, TN - - Prism: AK, AL, AR, AZ, CA, CAO (California Offshore), CO, FL, KY, LA, MS, MT, ND, - NE, NGOM (federal offshore waters in the Gulf of Mexico), NM, NV, NY, OH, OR, PA, - SD, TX, UT, VA, WV, WY - - States with no Enverus Data: CT, DE, DC, GA, HI, ID, IL*, IN*, IA, ME, MA, MN, NH, - NJ, NC, RI, SC, VT, WA, WI, US territories. These states are assumed to have no oil - and gas production with an exception for IL and IN. - - *IL and IN do not report to Enverus, but do have oil and gas production. Production - data is taken from the Energy Information Administration (EIA). 
- - TODO: Update enverus_well_counts_path with v3 data (currently using v2 data) - """ - - # Functions: - # Define safe devide to set result to zero if denominator is zero - def safe_div(x,y): - if y == 0: - return 0 - return x / y - - # STEP 1: Load in State ANSI data and NEMS definitions - - state_gdf = ( - gpd.read_file(state_path) - .loc[:, ["NAME", "STATEFP", "STUSPS", "geometry"]] - .rename(columns=str.lower) - .rename(columns={"stusps": "state_code", "name": "state_name"}) - .astype({"statefp": int}) - # get only lower 48 + DC - .query("(statefp < 60) & (statefp != 2) & (statefp != 15)") - .reset_index(drop=True) - .to_crs(4326) - ) - - # Make NEMS State classifications - # Treat NM and TX separately since these states cover multiple NEMS regions - - # 0 = NE, 1 = MC, 2 = RM, 3 = SW, 4 = WC, 5 = GC, 6 = offshore - NEMS_State = pd.read_excel(nems_region_dict_path) - NEMS_State = NEMS_State.fillna(0) - NM_idx = NEMS_State.index[NEMS_State['State_Name'].str.contains('New Mexico')].tolist() - TX_idx = NEMS_State.index[NEMS_State['State_Name'].str.contains('Texas')].tolist() - idx = NM_idx+TX_idx - NEMS_State= NEMS_State.drop(NEMS_State.index[idx]) - NEMS_State.reset_index(drop=True,inplace=True) - - NEMS_dict = {'North East':0, 'Midcontinent':1,'Rocky Mountain':2,'South West':3,'West Coast':4,'Gulf Coast':5} - - # STEP 2: Read-in and Format Proxy Data - - # STEP 2.1: State Condensate Data - - # TODO: state condensate data code - - # STEP 2.2: GOADS Emissions Data - - # TODO: GOADS emissions data code - - # STEP 2.3: Well and Production Data (from Enverus) - - # STEP 2.3.1: Read In & Combine Each Year of Prism & DI Monthly Data (from Enverus) - - # Data come from Enverus, both Drilling Info and Prism - # The reason 2 datasets are used is because Prism does not include all states - # So remaining states, or those with more DI coverage are taken from DI - - # Read In and Format the Prism and DI data - # 1. Read Data - # 2. Drop unsed columns, rename columns to match between DI and Prism - # 3. Combine DI and Prism into one data array - # 4. Calculate annual cummulate production totals - # 5. 
Save the data as a year-specific variable - - # Based on ERGs logic, active wells are determined based on their production levels and not producing status - Enverus_data_dict = {} - DI_data_dict = {} - Prism_data_dict = {} - for iyear in years: - #DI data - DI_file_name = f"didsk_monthly_{iyear}.csv" - DI_file_path = os.path.join(enverus_production_path, DI_file_name) - DI_data = (pd.read_csv( - DI_file_path, - usecols=['WELL_COUNT_ID','STATE','COUNTY','BASIN','AAPG_CODE_ERG', - 'NEMS_REGION_ERG','LATITUDE','LONGITUDE','STATUS','COMPDATE', - 'SPUDDATE','FIRSTPRODDATE','HF','OFFSHORE','GOR', - 'GOR_QUAL','PROD_FLAG','PRODYEAR', - 'LIQ_01','GAS_01','WTR_01','LIQ_02','GAS_02','WTR_02', - 'LIQ_03','GAS_03','WTR_03','LIQ_04','GAS_04','WTR_04', - 'LIQ_05','GAS_05','WTR_05','LIQ_06','GAS_06','WTR_06', - 'LIQ_07','GAS_07','WTR_07','LIQ_08','GAS_08','WTR_08', - 'LIQ_09','GAS_09','WTR_09','LIQ_10','GAS_10','WTR_10', - 'LIQ_11','GAS_11','WTR_11','LIQ_12','GAS_12','WTR_12',], - dtype={7:'str'}) - .rename(columns={'WELL_COUNT_ID':'WELL_COUNT','STATE':'STATE_CODE', - 'NEMS_REGION_ERG':'NEMS_REGION', 'STATUS':'PRODUCING_STATUS', - 'LIQ_01':'OILPROD_01','GAS_01':'GASPROD_01','WTR_01':'WATERPROD_01', - 'LIQ_02':'OILPROD_02','GAS_02':'GASPROD_02','WTR_02':'WATERPROD_02', - 'LIQ_03':'OILPROD_03','GAS_03':'GASPROD_03','WTR_03':'WATERPROD_03', - 'LIQ_04':'OILPROD_04','GAS_04':'GASPROD_04','WTR_04':'WATERPROD_04', - 'LIQ_05':'OILPROD_05','GAS_05':'GASPROD_05','WTR_05':'WATERPROD_05', - 'LIQ_06':'OILPROD_06','GAS_06':'GASPROD_06','WTR_06':'WATERPROD_06', - 'LIQ_07':'OILPROD_07','GAS_07':'GASPROD_07','WTR_07':'WATERPROD_07', - 'LIQ_08':'OILPROD_08','GAS_08':'GASPROD_08','WTR_08':'WATERPROD_08', - 'LIQ_09':'OILPROD_09','GAS_09':'GASPROD_09','WTR_09':'WATERPROD_09', - 'LIQ_10':'OILPROD_10','GAS_10':'GASPROD_10','WTR_10':'WATERPROD_10', - 'LIQ_11':'OILPROD_11','GAS_11':'GASPROD_11','WTR_11':'WATERPROD_11', - 'LIQ_12':'OILPROD_12','GAS_12':'GASPROD_12','WTR_12':'WATERPROD_12',}) - .assign(WELL_COUNT=1) # TODO: Check to see if this should actually be set to 1 - ) - # Format completion date (YYYY-MM) - for iwell in range(0,len(DI_data)): - comp_date = str(DI_data.loc[iwell, 'COMPDATE']) - if comp_date == 'NaN': - comp_year_month = 'NaN' - elif comp_date == 'nan': - comp_year_month = 'NaN' - else: # date format M/DD/YYYY - comp_month = f"{int(comp_date.split('/')[0]):02}" - comp_year = f"{int(comp_date.split('/')[2])}" - comp_year_month = str(comp_year)+'-'+str(comp_month) - DI_data.loc[iwell, 'comp_year_month'] = comp_year_month - # Format spud date (YYYY) - for iwell in range(0,len(DI_data)): - spud_date = str(DI_data.loc[iwell, 'SPUDDATE']) - if spud_date == 'NaN': - spud_year = 'NaN' - elif spud_date == 'nan': - spud_year = 'NaN' - else: # date format M/DD/YYYY - spud_year = f"{int(spud_date.split('/')[2])}" - spud_year = str(spud_year) - DI_data.loc[iwell, 'spud_year'] = spud_year - # Format first production date (YYYY) - for iwell in range(0,len(DI_data)): - first_prod_date = str(DI_data.loc[iwell, 'FIRSTPRODDATE']) - if first_prod_date == 'NaN': - first_prod_year = 'NaN' - elif first_prod_date == 'nan': - first_prod_year = 'NaN' - else: # date format M/DD/YYYY - first_prod_year = f"{int(first_prod_date.split('/')[2])}" - first_prod_year = str(first_prod_year) - DI_data.loc[iwell, 'first_prod_year'] = first_prod_year - DI_data_dict[f'{iyear}'] = DI_data - - # Prism Data - Prism_file_name = f"prism_monthly_{iyear}.csv" - Prism_file_path = os.path.join(enverus_production_path, Prism_file_name) - Prism_data = 
(pd.read_csv( - Prism_file_path, - usecols=['STATE','COUNTY','ENVBASIN','AAPG_CODE_ERG', - 'NEMS_REGION_ERG','LATITUDE','LONGITUDE','ENVWELLSTATUS','COMPLETIONDATE', - 'SPUDDATE','FIRSTPRODDATE','HF','OFFSHORE','GOR', - 'GOR_QUAL','PROD_FLAG','PRODYEAR', - 'LIQUIDSPROD_BBL_01','GASPROD_MCF_01','WATERPROD_BBL_01', - 'LIQUIDSPROD_BBL_02','GASPROD_MCF_02','WATERPROD_BBL_02', - 'LIQUIDSPROD_BBL_03','GASPROD_MCF_03','WATERPROD_BBL_03', - 'LIQUIDSPROD_BBL_04','GASPROD_MCF_04','WATERPROD_BBL_04', - 'LIQUIDSPROD_BBL_05','GASPROD_MCF_05','WATERPROD_BBL_05', - 'LIQUIDSPROD_BBL_06','GASPROD_MCF_06','WATERPROD_BBL_06', - 'LIQUIDSPROD_BBL_07','GASPROD_MCF_07','WATERPROD_BBL_07', - 'LIQUIDSPROD_BBL_08','GASPROD_MCF_08','WATERPROD_BBL_08', - 'LIQUIDSPROD_BBL_09','GASPROD_MCF_09','WATERPROD_BBL_09', - 'LIQUIDSPROD_BBL_10','GASPROD_MCF_10','WATERPROD_BBL_10', - 'LIQUIDSPROD_BBL_11','GASPROD_MCF_11','WATERPROD_BBL_11', - 'LIQUIDSPROD_BBL_12','GASPROD_MCF_12','WATERPROD_BBL_12',], - dtype={7:'str'}) - .rename(columns={'STATE':'STATE_CODE', 'ENVBASIN':'BASIN', - 'NEMS_REGION_ERG':'NEMS_REGION', 'ENVWELLSTATUS':'PRODUCING_STATUS', - 'COMPLETIONDATE':'COMPDATE', - 'LIQUIDSPROD_BBL_01':'OILPROD_01','GASPROD_MCF_01':'GASPROD_01','WATERPROD_BBL_01':'WATERPROD_01', - 'LIQUIDSPROD_BBL_02':'OILPROD_02','GASPROD_MCF_02':'GASPROD_02','WATERPROD_BBL_02':'WATERPROD_02', - 'LIQUIDSPROD_BBL_03':'OILPROD_03','GASPROD_MCF_03':'GASPROD_03','WATERPROD_BBL_03':'WATERPROD_03', - 'LIQUIDSPROD_BBL_04':'OILPROD_04','GASPROD_MCF_04':'GASPROD_04','WATERPROD_BBL_04':'WATERPROD_04', - 'LIQUIDSPROD_BBL_05':'OILPROD_05','GASPROD_MCF_05':'GASPROD_05','WATERPROD_BBL_05':'WATERPROD_05', - 'LIQUIDSPROD_BBL_06':'OILPROD_06','GASPROD_MCF_06':'GASPROD_06','WATERPROD_BBL_06':'WATERPROD_06', - 'LIQUIDSPROD_BBL_07':'OILPROD_07','GASPROD_MCF_07':'GASPROD_07','WATERPROD_BBL_07':'WATERPROD_07', - 'LIQUIDSPROD_BBL_08':'OILPROD_08','GASPROD_MCF_08':'GASPROD_08','WATERPROD_BBL_08':'WATERPROD_08', - 'LIQUIDSPROD_BBL_09':'OILPROD_09','GASPROD_MCF_09':'GASPROD_09','WATERPROD_BBL_09':'WATERPROD_09', - 'LIQUIDSPROD_BBL_10':'OILPROD_10','GASPROD_MCF_10':'GASPROD_10','WATERPROD_BBL_10':'WATERPROD_10', - 'LIQUIDSPROD_BBL_11':'OILPROD_11','GASPROD_MCF_11':'GASPROD_11','WATERPROD_BBL_11':'WATERPROD_11', - 'LIQUIDSPROD_BBL_12':'OILPROD_12','GASPROD_MCF_12':'GASPROD_12','WATERPROD_BBL_12':'WATERPROD_12',}) - .assign(WELL_COUNT=1) - ) - # Format completion date (YYYY-MM) - for iwell in range(0,len(Prism_data)): - comp_date = str(Prism_data.loc[iwell, 'COMPDATE']) - if comp_date == 'NaN': - comp_year_month = 'NaN' - elif comp_date == 'nan': - comp_year_month = 'NaN' - else: # date format YYYY-MM-DD - comp_month = f"{int(comp_date.split('-')[1]):02}" - comp_year = f"{int(comp_date.split('-')[0])}" - comp_year_month = str(comp_year)+'-'+str(comp_month) - Prism_data.loc[iwell, 'comp_year_month'] = comp_year_month - # Format spud date (YYYY) - for iwell in range(0,len(Prism_data)): - spud_date = str(Prism_data.loc[iwell, 'SPUDDATE']) - if spud_date == 'NaN': - spud_year = 'NaN' - elif spud_date == 'nan': - spud_year = 'NaN' - else: # date format YYYY-MM-DD - spud_year = f"{int(spud_date.split('-')[0])}" - spud_year = str(spud_year) - Prism_data.loc[iwell, 'spud_year'] = spud_year - # Format first production date (YYYY) - for iwell in range(0,len(Prism_data)): - first_prod_date = str(Prism_data.loc[iwell, 'FIRSTPRODDATE']) - if first_prod_date == 'NaN': - first_prod_year = 'NaN' - elif first_prod_date == 'nan': - first_prod_year = 'NaN' - else: # date format YYYY-MM-DD - 
first_prod_year = f"{int(first_prod_date.split('-')[0])}" - Prism_data.loc[iwell, 'first_prod_year'] = first_prod_year - Prism_data_dict[f'{iyear}'] = Prism_data - - # Combine into one array with common column names, replace NaNs with zeros, and sum annual production - Enverus_data = pd.concat([DI_data, Prism_data], ignore_index=True) - Enverus_data.loc[:, Enverus_data.columns.str.contains('GASPROD_')] = Enverus_data.loc[:, Enverus_data.columns.str.contains('GASPROD_')].fillna(0) - Enverus_data.loc[:, Enverus_data.columns.str.contains('OILPROD_')] = Enverus_data.loc[:, Enverus_data.columns.str.contains('OILPROD_')].fillna(0) - Enverus_data.loc[:, Enverus_data.columns.str.contains('WATERPROD_')] = Enverus_data.loc[:, Enverus_data.columns.str.contains('WATERPROD_')].fillna(0) - - # Calculate cumulative annual production totals for Gas, Oil, Water - Enverus_data['CUM_GAS'] = Enverus_data.loc[:, Enverus_data.columns.str.contains('GASPROD_')].sum(1) - Enverus_data['CUM_OIL'] = Enverus_data.loc[:, Enverus_data.columns.str.contains('OILPROD_')].sum(1) - Enverus_data['CUM_WATER'] = Enverus_data.loc[:, Enverus_data.columns.str.contains('WATERPROD_')].sum(1) - - Enverus_data['NEMS_CODE'] = Enverus_data['NEMS_REGION'].map(NEMS_dict) - - # Save out the data for that year - Enverus_data_dict[f'{iyear}'] = Enverus_data - - del Prism_data - del DI_data # save memory space - - # Define default values for a new row in this table (to be used later during data corrections) - default = {'WELL_COUNT': 0, 'STATE_CODE':'', 'COUNTY':'', 'NEMS_REGION':'UNK', - 'AAPG_CODE_ERG':'UNK', 'LATITUDE':0, 'LONGITUDE':0, - 'PRODUCING_STATUS':'', 'BASIN':'', 'SPUDDATE':'', 'COMPDATE':'', - 'FIRSTPRODDATE':'', 'HF':'', 'OFFSHORE':'', 'GOR':-99, - 'GOR_QUAL':'', 'PROD_FLAG':'', 'PRODYEAR':'', - 'OILPROD_01':0, 'GASPROD_01':0, 'WATERPROD_01':0, 'OILPROD_02':0, 'GASPROD_02':0, 'WATERPROD_02':0, - 'OILPROD_03':0, 'GASPROD_03':0, 'WATERPROD_03':0, 'OILPROD_04':0, 'GASPROD_04':0, 'WATERPROD_04':0, - 'OILPROD_05':0, 'GASPROD_05':0, 'WATERPROD_05':0, 'OILPROD_06':0, 'GASPROD_06':0, 'WATERPROD_06':0, - 'OILPROD_07':0, 'GASPROD_07':0, 'WATERPROD_07':0, 'OILPROD_08':0, 'GASPROD_08':0, 'WATERPROD_08':0, - 'OILPROD_09':0, 'GASPROD_09':0, 'WATERPROD_09':0, 'OILPROD_10':0, 'GASPROD_10':0, 'WATERPROD_10':0, - 'OILPROD_11':0, 'GASPROD_11':0, 'WATERPROD_11':0, 'OILPROD_12':0, 'GASPROD_12':0, 'WATERPROD_12':0, - 'CUM_GAS':0, 'CUM_OIL':0, 'CUM_WATER':0, 'NEMS_CODE':99} - - # Correct the NEMS code for missing NEMS_REGIONs - # Note: OFFSHORE regions will have NaN as NEMS_CODE - for iyear in years: - enverus_data_temp = Enverus_data_dict[f'{iyear}'] - list_well = enverus_data_temp.index[pd.isna(enverus_data_temp.loc[:,'NEMS_REGION'])].tolist() - if np.size(list_well) > 0: - for irow in list_well: - match_state = np.where(NEMS_State['State_Code']==enverus_data_temp['STATE_CODE'][irow])[0][0] - enverus_data_temp.loc[irow,'NEMS_CODE'] = NEMS_State['NEMS'][match_state].astype(int) - Enverus_data_dict[f'{iyear}'] = enverus_data_temp.copy() - - # STEP 2.3.2: Correct Enverus Data for Select States - - # 1) Read in the coverage table from the State Well Counts file from ERG - # (specifies the first year with bad data and which years need to be corrected; - # all years including and after the first bad year of data need to be corrected) - - ERG_StateWellCounts_LastGoodDataYear = (pd.read_excel( - enverus_well_counts_path, - sheet_name="2021 - Coverage", - usecols={"State", "Last Good Year"}, - skiprows=2, - nrows=40) - )
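A compact illustration of the correction rule described in step 2 below, using made-up years (the real loop operates on the Enverus data dictionaries): once a state's data goes bad, every later year reuses the last good year's rows.

# If a state's last good data year is 2019, years after 2019 fall back to 2019.
last_good_year = 2019
for yr in [2018, 2019, 2020, 2021]:
    source_year = yr if yr <= last_good_year else last_good_year
    # 2018 -> 2018, 2019 -> 2019, 2020 -> 2019, 2021 -> 2019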
through each state and year in Enverus to determine whether the data for that particular year needs to - # be corrected. At the moment, the only correction ERG makes is to reuse the prior year of data when there - # is no new Enverus data reported for that state. If a particular state is not included for any year in the Enverus - # dataset, then a row of zeros is added to the Enverus table for that year. - - for istate in np.arange(0, len(state_gdf)): - correctdata = 0 - istate_code = state_gdf['state_code'][istate] - lastgoodyear = ERG_StateWellCounts_LastGoodDataYear['Last Good Year'][ERG_StateWellCounts_LastGoodDataYear['State'] == istate_code].values - if lastgoodyear == max_year: - lastgoodyear = max_year + 5 # if the state isn't included in the correction list, don't correct any data - - for iyear in years: - enverus_data_temp = Enverus_data_dict[f'{iyear}'].copy() - state_list = np.unique(enverus_data_temp['STATE_CODE']) - if istate_code in state_list: - inlist = 1 - else: - inlist = 0 - if inlist == 1 or correctdata == 1: # if the state is included in Enverus data, or had data for at least one good year - # in the first year, correctdata will be zero, but inlist will also be zero if there is no Enverus data - # check whether corrections are necessary for the given year/state - if iyear == lastgoodyear: - print(istate_code, iyear, 'last good year') - # This is the last year of good data. Do not correct the data, but save it - # so that it can be used for all following years for that state - temp_data = enverus_data_temp[enverus_data_temp['STATE_CODE'] == istate_code] - correctdata = 1 - elif iyear > lastgoodyear: - print(istate_code, iyear) - # correct data for all years equal to and after the first bad year (remove old data first if necessary) - if inlist == 1: - enverus_data_temp = enverus_data_temp[enverus_data_temp['STATE_CODE'] != istate_code] - enverus_data_temp = pd.concat([enverus_data_temp, temp_data], ignore_index=True) - print(istate_code + ' data for ' + str(iyear) + ' were corrected with ' + str(lastgoodyear) + ' data') - else: - # year_range[iyear] < firstbadyear: - # no data corrections if the current year is before the first bad year - # print('no corrections') - # print(state_str, year_range[iyear]) - no_corrections = 1 - - if inlist == 0 and correctdata == 0: - # if there is no Enverus data for a given state, and there was no good data, add a row with default values - # temp_row = {'STATE': istate_code} - # enverus_data_temp = enverus_data_temp.append({**default, **temp_row}, ignore_index=True) - print(istate_code + ' has no Enverus data in the year ' + str(iyear)) - - # resave that year of Enverus data - enverus_data_temp.reset_index(drop=True, inplace=True) - Enverus_data_dict[f'{iyear}'] = enverus_data_temp.copy() - - # STEP 2.4: Calculate Fractional Monthly Condensate Arrays - # (EIA condensate production (bbl) relative to producing Enverus gas wells by month - # in each state and region) - - # TODO: fractional monthly condensate array code - - # STEP 2.5: Convert Enverus Well Production Arrays and Condensate Array into Gridded - # Location Arrays - - # clear variables - # del ERG_StateWellCounts_FirstBadDataYear - # del Prism_data - # del colnames - # del names - # del state_condensates - # del temp_data - - # Make annual gridded arrays (maps) of well data (a well will be counted every month if there is any production that year) - # Includes NA gas wells and production onshore in the CONUS region - # Source emissions are related to the presence of a well and its production status (no emission if no production) - # Details: ERG does not include a well in the national count if there is no (cumulative) oil or gas production from that well. - # Wells are not considered active for a given year if there is no production data that year. - # This may cause wells that are completed but not yet producing to be dropped from the national count. - # ERG has developed their own logic to determine if a well is an HF well or not, and that result is included in the - # HF variable in this dataset. This method does not rely on the Enverus well 'Producing Status'. - # Well type (e.g., non-associated gas well) is determined based on the annual production GOR at that well (CUM_GAS / CUM_OIL), - # but the presence of a well will only be included in maps in months where monthly gas prod > 0
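As a concrete illustration of the GOR-based well typing noted above, a toy sketch (the 100 threshold and the 'Gas only' GOR_QUAL flag follow the queries below; the data values are made up):

import numpy as np
import pandas as pd

wells = pd.DataFrame({
    "CUM_GAS": [5000.0, 50.0, 800.0],   # mcf
    "CUM_OIL": [10.0, 10.0, 0.0],       # bbl
    "GOR_QUAL": ["", "Gas only", ""],
})
# GOR = CUM_GAS / CUM_OIL; a zero-oil well divides to inf, which is zeroed out
wells["gas_to_oil_ratio"] = (wells["CUM_GAS"] / wells["CUM_OIL"]).replace(np.inf, 0)
# Gas wells: GOR above 100, or flagged by Enverus as gas-only
gas_wells = wells.query("gas_to_oil_ratio > 100 | GOR_QUAL == 'Gas only'")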
- - # Proxy Data Dataframes: - # Well Counts - all_well_count_df = pd.DataFrame() # Active gas well (conventional + HF) counts in a given month - conv_well_count_df = pd.DataFrame() # Active conventional gas well counts in a given month - hf_well_count_df = pd.DataFrame() # Active HF gas well counts in a given month - # Well-Level Production Volumes - all_well_prod_df = pd.DataFrame() # Active gas well (conventional + HF) gas production in a given month - basin_220_prod_df = pd.DataFrame() # Gas well gas production in Basin 220 in a given month - basin_395_prod_df = pd.DataFrame() # Gas well gas production in Basin 395 in a given month - basin_430_prod_df = pd.DataFrame() # Gas well gas production in Basin 430 in a given month - basin_other_prod_df = pd.DataFrame() # Gas well gas production in Other Basins in a given month - # Water Production Volumes - water_prod_df = pd.DataFrame() - # Well Completions - conv_well_comp_df = pd.DataFrame() # Conventional gas well completions - hf_well_comp_df = pd.DataFrame() # HF gas well completions - # Drilled Gas Wells - drilled_well_df = pd.DataFrame() # Gas wells drilled - # Offshore Well Counts and Production Volumes in State Waters in the Gulf of Mexico - state_gom_offshore_well_count_df = pd.DataFrame() # Offshore state GOM gas well counts - state_gom_offshore_well_prod_df = pd.DataFrame() # Offshore state GOM gas production - - - # Query Enverus data to create dictionaries of proxy data - for iyear in years: - enverus_data_temp = Enverus_data_dict[f'{iyear}'].copy() - - # Onshore Natural Gas Wells - ng_data_temp = (enverus_data_temp - .query("STATE_CODE.isin(@state_gdf['state_code'])") - .query("OFFSHORE == 'N'") - .query("CUM_GAS > 0") - .assign(gas_to_oil_ratio=lambda df: df['CUM_GAS']/df['CUM_OIL']) - .assign(year=str(iyear)) - .replace(np.inf, 0) - .query("gas_to_oil_ratio > 100 | GOR_QUAL == 'Gas only'") - ) - # Offshore Natural Gas Wells - ng_offshore_data_temp = (enverus_data_temp - .query("STATE_CODE.isin(@state_gdf['state_code'])") - .query("OFFSHORE == 'Y'") - .query("CUM_GAS > 0") - .assign(gas_to_oil_ratio=lambda df: df['CUM_GAS']/df['CUM_OIL']) - .assign(year=str(iyear)) - .replace(np.inf, 0) - .query("gas_to_oil_ratio > 100 | GOR_QUAL == 'Gas only'") - )
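The monthly loop below keeps a well on the map only in months with positive gas production. A toy demonstration of that inclusion rule (made-up values):

import pandas as pd

toy = pd.DataFrame({"GASPROD_01": [10.0, 0.0], "GASPROD_02": [0.0, 3.0]})
jan_wells = toy.query("GASPROD_01 > 0")  # only the first well appears in January
feb_wells = toy.query("GASPROD_02 > 0")  # only the second well appears in February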
- # Include wells in the map only for months where there is gas production (emissions ~ when production is occurring) - for imonth in range(1, 13): - imonth_str = f"{imonth:02}" # convert to 2-digit months - year_month_str = str(iyear)+'-'+imonth_str - gas_prod_str = 'GASPROD_'+imonth_str - water_prod_str = 'WATERPROD_'+imonth_str - # onshore data for imonth - ng_data_imonth_temp = (ng_data_temp - .query(f"{gas_prod_str} > 0") - .assign(year_month=str(iyear)+'-'+imonth_str) - ) - ng_data_imonth_temp = (ng_data_imonth_temp[[ - 'year', 'year_month','STATE_CODE','AAPG_CODE_ERG','LATITUDE','LONGITUDE', - 'HF','WELL_COUNT',gas_prod_str,water_prod_str, - 'comp_year_month','spud_year','first_prod_year']] - ) - # offshore data for imonth - ng_offshore_data_imonth_temp = (ng_offshore_data_temp - .query(f"{gas_prod_str} > 0") - .assign(year_month=str(iyear)+'-'+imonth_str) - ) - # keep the offshore subset in its own variable; do not overwrite the onshore ng_data_imonth_temp - ng_offshore_data_imonth_temp = (ng_offshore_data_imonth_temp[[ - 'year','year_month','STATE_CODE','AAPG_CODE_ERG','LATITUDE','LONGITUDE', - 'HF','WELL_COUNT',gas_prod_str,water_prod_str, - 'comp_year_month','spud_year','first_prod_year']] - ) - # Well Counts - # All Gas Well Count - all_well_count_imonth = (ng_data_imonth_temp[['year','year_month','STATE_CODE','LATITUDE','LONGITUDE','WELL_COUNT']] - .rename(columns=lambda x: str(x).lower()) - .rename(columns={"well_count":"proxy_data"}) - .reset_index(drop=True) - ) - all_well_count_df = pd.concat([all_well_count_df, all_well_count_imonth]) - # Conventional Gas Well Count - conv_well_count_imonth = (ng_data_imonth_temp[['year','year_month','STATE_CODE','LATITUDE','LONGITUDE','WELL_COUNT','HF']] - .query("HF != 'Y'") - .drop(columns=["HF"]) - .rename(columns=lambda x: str(x).lower()) - .rename(columns={"well_count":"proxy_data"}) - .reset_index(drop=True) - ) - conv_well_count_df = pd.concat([conv_well_count_df, conv_well_count_imonth]) - # HF Gas Well Count - hf_well_count_imonth = (ng_data_imonth_temp[['year','year_month','STATE_CODE','LATITUDE','LONGITUDE','WELL_COUNT','HF']] - .query("HF == 'Y'") - .drop(columns=["HF"]) - .rename(columns=lambda x: str(x).lower()) - .rename(columns={"well_count":"proxy_data"}) - .reset_index(drop=True) - ) - hf_well_count_df = pd.concat([hf_well_count_df, hf_well_count_imonth]) - - # Gas Production - # All Gas Well Gas Production - all_well_prod_imonth = (ng_data_imonth_temp[['year','year_month','STATE_CODE','LATITUDE','LONGITUDE',gas_prod_str]] - .assign(proxy_data=lambda df: df[gas_prod_str]) - .drop(columns=[gas_prod_str]) - .rename(columns=lambda x: str(x).lower()) - .reset_index(drop=True) - ) - all_well_prod_df = pd.concat([all_well_prod_df, all_well_prod_imonth]) - # Basin 220 Gas Well Gas Production - basin_220_prod_imonth = (ng_data_imonth_temp[['year','year_month','STATE_CODE','LATITUDE','LONGITUDE','AAPG_CODE_ERG',gas_prod_str]] - .query("AAPG_CODE_ERG == '220'") - .assign(proxy_data=lambda df: df[gas_prod_str]) - .drop(columns=[gas_prod_str, 'AAPG_CODE_ERG']) - .rename(columns=lambda x: str(x).lower()) - .reset_index(drop=True) - ) - basin_220_prod_df = pd.concat([basin_220_prod_df, basin_220_prod_imonth]) - # Basin 395 Gas Well Gas Production - basin_395_prod_imonth = (ng_data_imonth_temp[['year','year_month','STATE_CODE','LATITUDE','LONGITUDE','AAPG_CODE_ERG',gas_prod_str]] - .query("AAPG_CODE_ERG == '395'") - .assign(proxy_data=lambda df: df[gas_prod_str]) - .drop(columns=[gas_prod_str, 'AAPG_CODE_ERG']) - .rename(columns=lambda x: str(x).lower()) - .reset_index(drop=True) - ) - basin_395_prod_df = pd.concat([basin_395_prod_df, basin_395_prod_imonth]) - # Basin 430 Gas Well Gas Production - basin_430_prod_imonth = (ng_data_imonth_temp[['year','year_month','STATE_CODE','LATITUDE','LONGITUDE','AAPG_CODE_ERG',gas_prod_str]] - .query("AAPG_CODE_ERG == '430'") - .assign(proxy_data=lambda df: df[gas_prod_str]) - .drop(columns=[gas_prod_str, 'AAPG_CODE_ERG']) - .rename(columns=lambda x: str(x).lower()) - .reset_index(drop=True) - ) - basin_430_prod_df = pd.concat([basin_430_prod_df, basin_430_prod_imonth])
- # Other Basins Gas Well Gas Production - basin_other_prod_imonth = (ng_data_imonth_temp[['year','year_month','STATE_CODE','LATITUDE','LONGITUDE','AAPG_CODE_ERG',gas_prod_str]] - .query("AAPG_CODE_ERG != '220' & AAPG_CODE_ERG != '395' & AAPG_CODE_ERG != '430'") - .assign(proxy_data=lambda df: df[gas_prod_str]) - .drop(columns=[gas_prod_str, 'AAPG_CODE_ERG']) - .rename(columns=lambda x: str(x).lower()) - .reset_index(drop=True) - ) - basin_other_prod_df = pd.concat([basin_other_prod_df, basin_other_prod_imonth]) - - # Water Production - # Data source by state is defined in the Enverus DrillingInfo Processing - Produced - # Water_2023-11-14_forGridding.xlsx file. - if iyear < 2016: # WV uses NEI data - water_prod_enverus_states = ['AK','AL','AR','AZ','CA','CO','FL','LA', - 'MI','MO','MS','MT','ND','NE','NM','NV', - 'NY','OH','SD','TX','UT','VA','WY' - ] - # States using NEI for reference: ['IL','IN','KS','OK','PA','WV'] - else: # 2016 and beyond; WV uses Enverus data - water_prod_enverus_states = ['AK','AL','AR','AZ','CA','CO','FL','LA', - 'MI','MO','MS','MT','ND','NE','NM','NV', - 'NY','OH','SD','TX','UT','VA','WY','WV' - ] # WV uses Enverus - # States using NEI for reference: ['IL','IN','KS','OK','PA'] - # Enverus water production for applicable states (NEI water production will - # be added in the NEI section of the code below) - water_prod_imonth = (ng_data_imonth_temp[['year','year_month','STATE_CODE','LATITUDE','LONGITUDE',water_prod_str]] - .query("STATE_CODE.isin(@water_prod_enverus_states)") - .assign(proxy_data=lambda df: df[water_prod_str]) - .drop(columns=[water_prod_str]) - .rename(columns=lambda x: str(x).lower()) - .reset_index(drop=True) - ) - water_prod_df = pd.concat([water_prod_df, water_prod_imonth]) - - # Well Completions - # Conventional Gas Well Completions - conv_well_comp_imonth = (ng_data_imonth_temp[['year','year_month','STATE_CODE','LATITUDE','LONGITUDE','WELL_COUNT','HF','comp_year_month']] - .query("HF != 'Y'") - .drop(columns=["HF"]) - .rename(columns=lambda x: str(x).lower()) - .rename(columns={"well_count":"proxy_data"}) - .query(f"comp_year_month == '{year_month_str}'") # note the quotes: comp_year_month is a 'YYYY-MM' string - .drop(columns=["comp_year_month"]) - .reset_index(drop=True) - ) - conv_well_comp_df = pd.concat([conv_well_comp_df, conv_well_comp_imonth]) - - # HF Gas Well Completions - hf_well_comp_imonth = (ng_data_imonth_temp[['year','year_month','STATE_CODE','LATITUDE','LONGITUDE','WELL_COUNT','HF','comp_year_month']] - .query("HF == 'Y'") - .drop(columns=["HF"]) - .rename(columns=lambda x: str(x).lower()) - .rename(columns={"well_count":"proxy_data"}) - .query(f"comp_year_month == '{year_month_str}'") - .drop(columns=["comp_year_month"]) - .reset_index(drop=True) - ) - hf_well_comp_df = pd.concat([hf_well_comp_df, hf_well_comp_imonth]) - - # Drilled Gas Wells - drilled_well_imonth = (ng_data_imonth_temp[['year','year_month','STATE_CODE','LATITUDE','LONGITUDE','WELL_COUNT','HF','spud_year','first_prod_year']] - .rename(columns=lambda x: str(x).lower()) - .rename(columns={"well_count":"proxy_data"}) - # wells with a spud date or first production date in the current year - .query(f"spud_year == '{iyear}' | first_prod_year == '{iyear}'") - # wells with a spud_year == iyear or, if there is no spud date, first_prod_year == iyear - .query(f"spud_year == '{iyear}' | spud_year == 'NaN'") - .drop(columns=['hf', 'spud_year', 'first_prod_year']) - .reset_index(drop=True) - ) - drilled_well_df = pd.concat([drilled_well_df, drilled_well_imonth]) - - # Offshore Well Counts and Production Volumes in State Waters in the Gulf of Mexico - state_gom_offshore_states = ['AL','FL','LA','MS','TX']
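# One subtlety in the completion filters above: the year-month key must be
# quoted inside query(), otherwise pandas evaluates a bare 2012-01 as integer
# subtraction (2011) and the string comparison never matches. A minimal,
# self-contained demonstration on toy data:
#
#   import pandas as pd
#   toy = pd.DataFrame({"comp_year_month": ["2012-01", "2012-02"]})
#   toy.query("comp_year_month == '2012-01'")   # correct: matches one row
#   # toy.query("comp_year_month == 2012-01")   # wrong: compares against 2011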
State GOM Gas Well Counts - state_gom_offshore_well_count_imonth = (ng_offshore_data_imonth_temp[['year','year_month','STATE_CODE','LATITUDE','LONGITUDE','WELL_COUNT']] - .query("STATE_CODE.isin(@state_gom_offshore_states)") # offshore wells in GOM state waters only - .rename(columns=lambda x: str(x).lower()) - .rename(columns={"well_count":"proxy_data"}) - .reset_index(drop=True) - ) - state_gom_offshore_well_count_df = pd.concat([state_gom_offshore_well_count_df, state_gom_offshore_well_count_imonth]) - # Offshore State GOM Gas Well Gas Production - state_gom_offshore_well_prod_imonth = (ng_offshore_data_imonth_temp[['year','year_month','STATE_CODE','LATITUDE','LONGITUDE',gas_prod_str]] - .query("STATE_CODE.isin(@state_gom_offshore_states)") - .assign(proxy_data=lambda df: df[gas_prod_str]) - .drop(columns=[gas_prod_str]) - .rename(columns=lambda x: str(x).lower()) - .reset_index(drop=True) - ) - state_gom_offshore_well_prod_df = pd.concat([state_gom_offshore_well_prod_df, state_gom_offshore_well_prod_imonth]) - - # Calculate Relative Emissions - def calc_enverus_rel_emi(df): - df['rel_emi'] = df.groupby(["state_code", "year"])['proxy_data'].transform(lambda x: x / x.sum() if x.sum() > 0 else 0) - df = df.drop(columns='proxy_data') - return df - - # Well Counts - all_well_count_df = calc_enverus_rel_emi(all_well_count_df) - conv_well_count_df = calc_enverus_rel_emi(conv_well_count_df) - hf_well_count_df = calc_enverus_rel_emi(hf_well_count_df) - # Well-Level Production Volumes - all_well_prod_df = calc_enverus_rel_emi(all_well_prod_df) - basin_220_prod_df = calc_enverus_rel_emi(basin_220_prod_df) - basin_395_prod_df = calc_enverus_rel_emi(basin_395_prod_df) - basin_430_prod_df = calc_enverus_rel_emi(basin_430_prod_df) - basin_other_prod_df = calc_enverus_rel_emi(basin_other_prod_df) - # Water Production Volumes - water_prod_df = calc_enverus_rel_emi(water_prod_df) - # Well Completions - conv_well_comp_df = calc_enverus_rel_emi(conv_well_comp_df) - hf_well_comp_df = calc_enverus_rel_emi(hf_well_comp_df) - # Drilled Gas Wells - drilled_well_df = calc_enverus_rel_emi(drilled_well_df) - # Offshore Well Counts and Production Volumes in State Waters in the Gulf of Mexico - state_gom_offshore_well_count_df = calc_enverus_rel_emi(state_gom_offshore_well_count_df) - state_gom_offshore_well_prod_df = calc_enverus_rel_emi(state_gom_offshore_well_prod_df) - - # Format Proxy Data into Geodataframes - def enverus_df_to_gdf(df): - gdf = ( - gpd.GeoDataFrame( - df, - geometry=gpd.points_from_xy( - df["longitude"], - df["latitude"], - crs=4326 - ) - ) - .drop(columns=["latitude", "longitude"]) - .loc[:, ["year", "year_month", "state_code", "rel_emi", "geometry"]] - ) - return gdf - - # Well Counts - all_well_count_gdf = enverus_df_to_gdf(all_well_count_df) - conv_well_count_gdf = enverus_df_to_gdf(conv_well_count_df) - hf_well_count_gdf = enverus_df_to_gdf(hf_well_count_df) - # Well-Level Production Volumes - all_well_prod_gdf = enverus_df_to_gdf(all_well_prod_df) - basin_220_prod_gdf = enverus_df_to_gdf(basin_220_prod_df) - basin_395_prod_gdf = enverus_df_to_gdf(basin_395_prod_df) - basin_430_prod_gdf = enverus_df_to_gdf(basin_430_prod_df) - basin_other_prod_gdf = enverus_df_to_gdf(basin_other_prod_df) - # Water Production Volumes - water_prod_gdf = enverus_df_to_gdf(water_prod_df) - # Well Completions - conv_well_comp_gdf = enverus_df_to_gdf(conv_well_comp_df) - hf_well_comp_gdf = enverus_df_to_gdf(hf_well_comp_df) - # Drilled Gas Wells - drilled_well_gdf = enverus_df_to_gdf(drilled_well_df) - # Offshore Well Counts and Production Volumes in State Waters in the Gulf of Mexico
state_gom_offshore_well_count_gdf = enverus_df_to_gdf(state_gom_offshore_well_count_df) - state_gom_offshore_well_prod_gdf = enverus_df_to_gdf(state_gom_offshore_well_prod_df) - - # STEP 2.4: Well and Production Data (from NEI) - - # NEI data is used for well counts, gas well completion counts, - # gas well drilled counts, and gas production volumes for IL and IN. - - # NEI data is used for water production volumes for IL, IN, KS, OK, and PA - # as well as WV for years less than 2016. - - # FIPS codes for relevant states (each code starts with 2 distinct characters): - # IL: 17; IN: 18; KS: 20; OK: 40; PA: 42; WV: 54 - - fips_codes_df = pd.DataFrame({'state_code': ['IL', 'IN', 'KS', 'OK', 'PA', 'WV'], - 'fips_code': ['17', '18', '20', '40', '42', '54']}) - - # Function to get NEI textfile and shapefile data - def get_NEI_data(ghgi_year, data_year, file_name): - if data_year <= 2017: - # NEI textfile data (data_year <= 2017) (2011, 2014, 2016, 2017) - nei_textfile_name = f"CONUS_SA_FILES_{data_year}/{file_name}" - nei_textfile_path = os.path.join(nei_path, nei_textfile_name) - data_temp = pd.read_csv(nei_textfile_path, sep='\t', skiprows = 25) - data_temp = data_temp.drop(["!"], axis=1) - data_temp.columns = ['Code','FIPS','COL','ROW','Frac','Abs','FIPS_Total','FIPS_Running_Sum'] - data_temp = data_temp.astype({"FIPS": str}) - # if water production data (gas: 6832, oil: 6833) - if file_name == 'USA_6832_NOFILL.txt' or file_name == 'USA_6833_NOFILL.txt': - if data_year < 2016: - data_temp = (data_temp - # query states: IL, IN, KS, OK, PA, WV - .query("FIPS.str.startswith('17') | FIPS.str.startswith('18') | FIPS.str.startswith('20') | FIPS.str.startswith('40') | FIPS.str.startswith('42') | FIPS.str.startswith('54')") - .reset_index(drop=True) - ) - colmax = data_temp['COL'].max() - colmin = data_temp['COL'].min() - rowmax = data_temp['ROW'].max() - rowmin = data_temp['ROW'].min() - else: - data_temp = (data_temp - # query states: IL, IN, KS, OK, PA - .query("FIPS.str.startswith('17') | FIPS.str.startswith('18') | FIPS.str.startswith('20') | FIPS.str.startswith('40') | FIPS.str.startswith('42')") - .reset_index(drop=True) - ) - colmax = data_temp['COL'].max() - colmin = data_temp['COL'].min() - rowmax = data_temp['ROW'].max() - rowmin = data_temp['ROW'].min() - # non-water production proxies (IL, IN) - else: - data_temp = (data_temp - # query states: IL, IN - .query("FIPS.str.startswith('17') | FIPS.str.startswith('18')") - .reset_index(drop=True) - ) - colmax = data_temp['COL'].max() - colmin = data_temp['COL'].min() - rowmax = data_temp['ROW'].max() - rowmin = data_temp['ROW'].min() - # NEI reference grid shapefile with lat/lon locations - nei_reference_grid_path = os.path.join(nei_path, "NEI_Reference_Grid_LCC_to_WGS84_latlon.shp") - nei_reference_grid = (gpd.read_file(nei_reference_grid_path) - .to_crs(4326)) - nei_reference_grid = (nei_reference_grid - .assign(cellid_column = nei_reference_grid.cellid.astype(str).str[0:4].astype(int)) - .assign(cellid_row = nei_reference_grid.cellid.astype(str).str[5:].astype(int)) - .query(f"cellid_column <= {colmax} & cellid_column >= {colmin}") - .query(f"cellid_row <= {rowmax} & cellid_row >= {rowmin}") - .reset_index(drop=True) - ) - # Match lat/lon locations from reference grid to nei data - for idx in np.arange(0,len(data_temp)): - # Add in lat/lon - icol = data_temp['COL'][idx] - irow = data_temp['ROW'][idx] - match = np.where((icol == nei_reference_grid.loc[:,'cellid_column']) & (irow == nei_reference_grid.loc[:,'cellid_row']))[0][0] - match = 
int(match) - # data_temp.loc[idx,'Lat'] = nei_reference_grid.loc[match, 'Latitude'] - # data_temp.loc[idx,'Lon'] = nei_reference_grid.loc[match, 'Longitude'] - data_temp.loc[idx,'geometry'] = nei_reference_grid.loc[match, 'geometry'] - # Add in state_code - ifips = data_temp.loc[idx,'FIPS'][0:2] - data_temp.loc[idx,'state_code'] = fips_codes_df.loc[np.where(ifips == fips_codes_df.loc[:, 'fips_code'])[0][0],'state_code'] - data_temp = data_temp[['state_code', 'Abs', 'geometry']] - data_temp = data_temp.rename(columns={'Abs':'activity_data'}) - - else: - # NEI shapefile data (data_year > 2017) (2018, 2019, 2021, 2022) - state_geometries = state_gdf[["state_code","geometry"]] - nei_file_name = f"CONUS_SA_FILES_{data_year}" - nei_file_path = os.path.join(nei_path, nei_file_name) - data_temp = gpd.read_file(nei_file_path, layer=file_name) - data_temp = data_temp.to_crs(4326) - data_temp = gpd.tools.sjoin(data_temp, state_gdf, how="left") - - # water production data (IL, IN, KS, OK, PA) - if file_name == 'PRODUCED_WATER_GAS' or file_name == '_6832' or file_name == 'ProducedWaterGasWells': - states_to_query = ['IL', 'IN', 'KS', 'OK', 'PA'] - # non-water production proxies (IL, IN) - else: - states_to_query = ['IL', 'IN'] - - # query relevant states - data_temp = data_temp.query('state_code.isin(@states_to_query)') - - # grab activity data depending on column name (changes by year) - if data_year == 2018 or data_year == 2019 or data_year == 2020: - data_temp = data_temp[['state_code', 'ACTIVITY', 'geometry']] - data_temp = data_temp.rename(columns={'ACTIVITY':'activity_data'}) - if data_year == 2021: - data_temp = data_temp[['state_code', 'GRID_AC', 'geometry']] - data_temp = data_temp.rename(columns={'GRID_AC':'activity_data'}) - if data_year == 2022: - data_temp = data_temp[['state_code', 'GRID_ACTIV', 'geometry']] - data_temp = data_temp.rename(columns={'GRID_ACTIV':'activity_data'}) - - # convert activity data to relative emissions (idata / sum(state data)) - data_temp['rel_emi'] = data_temp.groupby(["state_code"])['activity_data'].transform(lambda x: x / x.sum() if x.sum() > 0 else 0) - monthly_data_temp = data_temp.copy() - monthly_data_temp['rel_emi'] = monthly_data_temp['rel_emi'] * 1/12 - monthly_data_temp = monthly_data_temp.drop(columns='activity_data') - - # convert proxy data to monthly (assume 1/12 of annual proxy is assigned to each month) - nei_proxy_data = pd.DataFrame() - for imonth in range(1,13): - imonth_str = f"{imonth:02}" # convert to 2-digit months - data_temp_imonth = monthly_data_temp.copy() - data_temp_imonth = data_temp_imonth.assign(year_month=str(ghgi_year)+'-'+imonth_str) - nei_proxy_data = pd.concat([nei_proxy_data,data_temp_imonth]) - nei_proxy_data = nei_proxy_data.assign(year=ghgi_year) - nei_proxy_data = (nei_proxy_data[['year', 'year_month', 'state_code', 'rel_emi', 'geometry']] - .reset_index(drop=True) - ) - return nei_proxy_data - - # NEI data year assignments - # All years use the data affiliated with their year except the following exceptions: - # 2012: use 2011 data - # 2013: use 2014 data - # 2015: use 2014 data - # 2016: use 2017 data - nei_data_years = pd.DataFrame({'year': [2012, - 2013, - 2014, - 2015, - 2016, - 2017, - 2018, - 2019, - 2020, - 2021, - 2022], - 'nei_data': [2011, - 2014, - 2014, - 2014, - 2017, - 2017, - 2018, - 2019, - 2020, - 2021, - 2022]}) - - # NEI Data Dataframes: - # Well Counts - nei_all_well_count_df = pd.DataFrame() # Active gas well (conventional + HF) counts in a given month - nei_conv_well_count_df = pd.DataFrame() # 
Active conventional gas well counts in a given month - nei_hf_well_count_df = pd.DataFrame() # Active HF gas well counts in a given month - # Well-Level Production Volumes - nei_all_well_prod_df = pd.DataFrame() # Active gas well (conventional + HF) gas production in a given month - nei_basin_other_prod_df = pd.DataFrame() # Gas well gas production in Other Basins in a given month - # Water Production Volumes - nei_water_prod_df = pd.DataFrame() - # Well Completions - nei_conv_well_comp_df = pd.DataFrame() # Conventional gas well completions - nei_hf_well_comp_df = pd.DataFrame() # HF gas well completions - # Drilled Gas Wells - nei_drilled_well_df = pd.DataFrame() # Gas wells drilled - - # NEI text file and shapefile names: - # Well Counts - well_count_file_names = pd.DataFrame({ - 'data_year': [2011, 2014, 2017, - 2018, 2019, 2020, 2021, 2022], - 'file_name': ['USA_698_NOFILL.txt', 'USA_698_NOFILL.txt', 'USA_698_NOFILL.txt', - 'GAS_WELLS', 'GAS_WELLS', 'GAS_WELL', '_698', 'GasWells'], - }) - # Well-Level Production Volumes - gas_prod_file_names = pd.DataFrame({ - 'data_year': [2011, 2014, 2017, - 2018, 2019, 2020, 2021, 2022], - 'file_name': ['USA_696_NOFILL.txt', 'USA_696_NOFILL.txt', 'USA_696_NOFILL.txt', - 'GAS_PRODUCTION', 'GAS_PRODUCTION', 'GAS_PRODUCTION', '_696', 'GasProduction'], - }) - # Water Production Volumes - water_prod_file_names = pd.DataFrame({ - 'data_year': [2011, 2014, 2017, - 2018, 2019, 2020, 2021, 2022], - 'file_name': ['USA_6832_NOFILL.txt', 'USA_6832_NOFILL.txt', 'USA_6832_NOFILL.txt', - 'PRODUCED_WATER_GAS', 'PRODUCED_WATER_GAS', 'PRODUCED_WATER_GAS', '_6832', 'ProducedWaterGasWells'], - }) - # Well Completions - comp_count_file_names = pd.DataFrame({ - 'data_year': [2011, 2014, 2017, - 2018, 2019, 2020, 2021, 2022], - 'file_name': ['USA_678_NOFILL.txt', 'USA_678_NOFILL.txt', 'USA_678_NOFILL.txt', - 'COMPLETIONS_GAS', 'COMPLETIONS_GAS', 'COMPLETIONS_GAS', '_678', 'GasWellCompletions'], - }) - # Drilled Gas Wells - spud_count_file_names = pd.DataFrame({ - 'data_year': [2011, 2014, 2017, - 2018, 2019, 2020, 2021, 2022], - 'file_name': ['USA_671_NOFILL.txt', 'USA_671_NOFILL.txt', 'USA_671_NOFILL.txt', - 'SPUD_GAS', 'SPUD_GAS', 'SPUD_GAS', '_671', 'SpudCountGasWells'], - }) - - - def get_nei_file_name(nei_data_year, nei_file_names): - nei_file_name = nei_file_names[nei_file_names['data_year'] == nei_data_year]['file_name'].values[0] - return nei_file_name - - - for iyear in years: - nei_data_year = nei_data_years[nei_data_years['year'] == iyear]['nei_data'].values[0] - # Well Count - ifile_name = get_nei_file_name(nei_data_year, well_count_file_names) - nei_all_well_count_iyear = get_NEI_data(iyear, nei_data_year, ifile_name) - nei_all_well_count_df = pd.concat([nei_all_well_count_df, nei_all_well_count_iyear]) - # Gas Production - ifile_name = get_nei_file_name(nei_data_year, gas_prod_file_names) - nei_all_well_prod_iyear = get_NEI_data(iyear, nei_data_year, ifile_name) - nei_all_well_prod_df = pd.concat([nei_all_well_prod_df, nei_all_well_prod_iyear]) - # Water Production - ifile_name = get_nei_file_name(nei_data_year, water_prod_file_names) - nei_water_prod_iyear = get_NEI_data(iyear, nei_data_year, ifile_name) - nei_water_prod_df = pd.concat([nei_water_prod_df, nei_water_prod_iyear]) - # Completions Count - ifile_name = get_nei_file_name(nei_data_year, comp_count_file_names) - nei_conv_well_comp_iyear = get_NEI_data(iyear, nei_data_year, ifile_name) - nei_conv_well_comp_df = pd.concat([nei_conv_well_comp_df, nei_conv_well_comp_iyear]) - # Spud Count
ifile_name = get_nei_file_name(nei_data_year, spud_count_file_names) - nei_drilled_well_iyear = get_NEI_data(iyear, nei_data_year, ifile_name) - nei_drilled_well_df = pd.concat([nei_drilled_well_df, nei_drilled_well_iyear]) - - # Copy Data to Other Dataframes - nei_conv_well_count_df = nei_all_well_count_df.copy() - nei_hf_well_count_df = nei_all_well_count_df.copy() - nei_basin_other_prod_df = nei_all_well_prod_df.copy() - nei_hf_well_comp_df = nei_conv_well_comp_df.copy() - - # Add NEI Data to Enverus Data - # Well Counts - all_well_count_gdf = pd.concat([all_well_count_gdf, nei_all_well_count_df]).reset_index(drop=True) - conv_well_count_gdf = pd.concat([conv_well_count_gdf, nei_conv_well_count_df]).reset_index(drop=True) - hf_well_count_gdf = pd.concat([hf_well_count_gdf, nei_hf_well_count_df]).reset_index(drop=True) - # Well-Level Production Volumes - all_well_prod_gdf = pd.concat([all_well_prod_gdf, nei_all_well_prod_df]).reset_index(drop=True) - basin_220_prod_gdf = basin_220_prod_df.reset_index(drop=True) # No IL/IN data to add - basin_395_prod_gdf = basin_395_prod_df.reset_index(drop=True) # No IL/IN data to add - basin_430_prod_gdf = basin_430_prod_df.reset_index(drop=True) # No IL/IN data to add - basin_other_prod_gdf = pd.concat([basin_other_prod_gdf, nei_basin_other_prod_df]).reset_index(drop=True) - # Water Production Volumes - water_prod_gdf = pd.concat([water_prod_gdf, nei_water_prod_df]).reset_index(drop=True) - # Well Completions - conv_well_comp_gdf = pd.concat([conv_well_comp_gdf, nei_conv_well_comp_df]).reset_index(drop=True) - hf_well_comp_gdf = pd.concat([hf_well_comp_gdf, nei_hf_well_comp_df]).reset_index(drop=True) - # Drilled Gas Wells - drilled_well_gdf = pd.concat([drilled_well_gdf, nei_drilled_well_df]).reset_index(drop=True) - # Offshore Well Counts and Production Volumes in State Waters in the Gulf of Mexico - state_gom_offshore_well_count_gdf = state_gom_offshore_well_count_df.reset_index(drop=True) # No IL/IN data to add - state_gom_offshore_well_prod_gdf = state_gom_offshore_well_prod_df.reset_index(drop=True) # No IL/IN data to add - - # Output Proxy Parquet Files - all_well_count_gdf.to_parquet(all_well_count_output_path) - conv_well_count_gdf.to_parquet(conv_well_count_output_path) - hf_well_count_gdf.to_parquet(hf_well_count_output_path) - all_well_prod_gdf.to_parquet(all_well_prod_output_path) - basin_220_prod_gdf.to_parquet(basin_220_prod_output_path) - basin_395_prod_gdf.to_parquet(basin_395_prod_output_path) - basin_430_prod_gdf.to_parquet(basin_430_prod_output_path) - basin_other_prod_gdf.to_parquet(basin_other_prod_output_path) - water_prod_gdf.to_parquet(water_prod_output_path) - conv_well_comp_gdf.to_parquet(conv_well_comp_output_path) - hf_well_comp_gdf.to_parquet(hf_well_comp_output_path) - drilled_well_gdf.to_parquet(drilled_well_output_path) - state_gom_offshore_well_count_gdf.to_parquet(state_gom_offshore_well_count_output_path) - state_gom_offshore_well_prod_gdf.to_parquet(state_gom_offshore_well_prod_output_path) - return None - - - - - diff --git a/gch4i/proxy_processing/task_ng_water_prod_proxy.py b/gch4i/proxy_processing/task_ng_water_prod_proxy.py new file mode 100644 index 0000000..9f4ad24 --- /dev/null +++ b/gch4i/proxy_processing/task_ng_water_prod_proxy.py @@ -0,0 +1,194 @@ +# %% +from pathlib import Path +import os +from typing import Annotated +from zipfile import ZipFile +import calendar +import datetime + +from pyarrow import parquet +import pandas as pd +import osgeo +import geopandas as gpd +import numpy as np +import 
seaborn as sns +import shapefile as shp +from pytask import Product, task, mark + +from gch4i.config import ( + V3_DATA_PATH, + proxy_data_dir_path, + global_data_dir_path, + sector_data_dir_path, + max_year, + min_year, + years, +) + +from gch4i.utils import us_state_to_abbrev +from gch4i.proxy_processing.ng_oil_production_utils import ( + calc_enverus_rel_emi, + enverus_df_to_gdf, + nei_data_years, + get_nei_file_name, + ng_water_prod_file_names, + get_raw_NEI_data, +) + +# %% +@mark.persist +@task(id="ng_water_prod_proxy") +def task_get_ng_water_prod_proxy_data( + state_path: Path = global_data_dir_path / "tl_2020_us_state.zip", + enverus_production_path: Path = sector_data_dir_path / "enverus/production", + intermediate_outputs_path: Path = enverus_production_path / "intermediate_outputs", + nei_path: Path = sector_data_dir_path / "nei_og", + water_prod_output_path: Annotated[Path, Product] = proxy_data_dir_path / "ng_water_prod_proxy.parquet", + ): + """ + Data come from Enverus, both Drilling Info (DI) and Prism. + Two datasets are used because Prism does not include all states; the remaining + states, or those with better DI coverage, are taken from DI. + + DI: KS, MD, MI, MO, OK, TN + + Prism: AK, AL, AR, AZ, CA, CAO (California Offshore), CO, FL, KY, LA, MS, MT, ND, + NE, NGOM (federal offshore waters in the Gulf of Mexico), NM, NV, NY, OH, OR, PA, + SD, TX, UT, VA, WV, WY + + States with no Enverus Data: CT, DE, DC, GA, HI, ID, IL*, IN*, IA, ME, MA, MN, NH, + NJ, NC, RI, SC, VT, WA, WI, US territories. These states are assumed to have no oil + and gas production, with an exception for IL and IN. + + *IL and IN do not report to Enverus, but do have oil and gas production. Production + data is taken from the Energy Information Administration (EIA). + + """ + + # Load in State ANSI data + state_gdf = ( + gpd.read_file(state_path) + .loc[:, ["NAME", "STATEFP", "STUSPS", "geometry"]] + .rename(columns=str.lower) + .rename(columns={"stusps": "state_code", "name": "state_name"}) + .astype({"statefp": int}) + # get only lower 48 + DC + .query("(statefp < 60) & (statefp != 2) & (statefp != 15)") + .reset_index(drop=True) + .to_crs(4326) + )
+ + # Make annual gridded arrays (maps) of well data (a well will be counted every month if there is any production that year) + # Includes NA gas wells and production onshore in the CONUS region + # Source emissions are related to the presence of a well and its production status (no emission if no production) + # Details: ERG does not include a well in the national count if there is no (cumulative) oil or gas production from that well. + # Wells are not considered active for a given year if there is no production data that year. + # This may cause wells that are completed but not yet producing to be dropped from the national count. + # ERG has developed their own logic to determine if a well is an HF well or not, and that result is included in the + # HF variable in this dataset. This method does not rely on the Enverus well 'Producing Status'. + # Well type (e.g., non-associated gas well) is determined based on the annual production GOR at that well (CUM_GAS / CUM_OIL), + # but the presence of a well will only be included in maps in months where monthly gas prod > 0 + + # Proxy Data Dataframes: + water_prod_df = pd.DataFrame() + + ## Enverus DI and Prism Data: + # Read in and query the formatted and corrected Enverus data to create dictionaries of + # proxy data (Enverus data is from task_enverus_di_prism_data_processing.py) + for iyear in years: + enverus_file_name_iyear = f"formatted_raw_enverus_tempoutput_{iyear}.csv" + enverus_file_path_iyear = os.path.join(intermediate_outputs_path, enverus_file_name_iyear) + ng_data_temp = (pd.read_csv(enverus_file_path_iyear, dtype={3:'str', 'spud_year': str, 'first_prod_year': str}) + .query("STATE_CODE.isin(@state_gdf['state_code'])") + .query("OFFSHORE == 'N'") + .query("CUM_GAS > 0") + .assign(gas_to_oil_ratio=lambda df: df['CUM_GAS']/df['CUM_OIL']) + .assign(year=str(iyear)) + .replace(np.inf, 0) + .astype({"spud_year": str, "first_prod_year": str}) + .query("gas_to_oil_ratio > 100 | GOR_QUAL == 'Gas only'") + ) + + # Include wells in the map only for months where there is gas production (emissions ~ when production is occurring) + for imonth in range(1,13): + imonth_str = f"{imonth:02}" # convert to 2-digit months + year_month_str = str(iyear)+'-'+imonth_str + gas_prod_str = 'GASPROD_'+imonth_str + water_prod_str = 'WATERPROD_'+imonth_str + # Onshore data for imonth + ng_data_imonth_temp = (ng_data_temp + .query(f"{gas_prod_str} > 0") + .assign(year_month=str(iyear)+'-'+imonth_str) + ) + ng_data_imonth_temp = (ng_data_imonth_temp[[ + 'year', 'year_month','STATE_CODE','AAPG_CODE_ERG','LATITUDE','LONGITUDE', + 'HF','WELL_COUNT',gas_prod_str,water_prod_str, + 'comp_year_month','spud_year','first_prod_year']] + ) + # Water Production + # Data source by state is defined in the Enverus DrillingInfo Processing - Produced + # Water_2023-11-14_forGridding.xlsx file.
+ if iyear < 2016: # WV uses NEI data + water_prod_enverus_states = ['AK','AL','AR','AZ','CA','CO','FL','LA', + 'MI','MO','MS','MT','ND','NE','NM','NV', + 'NY','OH','SD','TX','UT','VA','WY' + ] + # States using NEI for reference: ['IL','IN','KS','OK','PA','WV'] + else: # 2016 and beyond; WV uses Enverus data + water_prod_enverus_states = ['AK','AL','AR','AZ','CA','CO','FL','LA', + 'MI','MO','MS','MT','ND','NE','NM','NV', + 'NY','OH','SD','TX','UT','VA','WY','WV' + ] # WV uses Enverus + # States using NEI for reference: ['IL','IN','KS','OK','PA'] + # Enverus water production for applicable states (NEI water production will + # be added in the NEI section of the code below) + water_prod_imonth = (ng_data_imonth_temp[['year','year_month','STATE_CODE','LATITUDE','LONGITUDE',water_prod_str]] + .query("STATE_CODE.isin(@water_prod_enverus_states)") + .assign(proxy_data=lambda df: df[water_prod_str]) + .drop(columns=[water_prod_str]) + .rename(columns=lambda x: str(x).lower()) + .reset_index(drop=True) + ) + water_prod_df = pd.concat([water_prod_df,water_prod_imonth]) + + # Delete unused temp data + del ng_data_temp + del ng_data_imonth_temp + del water_prod_imonth + + # Calculate relative emissions and convert to a geodataframe + water_prod_df = calc_enverus_rel_emi(water_prod_df) + water_prod_df = enverus_df_to_gdf(water_prod_df) + + # NEI Data: + nei_df = pd.DataFrame() + + for iyear in years: + nei_data_year = nei_data_years[nei_data_years['year'] == iyear]['nei_data'].values[0] + # Water Production + ifile_name = get_nei_file_name(nei_data_year, ng_water_prod_file_names) + nei_iyear = get_raw_NEI_data(iyear, nei_data_year, ifile_name) + nei_df = pd.concat([nei_df, nei_iyear]) + + # Convert NEI Data to a GDF and each polygon to a centroid point + nei_df = gpd.GeoDataFrame(nei_df, crs=4326) + nei_df = nei_df.to_crs(3857) # projected CRS for centroid calculation + nei_df.loc[:, 'geometry'] = nei_df.loc[:, 'geometry'].centroid + nei_df = nei_df.to_crs(4326) + + # Add NEI Data to Enverus Data + water_prod_df = pd.concat([water_prod_df, nei_df]).reset_index(drop=True) + + # Delete unused temp data + del nei_iyear + del nei_df + + # Check that relative emissions sum to 1.0 for each state/year combination + sums = water_prod_df.groupby(["state_code", "year"])["rel_emi"].sum() # get sums to check normalization + assert np.isclose(sums, 1.0, atol=1e-8).all(), f"Relative emissions do not sum to 1 for each year and state; {sums}" # assert that the sums are close to 1 + + # Output Proxy Parquet Files + water_prod_df = water_prod_df.astype({'year':str}) + water_prod_df.to_parquet(water_prod_output_path) + + return None
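For reference, calc_enverus_rel_emi and enverus_df_to_gdf, imported above from ng_oil_production_utils, are assumed to mirror the helpers removed from the monolithic script earlier in this patch; a minimal sketch:

import geopandas as gpd
import pandas as pd

def calc_enverus_rel_emi(df: pd.DataFrame) -> pd.DataFrame:
    # Normalize proxy_data within each state/year so the values sum to 1
    # (groups with zero total are left at 0)
    df["rel_emi"] = df.groupby(["state_code", "year"])["proxy_data"].transform(
        lambda x: x / x.sum() if x.sum() > 0 else 0
    )
    return df.drop(columns="proxy_data")

def enverus_df_to_gdf(df: pd.DataFrame) -> gpd.GeoDataFrame:
    # Convert latitude/longitude columns into EPSG:4326 point geometries
    return (
        gpd.GeoDataFrame(df, geometry=gpd.points_from_xy(df["longitude"], df["latitude"], crs=4326))
        .drop(columns=["latitude", "longitude"])
        .loc[:, ["year", "year_month", "state_code", "rel_emi", "geometry"]]
    )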
diff --git a/gch4i/proxy_processing/task_ng_well_blowout_proxy.py b/gch4i/proxy_processing/task_ng_well_blowout_proxy.py new file mode 100644 index 0000000..4d7604d --- /dev/null +++ b/gch4i/proxy_processing/task_ng_well_blowout_proxy.py @@ -0,0 +1,75 @@ +# %% +import calendar +import datetime +from pathlib import Path +from typing import Annotated +from zipfile import ZipFile + +import geopandas as gpd +import numpy as np +import osgeo +import pandas as pd +import seaborn as sns +from pyarrow import parquet +from pytask import Product, mark, task + +from gch4i.config import ( + V3_DATA_PATH, + ghgi_data_dir_path, + global_data_dir_path, + max_year, + min_year, + proxy_data_dir_path, +) +from gch4i.utils import name_formatter + +# %% + + +@mark.persist +@task(id="ng_well_blowout_proxy") +def task_get_ng_well_blowout_proxy_data( + output_path: Annotated[Path, Product] = (proxy_data_dir_path / "ng_well_blowout_proxy.parquet"), +): + """ + Three well blowouts occurred over 2012-2022. Their locations and emissions are + provided directly by the GHGI sector leads and manually coded into the proxy. + + 1. LA in 2019 + state_code: LA; year: 2019; emi: 49 kt; lat: 32.1; lon: -93.4 + 2. OH in 2018 + state_code: OH; year: 2018; emi: 60 kt; lat: 39.864; lon: -80.861 + 3. TX in 2019 + state_code: TX; year: 2019; emi: 4.8 kt; lat: 28.9; lon: -97.6 + + """ + + well_blowout_df = pd.DataFrame( + {'state_code': ['LA', 'OH', 'TX'], + 'year': [2019, 2018, 2019], + 'rel_emi': [1.0, 1.0, 1.0], # assign 100% of the emissions to each state/year combination + 'lat': [32.1, 39.864, 28.9], + 'lon': [-93.4, -80.861, -97.6], + }) + + well_blowout_gdf = (gpd.GeoDataFrame( + well_blowout_df, + geometry=gpd.points_from_xy( + well_blowout_df["lon"], + well_blowout_df["lat"], + crs=4326 + ) + ) + .drop(columns=["lat", "lon"]) + .loc[:, ["year", "state_code", "rel_emi", "geometry"]] + ) + + # Check that relative emissions sum to 1.0 for each state/year combination + sums = well_blowout_gdf.groupby(["state_code", "year"])["rel_emi"].sum() # get sums to check normalization + assert np.isclose(sums, 1.0, atol=1e-8).all(), f"Relative emissions do not sum to 1 for each year and state; {sums}" # assert that the sums are close to 1 + + # Output Proxy Parquet Files + well_blowout_gdf = well_blowout_gdf.astype({'year':str}) + well_blowout_gdf.to_parquet(output_path) + + return None
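A note on the CRS handling used by the NEI sections of the surrounding proxy tasks: polygons are projected to EPSG:3857 before taking centroids and then converted back to EPSG:4326, because centroids computed directly in a geographic CRS trigger warnings and can be slightly distorted. A small self-contained sketch of the pattern (toy polygon, made-up coordinates):

import geopandas as gpd
from shapely.geometry import Polygon

cells = gpd.GeoDataFrame(
    geometry=[Polygon([(-88.0, 40.0), (-87.9, 40.0), (-87.9, 40.1), (-88.0, 40.1)])],
    crs=4326,
)
pts = cells.to_crs(3857)                  # project to a planar CRS first
pts["geometry"] = pts.geometry.centroid   # centroid is well-defined here
pts = pts.to_crs(4326)                    # back to lat/lon for the proxy output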
diff --git a/gch4i/proxy_processing/task_oil_all_well_count_proxy.py b/gch4i/proxy_processing/task_oil_all_well_count_proxy.py new file mode 100644 index 0000000..299afc9 --- /dev/null +++ b/gch4i/proxy_processing/task_oil_all_well_count_proxy.py @@ -0,0 +1,176 @@ +# %% +from pathlib import Path +import os +from typing import Annotated +from zipfile import ZipFile +import calendar +import datetime + +from pyarrow import parquet +import pandas as pd +import osgeo +import geopandas as gpd +import numpy as np +import seaborn as sns +import shapefile as shp +from pytask import Product, task, mark + +from gch4i.config import ( + V3_DATA_PATH, + proxy_data_dir_path, + global_data_dir_path, + sector_data_dir_path, + max_year, + min_year, + years, +) + +from gch4i.utils import us_state_to_abbrev +from gch4i.proxy_processing.ng_oil_production_utils import ( + calc_enverus_rel_emi, + enverus_df_to_gdf, + nei_data_years, + get_nei_file_name, + oil_well_count_file_names, + get_raw_NEI_data, +) + +# %% +@mark.persist +@task(id="oil_all_well_count_proxy") +def task_get_oil_all_well_count_proxy_data( + state_path: Path = global_data_dir_path / "tl_2020_us_state.zip", + enverus_production_path: Path = sector_data_dir_path / "enverus/production", + intermediate_outputs_path: Path = enverus_production_path / "intermediate_outputs", + nei_path: Path = sector_data_dir_path / "nei_og", + all_well_count_output_path: Annotated[Path, Product] = proxy_data_dir_path / "oil_all_well_count_proxy.parquet", + ): + """ + Data come from Enverus, both Drilling Info (DI) and Prism. + Two datasets are used because Prism does not include all states; the remaining + states, or those with better DI coverage, are taken from DI. + + DI: KS, MD, MI, MO, OK, TN + + Prism: AK, AL, AR, AZ, CA, CAO (California Offshore), CO, FL, KY, LA, MS, MT, ND, + NE, NGOM (federal offshore waters in the Gulf of Mexico), NM, NV, NY, OH, OR, PA, + SD, TX, UT, VA, WV, WY + + States with no Enverus Data: CT, DE, DC, GA, HI, ID, IL*, IN*, IA, ME, MA, MN, NH, + NJ, NC, RI, SC, VT, WA, WI, US territories. These states are assumed to have no oil + and gas production, with an exception for IL and IN. + + *IL and IN do not report to Enverus, but do have oil and gas production. Production + data is taken from the Energy Information Administration (EIA). + + """ + + # Load in State ANSI data + state_gdf = ( + gpd.read_file(state_path) + .loc[:, ["NAME", "STATEFP", "STUSPS", "geometry"]] + .rename(columns=str.lower) + .rename(columns={"stusps": "state_code", "name": "state_name"}) + .astype({"statefp": int}) + # get only lower 48 + DC + .query("(statefp < 60) & (statefp != 2) & (statefp != 15)") + .reset_index(drop=True) + .to_crs(4326) + ) + + # Make annual gridded arrays (maps) of well data (a well will be counted every month if there is any production that year) + # Includes NA wells and production onshore in the CONUS region + # Source emissions are related to the presence of a well and its production status (no emission if no production) + # Details: ERG does not include a well in the national count if there is no (cumulative) oil or gas production from that well. + # Wells are not considered active for a given year if there is no production data that year. + # This may cause wells that are completed but not yet producing to be dropped from the national count. + # ERG has developed their own logic to determine if a well is an HF well or not, and that result is included in the + # HF variable in this dataset. This method does not rely on the Enverus well 'Producing Status'. + # Well type (e.g., non-associated oil well) is determined based on the annual production GOR at that well (CUM_GAS / CUM_OIL), + # but the presence of a well will only be included in maps in months where monthly oil or ng prod > 0 + + # Proxy Data Dataframes: + all_well_count_df = pd.DataFrame() # Active well (conventional + HF) counts in a given month + + ## Enverus DI and Prism Data: + # Read in and query the formatted and corrected Enverus data to create dictionaries of + # proxy data (Enverus data is from task_enverus_di_prism_data_processing.py) + for iyear in years: + enverus_file_name_iyear = f"formatted_raw_enverus_tempoutput_{iyear}.csv" + enverus_file_path_iyear = os.path.join(intermediate_outputs_path, enverus_file_name_iyear) + oil_data_temp = (pd.read_csv(enverus_file_path_iyear, dtype={3:'str', 'spud_year': str, 'first_prod_year': str}) + .query("STATE_CODE.isin(@state_gdf['state_code'])") + .query("OFFSHORE == 'N'") + .query("CUM_OIL > 0") + .assign(gas_to_oil_ratio=lambda df: df['CUM_GAS']/df['CUM_OIL']) + .assign(year=str(iyear)) + .replace(np.inf, 0) + .astype({"spud_year": str, "first_prod_year": str}) + .query("gas_to_oil_ratio <= 100") + .query("GOR_QUAL == 'Liq only' | GOR_QUAL == 'Liq+Gas'") + ) + + # Include wells in the map only for months where there is oil production (emissions ~ when production is occurring) + for imonth in range(1,13): + imonth_str = f"{imonth:02}" # convert to 2-digit months + year_month_str = str(iyear)+'-'+imonth_str + oil_prod_str = 'OILPROD_'+imonth_str + # Onshore data for imonth + oil_data_imonth_temp = (oil_data_temp + .query(f"{oil_prod_str} > 0") + .assign(year_month=str(iyear)+'-'+imonth_str) + ) + oil_data_imonth_temp = (oil_data_imonth_temp[[ + 'year', 'year_month','STATE_CODE','AAPG_CODE_ERG','LATITUDE','LONGITUDE', + 'HF','WELL_COUNT',oil_prod_str, + 'comp_year_month','spud_year','first_prod_year']] + ) + # All Oil Well Count + all_well_count_imonth = (oil_data_imonth_temp[['year','year_month','STATE_CODE','LATITUDE','LONGITUDE','WELL_COUNT']] + .rename(columns=lambda x:
str(x).lower()) + .rename(columns={"well_count":"proxy_data"}) + .reset_index(drop=True) + ) + all_well_count_df = pd.concat([all_well_count_df,all_well_count_imonth]) + + # Delete unused temp data + del oil_data_temp + del oil_data_imonth_temp + del all_well_count_imonth + + # Calculate relative emissions and convert to a geodataframe + all_well_count_df = calc_enverus_rel_emi(all_well_count_df) + all_well_count_df = enverus_df_to_gdf(all_well_count_df) + + # NEI Data: + nei_df = pd.DataFrame() + + for iyear in years: + nei_data_year = nei_data_years[nei_data_years['year'] == iyear]['nei_data'].values[0] + # Well Count + ifile_name = get_nei_file_name(nei_data_year, oil_well_count_file_names) + nei_iyear = get_raw_NEI_data(iyear, nei_data_year, ifile_name) + nei_df = pd.concat([nei_df, nei_iyear]) + + # Convert NEI Data to GDF and polygon to centroid point + nei_df = gpd.GeoDataFrame(nei_df, crs=4326) + nei_df = nei_df.to_crs(3857) # projected CRS for centroid calculation + nei_df.loc[:, 'geometry'] = nei_df.loc[:, 'geometry'].centroid + nei_df = nei_df.to_crs(4326) + + # Add NEI Data to Enverus Data + all_well_count_df = pd.concat([all_well_count_df, nei_df]).reset_index(drop=True) + + # Delete unused temp data + del nei_iyear + del nei_df + + # Check that relative emissions sum to 1.0 each state/year combination + sums = all_well_count_df.groupby(["state_code", "year"])["rel_emi"].sum() # get sums to check normalization + assert np.isclose(sums, 1.0, atol=1e-8).all(), f"Relative emissions do not sum to 1 for each year and state; {sums}" # assert that the sums are close to 1 + + # Output Proxy Parquet Files + all_well_count_df = all_well_count_df.astype({'year':str}) + all_well_count_df.to_parquet(all_well_count_output_path) + + return None diff --git a/gch4i/proxy_processing/task_oil_all_well_prod_proxy.py b/gch4i/proxy_processing/task_oil_all_well_prod_proxy.py new file mode 100644 index 0000000..bee4145 --- /dev/null +++ b/gch4i/proxy_processing/task_oil_all_well_prod_proxy.py @@ -0,0 +1,177 @@ +# %% +from pathlib import Path +import os +from typing import Annotated +from zipfile import ZipFile +import calendar +import datetime + +from pyarrow import parquet +import pandas as pd +import osgeo +import geopandas as gpd +import numpy as np +import seaborn as sns +import shapefile as shp +from pytask import Product, task, mark + +from gch4i.config import ( + V3_DATA_PATH, + proxy_data_dir_path, + global_data_dir_path, + sector_data_dir_path, + max_year, + min_year, + years, +) + +from gch4i.utils import us_state_to_abbrev +from gch4i.proxy_processing.ng_oil_production_utils import ( + calc_enverus_rel_emi, + enverus_df_to_gdf, + nei_data_years, + get_nei_file_name, + oil_oil_prod_file_names, + get_raw_NEI_data, +) + +# %% +@mark.persist +@task(id="oil_all_well_prod_proxy") +def task_get_oil_all_well_prod_proxy_data( + state_path: Path = global_data_dir_path / "tl_2020_us_state.zip", + enverus_production_path: Path = sector_data_dir_path / "enverus/production", + intermediate_outputs_path: Path = enverus_production_path / "intermediate_outputs", + nei_path: Path = sector_data_dir_path / "nei_og", + all_well_prod_output_path: Annotated[Path, Product] = proxy_data_dir_path / "oil_all_well_prod_proxy.parquet", + ): + """ + Data come from Enverus, both Drilling Info and Prism + The reason 2 datasets are used is because Prism does not include all states + So remaining states, or those with more DI coverage are taken from DI + + DI: KS, MD, MI, MO, OK, TN + + Prism: AK, AL, AR, AZ, CA, CAO 
(California Offshore), CO, FL, KY, LA, MS, MT, ND, + NE, NGOM (federal offshore waters in the Gulf of Mexico), NM, NV, NY, OH, OR, PA, + SD, TX, UT, VA, WV, WY + + States with no Enverus Data: CT, DE, DC, GA, HI, ID, IL*, IN*, IA, ME, MA, MN, NH, + NJ, NC, RI, SC, VT, WA, WI, US territories. These states are assumed to have no oil + and gas production with an exception for IL and IN. + + *IL and IN do not report to Enverus, but do have oil and gas production. Production + data is taken from the Energy Information Administration (EIA). + + """ + + # Load in State ANSI data + state_gdf = ( + gpd.read_file(state_path) + .loc[:, ["NAME", "STATEFP", "STUSPS", "geometry"]] + .rename(columns=str.lower) + .rename(columns={"stusps": "state_code", "name": "state_name"}) + .astype({"statefp": int}) + # get only lower 48 + DC + .query("(statefp < 60) & (statefp != 2) & (statefp != 15)") + .reset_index(drop=True) + .to_crs(4326) + ) + + # Make Annual gridded arrays (maps) of well data (a well will be counted every month if there is any production that year) + # Includes NA Gas Wells and Production onshore in the CONUS region + # source emissions are related to the presence of a well and its production status (no emission if no production) + # Details: ERG does not include a well in the national count if there is no (cummulative) oil or gas production from that well. + # Wells are not considered active for a given year if there is no production data that year + # This may cause wells that are coadmpleted but not yet producing to be dropped from the national count. + # ERG has developed their own logic to determine if a well is an HF well or not and that result is included in the + # HF variable in this dataset. This method does not rely on the Enverus well 'Producing Status' + # Well Type (e.g., non-associated gas well) is determined based on annual production GOR at that well (CUM OIL/ CUM GAS), + # but the presence of a well will only be included in maps in months where monthly gas prod > 0 + + # Proxy Data Dataframes: + all_well_prod_df = pd.DataFrame() # Active oil well (conventional + HF) oil production in a given month + + ## Enverus DI and Prism Data: + # Read in and query formatted and corrrected Enverus data to create dictionaries of + # proxy data (Enverus data is from task_enverus_di_prism_data_processing.py) + for iyear in years: + enverus_file_name_iyear = f"formatted_raw_enverus_tempoutput_{iyear}.csv" + enverus_file_path_iyear = os.path.join(intermediate_outputs_path, enverus_file_name_iyear) + oil_data_temp = (pd.read_csv(enverus_file_path_iyear, dtype={3:'str', 'spud_year': str, 'first_prod_year': str}) + .query("STATE_CODE.isin(@state_gdf['state_code'])") + .query("OFFSHORE == 'N'") + .query("CUM_OIL > 0") + .assign(gas_to_oil_ratio=lambda df: df['CUM_GAS']/df['CUM_OIL']) + .assign(year=str(iyear)) + .replace(np.inf, 0) + .astype({"spud_year": str, "first_prod_year": str}) + .query("gas_to_oil_ratio <= 100") + .query("GOR_QUAL == 'Liq only' | GOR_QUAL == 'Liq+Gas'") + ) + + # Include wells in map only for months where there is oil production (emissions ~ when production is occuring) + for imonth in range(1,13): + imonth_str = f"{imonth:02}" # convert to 2-digit months + year_month_str = str(iyear)+'-'+imonth_str + oil_prod_str = 'OILPROD_'+imonth_str + # Onshore data for imonth + oil_data_imonth_temp = (oil_data_temp + .query(f"{oil_prod_str} > 0") + .assign(year_month=str(iyear)+'-'+imonth_str) + ) + oil_data_imonth_temp = (oil_data_imonth_temp[[ + 'year', 
'year_month','STATE_CODE','AAPG_CODE_ERG','LATITUDE','LONGITUDE', + 'HF','WELL_COUNT',oil_prod_str, + 'comp_year_month','spud_year','first_prod_year']] + ) + # All Well Oil Production + all_well_prod_imonth = (oil_data_imonth_temp[['year','year_month','STATE_CODE','LATITUDE','LONGITUDE',oil_prod_str]] + .assign(proxy_data=lambda df: df[oil_prod_str]) + .drop(columns=[oil_prod_str]) + .rename(columns=lambda x: str(x).lower()) + .reset_index(drop=True) + ) + all_well_prod_df = pd.concat([all_well_prod_df,all_well_prod_imonth]) + + # Delete unused temp data + del oil_data_temp + del oil_data_imonth_temp + del all_well_prod_imonth + + # Calculate relative emissions and convert to a geodataframe + all_well_prod_df = calc_enverus_rel_emi(all_well_prod_df) + all_well_prod_df = enverus_df_to_gdf(all_well_prod_df) + + # NEI Data: + nei_df = pd.DataFrame() + + for iyear in years: + nei_data_year = nei_data_years[nei_data_years['year'] == iyear]['nei_data'].values[0] + # Gas Production + ifile_name = get_nei_file_name(nei_data_year, oil_oil_prod_file_names) + nei_iyear = get_raw_NEI_data(iyear, nei_data_year, ifile_name) + nei_df = pd.concat([nei_df, nei_iyear]) + + # Convert NEI Data to GDF and polygon to centroid point + nei_df = gpd.GeoDataFrame(nei_df, crs=4326) + nei_df = nei_df.to_crs(3857) # projected CRS for centroid calculation + nei_df.loc[:, 'geometry'] = nei_df.loc[:, 'geometry'].centroid + nei_df = nei_df.to_crs(4326) + + # Add NEI Data to Enverus Data + all_well_prod_df = pd.concat([all_well_prod_df, nei_df]).reset_index(drop=True) + + # Delete unused temp data + del nei_iyear + del nei_df + + # Check that relative emissions sum to 1.0 each state/year combination + sums = all_well_prod_df.groupby(["state_code", "year"])["rel_emi"].sum() # get sums to check normalization + assert np.isclose(sums, 1.0, atol=1e-8).all(), f"Relative emissions do not sum to 1 for each year and state; {sums}" # assert that the sums are close to 1 + + # Output Proxy Parquet Files + all_well_prod_df = all_well_prod_df.astype({'year':str}) + all_well_prod_df.to_parquet(all_well_prod_output_path) + + return None diff --git a/gch4i/proxy_processing/task_oil_basin_220_prod_proxy.py b/gch4i/proxy_processing/task_oil_basin_220_prod_proxy.py new file mode 100644 index 0000000..3258c9b --- /dev/null +++ b/gch4i/proxy_processing/task_oil_basin_220_prod_proxy.py @@ -0,0 +1,154 @@ +# %% +from pathlib import Path +import os +from typing import Annotated +from zipfile import ZipFile +import calendar +import datetime + +from pyarrow import parquet +import pandas as pd +import osgeo +import geopandas as gpd +import numpy as np +import seaborn as sns +import shapefile as shp +from pytask import Product, task, mark + +from gch4i.config import ( + V3_DATA_PATH, + proxy_data_dir_path, + global_data_dir_path, + sector_data_dir_path, + max_year, + min_year, + years, +) + +from gch4i.utils import us_state_to_abbrev +from gch4i.proxy_processing.ng_oil_production_utils import ( + calc_enverus_rel_emi, + enverus_df_to_gdf, +) + +# %% +@mark.persist +@task(id="oil_basin_220_prod_proxy") +def task_get_oil_basin_220_prod_proxy_data( + state_path: Path = global_data_dir_path / "tl_2020_us_state.zip", + enverus_production_path: Path = sector_data_dir_path / "enverus/production", + intermediate_outputs_path: Path = enverus_production_path / "intermediate_outputs", + basin_220_prod_output_path: Annotated[Path, Product] = proxy_data_dir_path / "oil_basin_220_prod_proxy.parquet", + ): + """ + Data come from Enverus, both Drilling Info and Prism 
+    The reason two datasets are used is that Prism does not include all states;
+    the remaining states, or those with better DI coverage, are taken from DI.
+
+    DI: KS, MD, MI, MO, OK, TN
+
+    Prism: AK, AL, AR, AZ, CA, CAO (California Offshore), CO, FL, KY, LA, MS, MT, ND,
+    NE, NGOM (federal offshore waters in the Gulf of Mexico), NM, NV, NY, OH, OR, PA,
+    SD, TX, UT, VA, WV, WY
+
+    States with no Enverus Data: CT, DE, DC, GA, HI, ID, IL*, IN*, IA, ME, MA, MN, NH,
+    NJ, NC, RI, SC, VT, WA, WI, US territories. These states are assumed to have no oil
+    and gas production, with the exception of IL and IN.
+
+    *IL and IN do not report to Enverus, but do have oil and gas production. Production
+    data is taken from the Energy Information Administration (EIA).
+
+    """
+
+    # Load in State ANSI data
+    state_gdf = (
+        gpd.read_file(state_path)
+        .loc[:, ["NAME", "STATEFP", "STUSPS", "geometry"]]
+        .rename(columns=str.lower)
+        .rename(columns={"stusps": "state_code", "name": "state_name"})
+        .astype({"statefp": int})
+        # get only lower 48 + DC
+        .query("(statefp < 60) & (statefp != 2) & (statefp != 15)")
+        .reset_index(drop=True)
+        .to_crs(4326)
+    )
+
+    # Make annual gridded arrays (maps) of well data (a well is counted in every month of a year that has any production)
+    # Includes NA oil wells and production onshore in the CONUS region.
+    # Source emissions are related to the presence of a well and its production status (no emission if no production).
+    # Details: ERG does not include a well in the national count if there is no (cumulative) oil or gas production from that well.
+    # Wells are not considered active for a given year if there is no production data that year.
+    # This may cause wells that are completed but not yet producing to be dropped from the national count.
+    # ERG has developed their own logic to determine whether a well is an HF well, and that result is included in the
+    # HF variable in this dataset. 
This method does not rely on the Enverus well 'Producing Status'.
+    # Well type (e.g., non-associated oil well) is determined from the annual production GOR at that well (CUM GAS / CUM OIL),
+    # but the presence of a well is only included in maps for months where monthly oil production > 0.
+
+    # Proxy Data Dataframes:
+    basin_220_prod_df = pd.DataFrame()  # Oil well oil production in Basin 220 in a given month
+
+    ## Enverus DI and Prism Data:
+    # Read in and query formatted and corrected Enverus data to create dictionaries of
+    # proxy data (Enverus data is from task_enverus_di_prism_data_processing.py)
+    for iyear in years:
+        enverus_file_name_iyear = f"formatted_raw_enverus_tempoutput_{iyear}.csv"
+        enverus_file_path_iyear = os.path.join(intermediate_outputs_path, enverus_file_name_iyear)
+        oil_data_temp = (pd.read_csv(enverus_file_path_iyear, dtype={3: 'str', 'spud_year': str, 'first_prod_year': str})
+                         .query("STATE_CODE.isin(@state_gdf['state_code'])")
+                         .query("OFFSHORE == 'N'")
+                         .query("CUM_OIL > 0")
+                         .assign(gas_to_oil_ratio=lambda df: df['CUM_GAS']/df['CUM_OIL'])
+                         .assign(year=str(iyear))
+                         .replace(np.inf, 0)
+                         .astype({"spud_year": str, "first_prod_year": str})
+                         .query("gas_to_oil_ratio <= 100")
+                         .query("GOR_QUAL == 'Liq only' | GOR_QUAL == 'Liq+Gas'")
+                         )
+
+        # Include wells in the map only for months with oil production (emissions occur only while production is occurring)
+        for imonth in range(1, 13):
+            imonth_str = f"{imonth:02}"  # convert to 2-digit months
+            year_month_str = str(iyear)+'-'+imonth_str
+            oil_prod_str = 'OILPROD_'+imonth_str
+            # Onshore data for imonth
+            oil_data_imonth_temp = (oil_data_temp
+                                    .query(f"{oil_prod_str} > 0")
+                                    .assign(year_month=str(iyear)+'-'+imonth_str)
+                                    )
+            oil_data_imonth_temp = (oil_data_imonth_temp[[
+                'year', 'year_month', 'STATE_CODE', 'AAPG_CODE_ERG', 'LATITUDE', 'LONGITUDE',
+                'HF', 'WELL_COUNT', oil_prod_str,
+                'comp_year_month', 'spud_year', 'first_prod_year']]
+                )
+            # Basin 220 Oil Production
+            basin_220_prod_imonth = (oil_data_imonth_temp[['year', 'year_month', 'STATE_CODE', 'LATITUDE', 'LONGITUDE', 'AAPG_CODE_ERG', oil_prod_str]]
+                                     .query("AAPG_CODE_ERG == '220'")
+                                     .assign(proxy_data=lambda df: df[oil_prod_str])
+                                     .drop(columns=[oil_prod_str, 'AAPG_CODE_ERG'])
+                                     .rename(columns=lambda x: str(x).lower())
+                                     .reset_index(drop=True)
+                                     )
+            basin_220_prod_df = pd.concat([basin_220_prod_df, basin_220_prod_imonth])
+
+    # Delete unused temp data
+    del oil_data_temp
+    del oil_data_imonth_temp
+    del basin_220_prod_imonth
+
+    # Calculate relative emissions and convert to a geodataframe
+    basin_220_prod_df = calc_enverus_rel_emi(basin_220_prod_df)
+    basin_220_prod_df = enverus_df_to_gdf(basin_220_prod_df)
+
+    # NEI Data:
+    # No addition of NEI data because IL and IN are not in this basin; their EIA-based
+    # records are added to the "other" basin proxy instead.
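+
+    # For reference, a minimal sketch of the normalization that calc_enverus_rel_emi()
+    # is assumed to perform (its implementation lives in ng_oil_production_utils.py and
+    # is not part of this patch): proxy_data is scaled by its state/year total so that
+    # rel_emi sums to 1.0 within each state/year group, which is exactly what the
+    # assertion below verifies. The helper here is hypothetical and never called.
+    def _sketch_calc_rel_emi(df: pd.DataFrame) -> pd.DataFrame:
+        totals = df.groupby(["state_code", "year"])["proxy_data"].transform("sum")
+        # guard against division by zero for state/year groups with no production
+        return df.assign(rel_emi=np.where(totals > 0, df["proxy_data"] / totals, 0))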
+ + # Check that relative emissions sum to 1.0 each state/year combination + sums = basin_220_prod_df.groupby(["state_code", "year"])["rel_emi"].sum() # get sums to check normalization + assert np.isclose(sums, 1.0, atol=1e-8).all(), f"Relative emissions do not sum to 1 for each year and state; {sums}" # assert that the sums are close to 1 + + # Output Proxy Parquet Files + basin_220_prod_df = basin_220_prod_df.astype({'year':str}) + basin_220_prod_df.to_parquet(basin_220_prod_output_path) + + return None diff --git a/gch4i/proxy_processing/task_oil_basin_360_prod_proxy.py b/gch4i/proxy_processing/task_oil_basin_360_prod_proxy.py new file mode 100644 index 0000000..f970fbf --- /dev/null +++ b/gch4i/proxy_processing/task_oil_basin_360_prod_proxy.py @@ -0,0 +1,154 @@ +# %% +from pathlib import Path +import os +from typing import Annotated +from zipfile import ZipFile +import calendar +import datetime + +from pyarrow import parquet +import pandas as pd +import osgeo +import geopandas as gpd +import numpy as np +import seaborn as sns +import shapefile as shp +from pytask import Product, task, mark + +from gch4i.config import ( + V3_DATA_PATH, + proxy_data_dir_path, + global_data_dir_path, + sector_data_dir_path, + max_year, + min_year, + years, +) + +from gch4i.utils import us_state_to_abbrev +from gch4i.proxy_processing.ng_oil_production_utils import ( + calc_enverus_rel_emi, + enverus_df_to_gdf, +) + +# %% +@mark.persist +@task(id="oil_basin_360_prod_proxy") +def task_get_oil_basin_360_prod_proxy_data( + state_path: Path = global_data_dir_path / "tl_2020_us_state.zip", + enverus_production_path: Path = sector_data_dir_path / "enverus/production", + intermediate_outputs_path: Path = enverus_production_path / "intermediate_outputs", + basin_360_prod_output_path: Annotated[Path, Product] = proxy_data_dir_path / "oil_basin_360_prod_proxy.parquet", + ): + """ + Data come from Enverus, both Drilling Info and Prism + The reason 2 datasets are used is because Prism does not include all states + So remaining states, or those with more DI coverage are taken from DI + + DI: KS, MD, MI, MO, OK, TN + + Prism: AK, AL, AR, AZ, CA, CAO (California Offshore), CO, FL, KY, LA, MS, MT, ND, + NE, NGOM (federal offshore waters in the Gulf of Mexico), NM, NV, NY, OH, OR, PA, + SD, TX, UT, VA, WV, WY + + States with no Enverus Data: CT, DE, DC, GA, HI, ID, IL*, IN*, IA, ME, MA, MN, NH, + NJ, NC, RI, SC, VT, WA, WI, US territories. These states are assumed to have no oil + and gas production with an exception for IL and IN. + + *IL and IN do not report to Enverus, but do have oil and gas production. Production + data is taken from the Energy Information Administration (EIA). + + """ + + # Load in State ANSI data + state_gdf = ( + gpd.read_file(state_path) + .loc[:, ["NAME", "STATEFP", "STUSPS", "geometry"]] + .rename(columns=str.lower) + .rename(columns={"stusps": "state_code", "name": "state_name"}) + .astype({"statefp": int}) + # get only lower 48 + DC + .query("(statefp < 60) & (statefp != 2) & (statefp != 15)") + .reset_index(drop=True) + .to_crs(4326) + ) + + # Make Annual gridded arrays (maps) of well data (a well will be counted every month if there is any production that year) + # Includes NA Gas Wells and Production onshore in the CONUS region + # source emissions are related to the presence of a well and its production status (no emission if no production) + # Details: ERG does not include a well in the national count if there is no (cummulative) oil or gas production from that well. 
+    # Wells are not considered active for a given year if there is no production data that year.
+    # This may cause wells that are completed but not yet producing to be dropped from the national count.
+    # ERG has developed their own logic to determine whether a well is an HF well, and that result is included in the
+    # HF variable in this dataset. This method does not rely on the Enverus well 'Producing Status'.
+    # Well type (e.g., non-associated oil well) is determined from the annual production GOR at that well (CUM GAS / CUM OIL),
+    # but the presence of a well is only included in maps for months where monthly oil production > 0.
+
+    # Proxy Data Dataframes:
+    basin_360_prod_df = pd.DataFrame()  # Oil well oil production in Basin 360 in a given month
+
+    ## Enverus DI and Prism Data:
+    # Read in and query formatted and corrected Enverus data to create dictionaries of
+    # proxy data (Enverus data is from task_enverus_di_prism_data_processing.py)
+    for iyear in years:
+        enverus_file_name_iyear = f"formatted_raw_enverus_tempoutput_{iyear}.csv"
+        enverus_file_path_iyear = os.path.join(intermediate_outputs_path, enverus_file_name_iyear)
+        oil_data_temp = (pd.read_csv(enverus_file_path_iyear, dtype={3: 'str', 'spud_year': str, 'first_prod_year': str})
+                         .query("STATE_CODE.isin(@state_gdf['state_code'])")
+                         .query("OFFSHORE == 'N'")
+                         .query("CUM_OIL > 0")
+                         .assign(gas_to_oil_ratio=lambda df: df['CUM_GAS']/df['CUM_OIL'])
+                         .assign(year=str(iyear))
+                         .replace(np.inf, 0)
+                         .astype({"spud_year": str, "first_prod_year": str})
+                         .query("gas_to_oil_ratio <= 100")
+                         .query("GOR_QUAL == 'Liq only' | GOR_QUAL == 'Liq+Gas'")
+                         )
+
+        # Include wells in the map only for months with oil production (emissions occur only while production is occurring)
+        for imonth in range(1, 13):
+            imonth_str = f"{imonth:02}"  # convert to 2-digit months
+            year_month_str = str(iyear)+'-'+imonth_str
+            oil_prod_str = 'OILPROD_'+imonth_str
+            # Onshore data for imonth
+            oil_data_imonth_temp = (oil_data_temp
+                                    .query(f"{oil_prod_str} > 0")
+                                    .assign(year_month=str(iyear)+'-'+imonth_str)
+                                    )
+            oil_data_imonth_temp = (oil_data_imonth_temp[[
+                'year', 'year_month', 'STATE_CODE', 'AAPG_CODE_ERG', 'LATITUDE', 'LONGITUDE',
+                'HF', 'WELL_COUNT', oil_prod_str,
+                'comp_year_month', 'spud_year', 'first_prod_year']]
+                )
+            # Basin 360 Oil Production
+            basin_360_prod_imonth = (oil_data_imonth_temp[['year', 'year_month', 'STATE_CODE', 'LATITUDE', 'LONGITUDE', 'AAPG_CODE_ERG', oil_prod_str]]
+                                     .query("AAPG_CODE_ERG == '360'")
+                                     .assign(proxy_data=lambda df: df[oil_prod_str])
+                                     .drop(columns=[oil_prod_str, 'AAPG_CODE_ERG'])
+                                     .rename(columns=lambda x: str(x).lower())
+                                     .reset_index(drop=True)
+                                     )
+            basin_360_prod_df = pd.concat([basin_360_prod_df, basin_360_prod_imonth])
+
+    # Delete unused temp data
+    del oil_data_temp
+    del oil_data_imonth_temp
+    del basin_360_prod_imonth
+
+    # Calculate relative emissions and convert to a geodataframe
+    basin_360_prod_df = calc_enverus_rel_emi(basin_360_prod_df)
+    basin_360_prod_df = enverus_df_to_gdf(basin_360_prod_df)
+
+    # NEI Data:
+    # No addition of NEI data because IL and IN are not in this basin; their EIA-based
+    # records are added to the "other" basin proxy instead.
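+
+    # For reference, enverus_df_to_gdf() is also imported from
+    # ng_oil_production_utils.py and its body is not part of this patch. Given how its
+    # output is used (a GeoDataFrame written straight to parquet), it presumably builds
+    # point geometries from the lowercased latitude/longitude columns, roughly like
+    # this hypothetical, never-called sketch:
+    def _sketch_enverus_df_to_gdf(df: pd.DataFrame) -> gpd.GeoDataFrame:
+        return gpd.GeoDataFrame(
+            df.drop(columns=["latitude", "longitude"]),
+            geometry=gpd.points_from_xy(df["longitude"], df["latitude"]),
+            crs=4326,  # assuming Enverus coordinates are WGS84 lat/lon
+        )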
+ + # Check that relative emissions sum to 1.0 each state/year combination + sums = basin_360_prod_df.groupby(["state_code", "year"])["rel_emi"].sum() # get sums to check normalization + assert np.isclose(sums, 1.0, atol=1e-8).all(), f"Relative emissions do not sum to 1 for each year and state; {sums}" # assert that the sums are close to 1 + + # Output Proxy Parquet Files + basin_360_prod_df = basin_360_prod_df.astype({'year':str}) + basin_360_prod_df.to_parquet(basin_360_prod_output_path) + + return None diff --git a/gch4i/proxy_processing/task_oil_basin_395_prod_proxy.py b/gch4i/proxy_processing/task_oil_basin_395_prod_proxy.py new file mode 100644 index 0000000..c5bea89 --- /dev/null +++ b/gch4i/proxy_processing/task_oil_basin_395_prod_proxy.py @@ -0,0 +1,154 @@ +# %% +from pathlib import Path +import os +from typing import Annotated +from zipfile import ZipFile +import calendar +import datetime + +from pyarrow import parquet +import pandas as pd +import osgeo +import geopandas as gpd +import numpy as np +import seaborn as sns +import shapefile as shp +from pytask import Product, task, mark + +from gch4i.config import ( + V3_DATA_PATH, + proxy_data_dir_path, + global_data_dir_path, + sector_data_dir_path, + max_year, + min_year, + years, +) + +from gch4i.utils import us_state_to_abbrev +from gch4i.proxy_processing.ng_oil_production_utils import ( + calc_enverus_rel_emi, + enverus_df_to_gdf, +) + +# %% +@mark.persist +@task(id="oil_basin_395_prod_proxy") +def task_get_oil_basin_395_prod_proxy_data( + state_path: Path = global_data_dir_path / "tl_2020_us_state.zip", + enverus_production_path: Path = sector_data_dir_path / "enverus/production", + intermediate_outputs_path: Path = enverus_production_path / "intermediate_outputs", + basin_395_prod_output_path: Annotated[Path, Product] = proxy_data_dir_path / "oil_basin_395_prod_proxy.parquet", + ): + """ + Data come from Enverus, both Drilling Info and Prism + The reason 2 datasets are used is because Prism does not include all states + So remaining states, or those with more DI coverage are taken from DI + + DI: KS, MD, MI, MO, OK, TN + + Prism: AK, AL, AR, AZ, CA, CAO (California Offshore), CO, FL, KY, LA, MS, MT, ND, + NE, NGOM (federal offshore waters in the Gulf of Mexico), NM, NV, NY, OH, OR, PA, + SD, TX, UT, VA, WV, WY + + States with no Enverus Data: CT, DE, DC, GA, HI, ID, IL*, IN*, IA, ME, MA, MN, NH, + NJ, NC, RI, SC, VT, WA, WI, US territories. These states are assumed to have no oil + and gas production with an exception for IL and IN. + + *IL and IN do not report to Enverus, but do have oil and gas production. Production + data is taken from the Energy Information Administration (EIA). + + """ + + # Load in State ANSI data + state_gdf = ( + gpd.read_file(state_path) + .loc[:, ["NAME", "STATEFP", "STUSPS", "geometry"]] + .rename(columns=str.lower) + .rename(columns={"stusps": "state_code", "name": "state_name"}) + .astype({"statefp": int}) + # get only lower 48 + DC + .query("(statefp < 60) & (statefp != 2) & (statefp != 15)") + .reset_index(drop=True) + .to_crs(4326) + ) + + # Make Annual gridded arrays (maps) of well data (a well will be counted every month if there is any production that year) + # Includes NA Gas Wells and Production onshore in the CONUS region + # source emissions are related to the presence of a well and its production status (no emission if no production) + # Details: ERG does not include a well in the national count if there is no (cummulative) oil or gas production from that well. 
+    # Wells are not considered active for a given year if there is no production data that year.
+    # This may cause wells that are completed but not yet producing to be dropped from the national count.
+    # ERG has developed their own logic to determine whether a well is an HF well, and that result is included in the
+    # HF variable in this dataset. This method does not rely on the Enverus well 'Producing Status'.
+    # Well type (e.g., non-associated oil well) is determined from the annual production GOR at that well (CUM GAS / CUM OIL),
+    # but the presence of a well is only included in maps for months where monthly oil production > 0.
+
+    # Proxy Data Dataframes:
+    basin_395_prod_df = pd.DataFrame()  # Oil well oil production in Basin 395 in a given month
+
+    ## Enverus DI and Prism Data:
+    # Read in and query formatted and corrected Enverus data to create dictionaries of
+    # proxy data (Enverus data is from task_enverus_di_prism_data_processing.py)
+    for iyear in years:
+        enverus_file_name_iyear = f"formatted_raw_enverus_tempoutput_{iyear}.csv"
+        enverus_file_path_iyear = os.path.join(intermediate_outputs_path, enverus_file_name_iyear)
+        oil_data_temp = (pd.read_csv(enverus_file_path_iyear, dtype={3: 'str', 'spud_year': str, 'first_prod_year': str})
+                         .query("STATE_CODE.isin(@state_gdf['state_code'])")
+                         .query("OFFSHORE == 'N'")
+                         .query("CUM_OIL > 0")
+                         .assign(gas_to_oil_ratio=lambda df: df['CUM_GAS']/df['CUM_OIL'])
+                         .assign(year=str(iyear))
+                         .replace(np.inf, 0)
+                         .astype({"spud_year": str, "first_prod_year": str})
+                         .query("gas_to_oil_ratio <= 100")
+                         .query("GOR_QUAL == 'Liq only' | GOR_QUAL == 'Liq+Gas'")
+                         )
+
+        # Include wells in the map only for months with oil production (emissions occur only while production is occurring)
+        for imonth in range(1, 13):
+            imonth_str = f"{imonth:02}"  # convert to 2-digit months
+            year_month_str = str(iyear)+'-'+imonth_str
+            oil_prod_str = 'OILPROD_'+imonth_str
+            # Onshore data for imonth
+            oil_data_imonth_temp = (oil_data_temp
+                                    .query(f"{oil_prod_str} > 0")
+                                    .assign(year_month=str(iyear)+'-'+imonth_str)
+                                    )
+            oil_data_imonth_temp = (oil_data_imonth_temp[[
+                'year', 'year_month', 'STATE_CODE', 'AAPG_CODE_ERG', 'LATITUDE', 'LONGITUDE',
+                'HF', 'WELL_COUNT', oil_prod_str,
+                'comp_year_month', 'spud_year', 'first_prod_year']]
+                )
+            # Basin 395 Oil Production
+            basin_395_prod_imonth = (oil_data_imonth_temp[['year', 'year_month', 'STATE_CODE', 'LATITUDE', 'LONGITUDE', 'AAPG_CODE_ERG', oil_prod_str]]
+                                     .query("AAPG_CODE_ERG == '395'")
+                                     .assign(proxy_data=lambda df: df[oil_prod_str])
+                                     .drop(columns=[oil_prod_str, 'AAPG_CODE_ERG'])
+                                     .rename(columns=lambda x: str(x).lower())
+                                     .reset_index(drop=True)
+                                     )
+            basin_395_prod_df = pd.concat([basin_395_prod_df, basin_395_prod_imonth])
+
+    # Delete unused temp data
+    del oil_data_temp
+    del oil_data_imonth_temp
+    del basin_395_prod_imonth
+
+    # Calculate relative emissions and convert to a geodataframe
+    basin_395_prod_df = calc_enverus_rel_emi(basin_395_prod_df)
+    basin_395_prod_df = enverus_df_to_gdf(basin_395_prod_df)
+
+    # NEI Data:
+    # No addition of NEI data because IL and IN are not in this basin; their EIA-based
+    # records are added to the "other" basin proxy instead.
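+
+    # For reference, a worked example of the gas_to_oil_ratio screen applied when the
+    # Enverus data are read in above (units assumed to be Enverus' defaults, Mcf of
+    # gas and bbl of oil; the variable below is illustrative only):
+    _gor_example = pd.DataFrame({"CUM_GAS": [5_000.0, 50_000.0], "CUM_OIL": [200.0, 200.0]})
+    _gor_example["gas_to_oil_ratio"] = _gor_example["CUM_GAS"] / _gor_example["CUM_OIL"]
+    # ratios come out to [25.0, 250.0]: the first well passes the <= 100 screen and is
+    # kept as an oil well; the second is treated as a gas well and excluded here.
+    del _gor_example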
+ + # Check that relative emissions sum to 1.0 each state/year combination + sums = basin_395_prod_df.groupby(["state_code", "year"])["rel_emi"].sum() # get sums to check normalization + assert np.isclose(sums, 1.0, atol=1e-8).all(), f"Relative emissions do not sum to 1 for each year and state; {sums}" # assert that the sums are close to 1 + + # Output Proxy Parquet Files + basin_395_prod_df = basin_395_prod_df.astype({'year':str}) + basin_395_prod_df.to_parquet(basin_395_prod_output_path) + + return None diff --git a/gch4i/proxy_processing/task_oil_basin_430_prod_proxy.py b/gch4i/proxy_processing/task_oil_basin_430_prod_proxy.py new file mode 100644 index 0000000..f5083d6 --- /dev/null +++ b/gch4i/proxy_processing/task_oil_basin_430_prod_proxy.py @@ -0,0 +1,154 @@ +# %% +from pathlib import Path +import os +from typing import Annotated +from zipfile import ZipFile +import calendar +import datetime + +from pyarrow import parquet +import pandas as pd +import osgeo +import geopandas as gpd +import numpy as np +import seaborn as sns +import shapefile as shp +from pytask import Product, task, mark + +from gch4i.config import ( + V3_DATA_PATH, + proxy_data_dir_path, + global_data_dir_path, + sector_data_dir_path, + max_year, + min_year, + years, +) + +from gch4i.utils import us_state_to_abbrev +from gch4i.proxy_processing.ng_oil_production_utils import ( + calc_enverus_rel_emi, + enverus_df_to_gdf, +) + +# %% +@mark.persist +@task(id="oil_basin_430_prod_proxy") +def task_get_oil_basin_430_prod_proxy_data( + state_path: Path = global_data_dir_path / "tl_2020_us_state.zip", + enverus_production_path: Path = sector_data_dir_path / "enverus/production", + intermediate_outputs_path: Path = enverus_production_path / "intermediate_outputs", + basin_430_prod_output_path: Annotated[Path, Product] = proxy_data_dir_path / "oil_basin_430_prod_proxy.parquet", + ): + """ + Data come from Enverus, both Drilling Info and Prism + The reason 2 datasets are used is because Prism does not include all states + So remaining states, or those with more DI coverage are taken from DI + + DI: KS, MD, MI, MO, OK, TN + + Prism: AK, AL, AR, AZ, CA, CAO (California Offshore), CO, FL, KY, LA, MS, MT, ND, + NE, NGOM (federal offshore waters in the Gulf of Mexico), NM, NV, NY, OH, OR, PA, + SD, TX, UT, VA, WV, WY + + States with no Enverus Data: CT, DE, DC, GA, HI, ID, IL*, IN*, IA, ME, MA, MN, NH, + NJ, NC, RI, SC, VT, WA, WI, US territories. These states are assumed to have no oil + and gas production with an exception for IL and IN. + + *IL and IN do not report to Enverus, but do have oil and gas production. Production + data is taken from the Energy Information Administration (EIA). + + """ + + # Load in State ANSI data + state_gdf = ( + gpd.read_file(state_path) + .loc[:, ["NAME", "STATEFP", "STUSPS", "geometry"]] + .rename(columns=str.lower) + .rename(columns={"stusps": "state_code", "name": "state_name"}) + .astype({"statefp": int}) + # get only lower 48 + DC + .query("(statefp < 60) & (statefp != 2) & (statefp != 15)") + .reset_index(drop=True) + .to_crs(4326) + ) + + # Make Annual gridded arrays (maps) of well data (a well will be counted every month if there is any production that year) + # Includes NA Gas Wells and Production onshore in the CONUS region + # source emissions are related to the presence of a well and its production status (no emission if no production) + # Details: ERG does not include a well in the national count if there is no (cummulative) oil or gas production from that well. 
+    # Wells are not considered active for a given year if there is no production data that year.
+    # This may cause wells that are completed but not yet producing to be dropped from the national count.
+    # ERG has developed their own logic to determine whether a well is an HF well, and that result is included in the
+    # HF variable in this dataset. This method does not rely on the Enverus well 'Producing Status'.
+    # Well type (e.g., non-associated oil well) is determined from the annual production GOR at that well (CUM GAS / CUM OIL),
+    # but the presence of a well is only included in maps for months where monthly oil production > 0.
+
+    # Proxy Data Dataframes:
+    basin_430_prod_df = pd.DataFrame()  # Oil well oil production in Basin 430 in a given month
+
+    ## Enverus DI and Prism Data:
+    # Read in and query formatted and corrected Enverus data to create dictionaries of
+    # proxy data (Enverus data is from task_enverus_di_prism_data_processing.py)
+    for iyear in years:
+        enverus_file_name_iyear = f"formatted_raw_enverus_tempoutput_{iyear}.csv"
+        enverus_file_path_iyear = os.path.join(intermediate_outputs_path, enverus_file_name_iyear)
+        oil_data_temp = (pd.read_csv(enverus_file_path_iyear, dtype={3: 'str', 'spud_year': str, 'first_prod_year': str})
+                         .query("STATE_CODE.isin(@state_gdf['state_code'])")
+                         .query("OFFSHORE == 'N'")
+                         .query("CUM_OIL > 0")
+                         .assign(gas_to_oil_ratio=lambda df: df['CUM_GAS']/df['CUM_OIL'])
+                         .assign(year=str(iyear))
+                         .replace(np.inf, 0)
+                         .astype({"spud_year": str, "first_prod_year": str})
+                         .query("gas_to_oil_ratio <= 100")
+                         .query("GOR_QUAL == 'Liq only' | GOR_QUAL == 'Liq+Gas'")
+                         )
+
+        # Include wells in the map only for months with oil production (emissions occur only while production is occurring)
+        for imonth in range(1, 13):
+            imonth_str = f"{imonth:02}"  # convert to 2-digit months
+            year_month_str = str(iyear)+'-'+imonth_str
+            oil_prod_str = 'OILPROD_'+imonth_str
+            # Onshore data for imonth
+            oil_data_imonth_temp = (oil_data_temp
+                                    .query(f"{oil_prod_str} > 0")
+                                    .assign(year_month=str(iyear)+'-'+imonth_str)
+                                    )
+            oil_data_imonth_temp = (oil_data_imonth_temp[[
+                'year', 'year_month', 'STATE_CODE', 'AAPG_CODE_ERG', 'LATITUDE', 'LONGITUDE',
+                'HF', 'WELL_COUNT', oil_prod_str,
+                'comp_year_month', 'spud_year', 'first_prod_year']]
+                )
+            # Basin 430 Oil Production
+            basin_430_prod_imonth = (oil_data_imonth_temp[['year', 'year_month', 'STATE_CODE', 'LATITUDE', 'LONGITUDE', 'AAPG_CODE_ERG', oil_prod_str]]
+                                     .query("AAPG_CODE_ERG == '430'")
+                                     .assign(proxy_data=lambda df: df[oil_prod_str])
+                                     .drop(columns=[oil_prod_str, 'AAPG_CODE_ERG'])
+                                     .rename(columns=lambda x: str(x).lower())
+                                     .reset_index(drop=True)
+                                     )
+            basin_430_prod_df = pd.concat([basin_430_prod_df, basin_430_prod_imonth])
+
+    # Delete unused temp data
+    del oil_data_temp
+    del oil_data_imonth_temp
+    del basin_430_prod_imonth
+
+    # Calculate relative emissions and convert to a geodataframe
+    basin_430_prod_df = calc_enverus_rel_emi(basin_430_prod_df)
+    basin_430_prod_df = enverus_df_to_gdf(basin_430_prod_df)
+
+    # NEI Data:
+    # No addition of NEI data because IL and IN are not in this basin; their EIA-based
+    # records are added to the "other" basin proxy instead.
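+
+    # For reference, the basin proxies partition wells by AAPG basin code: 220, 360,
+    # 395, and 430 each get a dedicated proxy, and every other code falls into the
+    # "other" basin proxy, so each well-month lands in exactly one output file. A
+    # hypothetical, never-called helper that routes a code to its bucket:
+    def _sketch_basin_bucket(aapg_code: str) -> str:
+        named_basins = {"220", "360", "395", "430"}
+        return f"basin_{aapg_code}" if aapg_code in named_basins else "basin_other"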
+ + # Check that relative emissions sum to 1.0 each state/year combination + sums = basin_430_prod_df.groupby(["state_code", "year"])["rel_emi"].sum() # get sums to check normalization + assert np.isclose(sums, 1.0, atol=1e-8).all(), f"Relative emissions do not sum to 1 for each year and state; {sums}" # assert that the sums are close to 1 + + # Output Proxy Parquet Files + basin_430_prod_df = basin_430_prod_df.astype({'year':str}) + basin_430_prod_df.to_parquet(basin_430_prod_output_path) + + return None diff --git a/gch4i/proxy_processing/task_oil_basin_other_prod_proxy.py b/gch4i/proxy_processing/task_oil_basin_other_prod_proxy.py new file mode 100644 index 0000000..e7fde25 --- /dev/null +++ b/gch4i/proxy_processing/task_oil_basin_other_prod_proxy.py @@ -0,0 +1,178 @@ +# %% +from pathlib import Path +import os +from typing import Annotated +from zipfile import ZipFile +import calendar +import datetime + +from pyarrow import parquet +import pandas as pd +import osgeo +import geopandas as gpd +import numpy as np +import seaborn as sns +import shapefile as shp +from pytask import Product, task, mark + +from gch4i.config import ( + V3_DATA_PATH, + proxy_data_dir_path, + global_data_dir_path, + sector_data_dir_path, + max_year, + min_year, + years, +) + +from gch4i.utils import us_state_to_abbrev +from gch4i.proxy_processing.ng_oil_production_utils import ( + calc_enverus_rel_emi, + enverus_df_to_gdf, + nei_data_years, + get_nei_file_name, + oil_oil_prod_file_names, + get_raw_NEI_data, +) + +# %% +@mark.persist +@task(id="oil_basin_other_prod_proxy") +def task_get_oil_basin_other_prod_proxy_data( + state_path: Path = global_data_dir_path / "tl_2020_us_state.zip", + enverus_production_path: Path = sector_data_dir_path / "enverus/production", + intermediate_outputs_path: Path = enverus_production_path / "intermediate_outputs", + nei_path: Path = sector_data_dir_path / "nei_og", + basin_other_prod_output_path: Annotated[Path, Product] = proxy_data_dir_path / "oil_basin_other_prod_proxy.parquet", + ): + """ + Data come from Enverus, both Drilling Info and Prism + The reason 2 datasets are used is because Prism does not include all states + So remaining states, or those with more DI coverage are taken from DI + + DI: KS, MD, MI, MO, OK, TN + + Prism: AK, AL, AR, AZ, CA, CAO (California Offshore), CO, FL, KY, LA, MS, MT, ND, + NE, NGOM (federal offshore waters in the Gulf of Mexico), NM, NV, NY, OH, OR, PA, + SD, TX, UT, VA, WV, WY + + States with no Enverus Data: CT, DE, DC, GA, HI, ID, IL*, IN*, IA, ME, MA, MN, NH, + NJ, NC, RI, SC, VT, WA, WI, US territories. These states are assumed to have no oil + and gas production with an exception for IL and IN. + + *IL and IN do not report to Enverus, but do have oil and gas production. Production + data is taken from the Energy Information Administration (EIA). 
+ + """ + + # Load in State ANSI data + state_gdf = ( + gpd.read_file(state_path) + .loc[:, ["NAME", "STATEFP", "STUSPS", "geometry"]] + .rename(columns=str.lower) + .rename(columns={"stusps": "state_code", "name": "state_name"}) + .astype({"statefp": int}) + # get only lower 48 + DC + .query("(statefp < 60) & (statefp != 2) & (statefp != 15)") + .reset_index(drop=True) + .to_crs(4326) + ) + + # Make Annual gridded arrays (maps) of well data (a well will be counted every month if there is any production that year) + # Includes NA Gas Wells and Production onshore in the CONUS region + # source emissions are related to the presence of a well and its production status (no emission if no production) + # Details: ERG does not include a well in the national count if there is no (cummulative) oil or gas production from that well. + # Wells are not considered active for a given year if there is no production data that year + # This may cause wells that are coadmpleted but not yet producing to be dropped from the national count. + # ERG has developed their own logic to determine if a well is an HF well or not and that result is included in the + # HF variable in this dataset. This method does not rely on the Enverus well 'Producing Status' + # Well Type (e.g., non-associated gas well) is determined based on annual production GOR at that well (CUM OIL/ CUM GAS), + # but the presence of a well will only be included in maps in months where monthly gas prod > 0 + + # Proxy Data Dataframes: + basin_other_prod_df = pd.DataFrame() # Oil well oil production in Other Basins in a given month + + ## Enverus DI and Prism Data: + # Read in and query formatted and corrrected Enverus data to create dictionaries of + # proxy data (Enverus data is from task_enverus_di_prism_data_processing.py) + for iyear in years: + enverus_file_name_iyear = f"formatted_raw_enverus_tempoutput_{iyear}.csv" + enverus_file_path_iyear = os.path.join(intermediate_outputs_path, enverus_file_name_iyear) + oil_data_temp = (pd.read_csv(enverus_file_path_iyear, dtype={3:'str', 'spud_year': str, 'first_prod_year': str}) + .query("STATE_CODE.isin(@state_gdf['state_code'])") + .query("OFFSHORE == 'N'") + .query("CUM_OIL > 0") + .assign(gas_to_oil_ratio=lambda df: df['CUM_GAS']/df['CUM_OIL']) + .assign(year=str(iyear)) + .replace(np.inf, 0) + .astype({"spud_year": str, "first_prod_year": str}) + .query("gas_to_oil_ratio <= 100") + .query("GOR_QUAL == 'Liq only' | GOR_QUAL == 'Liq+Gas'") + ) + + # Include wells in map only for months where there is gas production (emissions ~ when production is occuring) + for imonth in range(1,13): + imonth_str = f"{imonth:02}" # convert to 2-digit months + year_month_str = str(iyear)+'-'+imonth_str + oil_prod_str = 'OILPROD_'+imonth_str + # Onshore data for imonth + oil_data_imonth_temp = (oil_data_temp + .query(f"{oil_prod_str} > 0") + .assign(year_month=str(iyear)+'-'+imonth_str) + ) + oil_data_imonth_temp = (oil_data_imonth_temp[[ + 'year', 'year_month','STATE_CODE','AAPG_CODE_ERG','LATITUDE','LONGITUDE', + 'HF','WELL_COUNT',oil_prod_str, + 'comp_year_month','spud_year','first_prod_year']] + ) + # "Other" Basin Gas Production + basin_other_prod_imonth = (oil_data_imonth_temp[['year','year_month','STATE_CODE','LATITUDE','LONGITUDE','AAPG_CODE_ERG',oil_prod_str]] + .query("AAPG_CODE_ERG != '220' & AAPG_CODE_ERG != '360' & AAPG_CODE_ERG != '395' & AAPG_CODE_ERG != '430'") + .assign(proxy_data=lambda df: df[oil_prod_str]) + .drop(columns=[oil_prod_str, 'AAPG_CODE_ERG']) + .rename(columns=lambda x: 
str(x).lower()) + .reset_index(drop=True) + ) + basin_other_prod_df = pd.concat([basin_other_prod_df,basin_other_prod_imonth]) + + # Delete unused temp data + del oil_data_temp + del oil_data_imonth_temp + del basin_other_prod_imonth + + # Calculate relative emissions and convert to a geodataframe + basin_other_prod_df = calc_enverus_rel_emi(basin_other_prod_df) + basin_other_prod_df = enverus_df_to_gdf(basin_other_prod_df) + + # NEI Data: + nei_df = pd.DataFrame() + + for iyear in years: + nei_data_year = nei_data_years[nei_data_years['year'] == iyear]['nei_data'].values[0] + # Gas Production + ifile_name = get_nei_file_name(nei_data_year, oil_oil_prod_file_names) + nei_iyear = get_raw_NEI_data(iyear, nei_data_year, ifile_name) + nei_df = pd.concat([nei_df, nei_iyear]) + + # Convert NEI Data to GDF and polygon to centroid point + nei_df = gpd.GeoDataFrame(nei_df, crs=4326) + nei_df = nei_df.to_crs(3857) # projected CRS for centroid calculation + nei_df.loc[:, 'geometry'] = nei_df.loc[:, 'geometry'].centroid + nei_df = nei_df.to_crs(4326) + + # Add NEI Data to Enverus Data + basin_other_prod_df = pd.concat([basin_other_prod_df, nei_df]).reset_index(drop=True) + + # Delete unused temp data + del nei_iyear + del nei_df + + # Check that relative emissions sum to 1.0 each state/year combination + sums = basin_other_prod_df.groupby(["state_code", "year"])["rel_emi"].sum() # get sums to check normalization + assert np.isclose(sums, 1.0, atol=1e-8).all(), f"Relative emissions do not sum to 1 for each year and state; {sums}" # assert that the sums are close to 1 + + # Output Proxy Parquet Files + basin_other_prod_df = basin_other_prod_df.astype({'year':str}) + basin_other_prod_df.to_parquet(basin_other_prod_output_path) + + return None diff --git a/gch4i/proxy_processing/task_oil_conv_well_comp_proxy.py b/gch4i/proxy_processing/task_oil_conv_well_comp_proxy.py new file mode 100644 index 0000000..6129357 --- /dev/null +++ b/gch4i/proxy_processing/task_oil_conv_well_comp_proxy.py @@ -0,0 +1,180 @@ +# %% +from pathlib import Path +import os +from typing import Annotated +from zipfile import ZipFile +import calendar +import datetime + +from pyarrow import parquet +import pandas as pd +import osgeo +import geopandas as gpd +import numpy as np +import seaborn as sns +import shapefile as shp +from pytask import Product, task, mark + +from gch4i.config import ( + V3_DATA_PATH, + proxy_data_dir_path, + global_data_dir_path, + sector_data_dir_path, + max_year, + min_year, + years, +) + +from gch4i.utils import us_state_to_abbrev +from gch4i.proxy_processing.ng_oil_production_utils import ( + calc_enverus_rel_emi, + enverus_df_to_gdf, + nei_data_years, + get_nei_file_name, + oil_comp_count_file_names, + get_raw_NEI_data, +) + +# %% +@mark.persist +@task(id="oil_conv_well_comp_proxy") +def task_get_oil_conv_well_comp_proxy_data( + state_path: Path = global_data_dir_path / "tl_2020_us_state.zip", + enverus_production_path: Path = sector_data_dir_path / "enverus/production", + intermediate_outputs_path: Path = enverus_production_path / "intermediate_outputs", + nei_path: Path = sector_data_dir_path / "nei_og", + conv_well_comp_output_path: Annotated[Path, Product] = proxy_data_dir_path / "oil_conv_well_comp_proxy.parquet", + ): + """ + Data come from Enverus, both Drilling Info and Prism + The reason 2 datasets are used is because Prism does not include all states + So remaining states, or those with more DI coverage are taken from DI + + DI: KS, MD, MI, MO, OK, TN + + Prism: AK, AL, AR, AZ, CA, CAO (California 
Offshore), CO, FL, KY, LA, MS, MT, ND, + NE, NGOM (federal offshore waters in the Gulf of Mexico), NM, NV, NY, OH, OR, PA, + SD, TX, UT, VA, WV, WY + + States with no Enverus Data: CT, DE, DC, GA, HI, ID, IL*, IN*, IA, ME, MA, MN, NH, + NJ, NC, RI, SC, VT, WA, WI, US territories. These states are assumed to have no oil + and gas production with an exception for IL and IN. + + *IL and IN do not report to Enverus, but do have oil and gas production. Production + data is taken from the Energy Information Administration (EIA). + + """ + + # Load in State ANSI data + state_gdf = ( + gpd.read_file(state_path) + .loc[:, ["NAME", "STATEFP", "STUSPS", "geometry"]] + .rename(columns=str.lower) + .rename(columns={"stusps": "state_code", "name": "state_name"}) + .astype({"statefp": int}) + # get only lower 48 + DC + .query("(statefp < 60) & (statefp != 2) & (statefp != 15)") + .reset_index(drop=True) + .to_crs(4326) + ) + + # Make Annual gridded arrays (maps) of well data (a well will be counted every month if there is any production that year) + # Includes NA Gas Wells and Production onshore in the CONUS region + # source emissions are related to the presence of a well and its production status (no emission if no production) + # Details: ERG does not include a well in the national count if there is no (cummulative) oil or gas production from that well. + # Wells are not considered active for a given year if there is no production data that year + # This may cause wells that are coadmpleted but not yet producing to be dropped from the national count. + # ERG has developed their own logic to determine if a well is an HF well or not and that result is included in the + # HF variable in this dataset. This method does not rely on the Enverus well 'Producing Status' + # Well Type (e.g., non-associated gas well) is determined based on annual production GOR at that well (CUM OIL/ CUM GAS), + # but the presence of a well will only be included in maps in months where monthly gas prod > 0 + + # Proxy Data Dataframes: + conv_well_comp_df = pd.DataFrame() # Conventional well completions + + ## Enverus DI and Prism Data: + # Read in and query formatted and corrrected Enverus data to create dictionaries of + # proxy data (Enverus data is from task_enverus_di_prism_data_processing.py) + for iyear in years: + enverus_file_name_iyear = f"formatted_raw_enverus_tempoutput_{iyear}.csv" + enverus_file_path_iyear = os.path.join(intermediate_outputs_path, enverus_file_name_iyear) + oil_data_temp = (pd.read_csv(enverus_file_path_iyear, dtype={3:'str', 'spud_year': str, 'first_prod_year': str}) + .query("STATE_CODE.isin(@state_gdf['state_code'])") + .query("OFFSHORE == 'N'") + .query("CUM_OIL > 0") + .assign(gas_to_oil_ratio=lambda df: df['CUM_GAS']/df['CUM_OIL']) + .assign(year=str(iyear)) + .replace(np.inf, 0) + .astype({"spud_year": str, "first_prod_year": str}) + .query("gas_to_oil_ratio <= 100") + .query("GOR_QUAL == 'Liq only' | GOR_QUAL == 'Liq+Gas'") + ) + + # Include wells in map only for months where there is gas production (emissions ~ when production is occuring) + for imonth in range(1,13): + imonth_str = f"{imonth:02}" # convert to 2-digit months + year_month_str = str(iyear)+'-'+imonth_str + oil_prod_str = 'OILPROD_'+imonth_str + # Onshore data for imonth + oil_data_imonth_temp = (oil_data_temp + .query(f"{oil_prod_str} > 0") + .assign(year_month=str(iyear)+'-'+imonth_str) + ) + oil_data_imonth_temp = (oil_data_imonth_temp[[ + 'year', 'year_month','STATE_CODE','AAPG_CODE_ERG','LATITUDE','LONGITUDE', + 
'HF','WELL_COUNT',oil_prod_str, + 'comp_year_month','spud_year','first_prod_year']] + ) + # Conventional Well Completions + conv_well_comp_imonth = (oil_data_imonth_temp[['year','year_month','STATE_CODE','LATITUDE','LONGITUDE','WELL_COUNT','HF','comp_year_month']] + .query("HF != 'Y'") + .drop(columns=["HF"]) + .rename(columns=lambda x: str(x).lower()) + .rename(columns={"well_count":"proxy_data"}) + .query(f"comp_year_month == '{year_month_str}'") + .drop(columns=["comp_year_month"]) + .reset_index(drop=True) + ) + conv_well_comp_df = pd.concat([conv_well_comp_df,conv_well_comp_imonth]) + + # Delete unused temp data + del oil_data_temp + del oil_data_imonth_temp + del conv_well_comp_imonth + + # Calculate relative emissions and convert to a geodataframe + conv_well_comp_df = calc_enverus_rel_emi(conv_well_comp_df ) + conv_well_comp_df = enverus_df_to_gdf(conv_well_comp_df ) + + # NEI Data: + nei_df = pd.DataFrame() + + for iyear in years: + nei_data_year = nei_data_years[nei_data_years['year'] == iyear]['nei_data'].values[0] + # Well Count + ifile_name = get_nei_file_name(nei_data_year, oil_comp_count_file_names) + nei_iyear = get_raw_NEI_data(iyear, nei_data_year, ifile_name) + nei_df = pd.concat([nei_df, nei_iyear]) + + # Convert NEI Data to GDF and polygon to centroid point + nei_df = gpd.GeoDataFrame(nei_df, crs=4326) + nei_df = nei_df.to_crs(3857) # projected CRS for centroid calculation + nei_df.loc[:, 'geometry'] = nei_df.loc[:, 'geometry'].centroid + nei_df = nei_df.to_crs(4326) + + # Add NEI Data to Enverus Data + conv_well_comp_df = pd.concat([conv_well_comp_df, nei_df]).reset_index(drop=True) + + # Delete unused temp data + del nei_iyear + del nei_df + + # Check that relative emissions sum to 1.0 each state/year combination + sums = conv_well_comp_df.groupby(["state_code", "year"])["rel_emi"].sum() # get sums to check normalization + assert np.isclose(sums, 1.0, atol=1e-8).all(), f"Relative emissions do not sum to 1 for each year and state; {sums}" # assert that the sums are close to 1 + + # Output Proxy Parquet Files + conv_well_comp_df = conv_well_comp_df.astype({'year':str}) + conv_well_comp_df.to_parquet(conv_well_comp_output_path) + + return None diff --git a/gch4i/proxy_processing/task_oil_conv_well_count_proxy.py b/gch4i/proxy_processing/task_oil_conv_well_count_proxy.py new file mode 100644 index 0000000..52fd54a --- /dev/null +++ b/gch4i/proxy_processing/task_oil_conv_well_count_proxy.py @@ -0,0 +1,177 @@ +# %% +from pathlib import Path +import os +from typing import Annotated +from zipfile import ZipFile +import calendar +import datetime + +from pyarrow import parquet +import pandas as pd +import osgeo +import geopandas as gpd +import numpy as np +import seaborn as sns +import shapefile as shp +from pytask import Product, task, mark + +from gch4i.config import ( + V3_DATA_PATH, + proxy_data_dir_path, + global_data_dir_path, + sector_data_dir_path, + max_year, + min_year, + years, +) + +from gch4i.utils import us_state_to_abbrev +from gch4i.proxy_processing.ng_oil_production_utils import ( + calc_enverus_rel_emi, + enverus_df_to_gdf, + nei_data_years, + get_nei_file_name, + oil_well_count_file_names, + get_raw_NEI_data, +) + +# %% +@mark.persist +@task(id="oil_conv_well_count_proxy") +def task_get_oil_conv_well_count_proxy_data( + state_path: Path = global_data_dir_path / "tl_2020_us_state.zip", + enverus_production_path: Path = sector_data_dir_path / "enverus/production", + intermediate_outputs_path: Path = enverus_production_path / "intermediate_outputs", + nei_path: 
Path = sector_data_dir_path / "nei_og", + conv_well_count_output_path: Annotated[Path, Product] = proxy_data_dir_path / "oil_conv_well_count_proxy.parquet", + ): + """ + Data come from Enverus, both Drilling Info and Prism + The reason 2 datasets are used is because Prism does not include all states + So remaining states, or those with more DI coverage are taken from DI + + DI: KS, MD, MI, MO, OK, TN + + Prism: AK, AL, AR, AZ, CA, CAO (California Offshore), CO, FL, KY, LA, MS, MT, ND, + NE, NGOM (federal offshore waters in the Gulf of Mexico), NM, NV, NY, OH, OR, PA, + SD, TX, UT, VA, WV, WY + + States with no Enverus Data: CT, DE, DC, GA, HI, ID, IL*, IN*, IA, ME, MA, MN, NH, + NJ, NC, RI, SC, VT, WA, WI, US territories. These states are assumed to have no oil + and gas production with an exception for IL and IN. + + *IL and IN do not report to Enverus, but do have oil and gas production. Production + data is taken from the Energy Information Administration (EIA). + + """ + + # Load in State ANSI data + state_gdf = ( + gpd.read_file(state_path) + .loc[:, ["NAME", "STATEFP", "STUSPS", "geometry"]] + .rename(columns=str.lower) + .rename(columns={"stusps": "state_code", "name": "state_name"}) + .astype({"statefp": int}) + # get only lower 48 + DC + .query("(statefp < 60) & (statefp != 2) & (statefp != 15)") + .reset_index(drop=True) + .to_crs(4326) + ) + + # Make Annual gridded arrays (maps) of well data (a well will be counted every month if there is any production that year) + # Includes NA Wells and Production onshore in the CONUS region + # source emissions are related to the presence of a well and its production status (no emission if no production) + # Details: ERG does not include a well in the national count if there is no (cummulative) oil or gas production from that well. + # Wells are not considered active for a given year if there is no production data that year + # This may cause wells that are coadmpleted but not yet producing to be dropped from the national count. + # ERG has developed their own logic to determine if a well is an HF well or not and that result is included in the + # HF variable in this dataset. 
This method does not rely on the Enverus well 'Producing Status' + # Well Type (e.g., non-associated oil well) is determined based on annual production GOR at that well (CUM OIL/ CUM GAS), + # but the presence of a well will only be included in maps in months where monthly oil or ng prod > 0 + + # Proxy Data Dataframes: + conv_well_count_df = pd.DataFrame() # Active conventional well counts in a given month + + # Read in and query formatted and corrrected Enverus data to create dictionaries of + # proxy data (Enverus data is from task_enverus_di_prism_data_processing.py) + for iyear in years: + enverus_file_name_iyear = f"formatted_raw_enverus_tempoutput_{iyear}.csv" + enverus_file_path_iyear = os.path.join(intermediate_outputs_path, enverus_file_name_iyear) + oil_data_temp = (pd.read_csv(enverus_file_path_iyear, dtype={3:'str', 'spud_year': str, 'first_prod_year': str}) + .query("STATE_CODE.isin(@state_gdf['state_code'])") + .query("OFFSHORE == 'N'") + .query("CUM_OIL > 0") + .assign(gas_to_oil_ratio=lambda df: df['CUM_GAS']/df['CUM_OIL']) + .assign(year=str(iyear)) + .replace(np.inf, 0) + .astype({"spud_year": str, "first_prod_year": str}) + .query("gas_to_oil_ratio <= 100") + .query("GOR_QUAL == 'Liq only' | GOR_QUAL == 'Liq+Gas'") + ) + + # Include wells in map only for months where there is oil production (emissions ~ when production is occuring) + for imonth in range(1,13): + imonth_str = f"{imonth:02}" # convert to 2-digit months + year_month_str = str(iyear)+'-'+imonth_str + oil_prod_str = 'OILPROD_'+imonth_str + # Onshore data for imonth + oil_data_imonth_temp = (oil_data_temp + .query(f"{oil_prod_str} > 0") + .assign(year_month=str(iyear)+'-'+imonth_str) + ) + oil_data_imonth_temp = (oil_data_imonth_temp[[ + 'year', 'year_month','STATE_CODE','AAPG_CODE_ERG','LATITUDE','LONGITUDE', + 'HF','WELL_COUNT',oil_prod_str, + 'comp_year_month','spud_year','first_prod_year']] + ) + # Conventional Well Count + conv_well_count_imonth = (oil_data_imonth_temp[['year','year_month','STATE_CODE','LATITUDE','LONGITUDE','WELL_COUNT','HF']] + .query("HF != 'Y'") + .drop(columns=["HF"]) + .rename(columns=lambda x: str(x).lower()) + .rename(columns={"well_count":"proxy_data"}) + .reset_index(drop=True) + ) + conv_well_count_df = pd.concat([conv_well_count_df,conv_well_count_imonth]) + + # Delete unused temp data + del oil_data_temp + del oil_data_imonth_temp + del conv_well_count_imonth + + # Calculate relative emissions and convert to a geodataframe + conv_well_count_df = calc_enverus_rel_emi(conv_well_count_df) + conv_well_count_df = enverus_df_to_gdf(conv_well_count_df) + + # NEI Data: + nei_df = pd.DataFrame() + + for iyear in years: + nei_data_year = nei_data_years[nei_data_years['year'] == iyear]['nei_data'].values[0] + # Well Count + ifile_name = get_nei_file_name(nei_data_year, oil_well_count_file_names) + nei_iyear = get_raw_NEI_data(iyear, nei_data_year, ifile_name) + nei_df = pd.concat([nei_df, nei_iyear]) + + # Convert NEI Data to GDF and polygon to centroid point + nei_df = gpd.GeoDataFrame(nei_df, crs=4326) + nei_df = nei_df.to_crs(3857) # projected CRS for centroid calculation + nei_df.loc[:, 'geometry'] = nei_df.loc[:, 'geometry'].centroid + nei_df = nei_df.to_crs(4326) + + # Add NEI Data to Enverus Data + conv_well_count_df = pd.concat([conv_well_count_df, nei_df]).reset_index(drop=True) + + # Delete unused temp data + del nei_iyear + del nei_df + + # Check that relative emissions sum to 1.0 each state/year combination + sums = conv_well_count_df.groupby(["state_code", 
"year"])["rel_emi"].sum() # get sums to check normalization + assert np.isclose(sums, 1.0, atol=1e-8).all(), f"Relative emissions do not sum to 1 for each year and state; {sums}" # assert that the sums are close to 1 + + # Output Proxy Parquet Files + conv_well_count_df = conv_well_count_df.astype({'year':str}) + conv_well_count_df.to_parquet(conv_well_count_output_path) + + return None diff --git a/gch4i/proxy_processing/task_oil_drilled_well_proxy.py b/gch4i/proxy_processing/task_oil_drilled_well_proxy.py new file mode 100644 index 0000000..9ba7de2 --- /dev/null +++ b/gch4i/proxy_processing/task_oil_drilled_well_proxy.py @@ -0,0 +1,181 @@ +# %% +from pathlib import Path +import os +from typing import Annotated +from zipfile import ZipFile +import calendar +import datetime + +from pyarrow import parquet +import pandas as pd +import osgeo +import geopandas as gpd +import numpy as np +import seaborn as sns +import shapefile as shp +from pytask import Product, task, mark + +from gch4i.config import ( + V3_DATA_PATH, + proxy_data_dir_path, + global_data_dir_path, + sector_data_dir_path, + max_year, + min_year, + years, +) + +from gch4i.utils import us_state_to_abbrev +from gch4i.proxy_processing.ng_oil_production_utils import ( + calc_enverus_rel_emi, + enverus_df_to_gdf, + nei_data_years, + get_nei_file_name, + oil_spud_count_file_names, + get_raw_NEI_data, +) + +# %% +@mark.persist +@task(id="oil_drilled_well_proxy") +def task_get_oil_drilled_well_proxy_data( + state_path: Path = global_data_dir_path / "tl_2020_us_state.zip", + enverus_production_path: Path = sector_data_dir_path / "enverus/production", + intermediate_outputs_path: Path = enverus_production_path / "intermediate_outputs", + nei_path: Path = sector_data_dir_path / "nei_og", + drilled_well_output_path: Annotated[Path, Product] = proxy_data_dir_path / "oil_drilled_well_proxy.parquet", + ): + """ + Data come from Enverus, both Drilling Info and Prism + The reason 2 datasets are used is because Prism does not include all states + So remaining states, or those with more DI coverage are taken from DI + + DI: KS, MD, MI, MO, OK, TN + + Prism: AK, AL, AR, AZ, CA, CAO (California Offshore), CO, FL, KY, LA, MS, MT, ND, + NE, NGOM (federal offshore waters in the Gulf of Mexico), NM, NV, NY, OH, OR, PA, + SD, TX, UT, VA, WV, WY + + States with no Enverus Data: CT, DE, DC, GA, HI, ID, IL*, IN*, IA, ME, MA, MN, NH, + NJ, NC, RI, SC, VT, WA, WI, US territories. These states are assumed to have no oil + and gas production with an exception for IL and IN. + + *IL and IN do not report to Enverus, but do have oil and gas production. Production + data is taken from the Energy Information Administration (EIA). + + """ + + # Load in State ANSI data + state_gdf = ( + gpd.read_file(state_path) + .loc[:, ["NAME", "STATEFP", "STUSPS", "geometry"]] + .rename(columns=str.lower) + .rename(columns={"stusps": "state_code", "name": "state_name"}) + .astype({"statefp": int}) + # get only lower 48 + DC + .query("(statefp < 60) & (statefp != 2) & (statefp != 15)") + .reset_index(drop=True) + .to_crs(4326) + ) + + # Make Annual gridded arrays (maps) of well data (a well will be counted every month if there is any production that year) + # Includes NA Gas Wells and Production onshore in the CONUS region + # source emissions are related to the presence of a well and its production status (no emission if no production) + # Details: ERG does not include a well in the national count if there is no (cummulative) oil or gas production from that well. 
    # Wells are not considered active for a given year if there is no production data that year
    # This may cause wells that are completed but not yet producing to be dropped from the national count.
    # ERG has developed their own logic to determine if a well is an HF well or not and that result is included in the
    # HF variable in this dataset. This method does not rely on the Enverus well 'Producing Status'.
    # Well Type (e.g., non-associated oil well) is determined based on annual production GOR at that well (CUM OIL/CUM GAS),
    # but the presence of a well will only be included in maps in months where monthly oil production > 0

    # Proxy Data Dataframes:
    drilled_well_df = pd.DataFrame()  # Oil wells drilled

    ## Enverus DI and Prism Data:
    # Read in and query formatted and corrected Enverus data to create dataframes of
    # proxy data (Enverus data is from task_enverus_di_prism_data_processing.py)
    for iyear in years:
        enverus_file_name_iyear = f"formatted_raw_enverus_tempoutput_{iyear}.csv"
        enverus_file_path_iyear = os.path.join(intermediate_outputs_path, enverus_file_name_iyear)
        oil_data_temp = (pd.read_csv(enverus_file_path_iyear, dtype={3: 'str', 'spud_year': str, 'first_prod_year': str})
                         .query("STATE_CODE.isin(@state_gdf['state_code'])")
                         .query("OFFSHORE == 'N'")
                         .query("CUM_OIL > 0")
                         .assign(gas_to_oil_ratio=lambda df: df['CUM_GAS'] / df['CUM_OIL'])
                         .assign(year=str(iyear))
                         .replace(np.inf, 0)
                         .astype({"spud_year": str, "first_prod_year": str})
                         .query("gas_to_oil_ratio <= 100")
                         .query("GOR_QUAL == 'Liq only' | GOR_QUAL == 'Liq+Gas'")
                         )

        # Include wells in map only for months where there is oil production (emissions ~ when production is occurring)
        for imonth in range(1, 13):
            imonth_str = f"{imonth:02}"  # convert to 2-digit months
            year_month_str = str(iyear) + '-' + imonth_str
            oil_prod_str = 'OILPROD_' + imonth_str
            # Onshore data for imonth
            oil_data_imonth_temp = (oil_data_temp
                                    .query(f"{oil_prod_str} > 0")
                                    .assign(year_month=year_month_str)
                                    )
            oil_data_imonth_temp = (oil_data_imonth_temp[[
                'year', 'year_month', 'STATE_CODE', 'AAPG_CODE_ERG', 'LATITUDE', 'LONGITUDE',
                'HF', 'WELL_COUNT', oil_prod_str,
                'comp_year_month', 'spud_year', 'first_prod_year']]
                )
            # Drilled Oil Wells
            drilled_well_imonth = (oil_data_imonth_temp[['year', 'year_month', 'STATE_CODE', 'LATITUDE', 'LONGITUDE', 'WELL_COUNT', 'HF', 'spud_year', 'first_prod_year']]
                                   .rename(columns=lambda x: str(x).lower())
                                   .rename(columns={"well_count": "proxy_data"})
                                   # wells with a spud date or first production date in the current year
                                   .query(f"spud_year == '{iyear}' | first_prod_year == '{iyear}'")
                                   # wells with spud_year == iyear or, if there is no spud date, first_prod_year == iyear
                                   .query(f"spud_year == '{iyear}' | spud_year == 'NaN' | spud_year == 'nan'")
                                   .drop(columns=['hf', 'spud_year', 'first_prod_year'])
                                   .reset_index(drop=True)
                                   )
            drilled_well_df = pd.concat([drilled_well_df, drilled_well_imonth])

    # Delete unused temp data
    del oil_data_temp
    del oil_data_imonth_temp
    del drilled_well_imonth

    # Calculate relative emissions and convert to a geodataframe
    drilled_well_df = calc_enverus_rel_emi(drilled_well_df)
    drilled_well_df = enverus_df_to_gdf(drilled_well_df)

    # NEI Data:
    nei_df = pd.DataFrame()

    for iyear in years:
        nei_data_year = nei_data_years[nei_data_years['year'] == iyear]['nei_data'].values[0]
        # Drilled (spud) well count
        ifile_name = get_nei_file_name(nei_data_year, oil_spud_count_file_names)
        nei_iyear = get_raw_NEI_data(iyear, nei_data_year, ifile_name)
        nei_df = pd.concat([nei_df, nei_iyear])

    # Convert NEI data to a GeoDataFrame and convert polygons to centroid points
    nei_df = gpd.GeoDataFrame(nei_df, crs=4326)
    nei_df = nei_df.to_crs(3857)  # projected CRS for centroid calculation
    nei_df.loc[:, 'geometry'] = nei_df.loc[:, 'geometry'].centroid
    nei_df = nei_df.to_crs(4326)

    # Add NEI Data to Enverus Data
    drilled_well_df = pd.concat([drilled_well_df, nei_df]).reset_index(drop=True)

    # Delete unused temp data
    del nei_iyear
    del nei_df

    # Check that relative emissions sum to 1.0 for each state/year combination
    sums = drilled_well_df.groupby(["state_code", "year"])["rel_emi"].sum()  # get sums to check normalization
    assert np.isclose(sums, 1.0, atol=1e-8).all(), f"Relative emissions do not sum to 1 for each year and state; {sums}"  # assert that the sums are close to 1

    # Output Proxy Parquet Files
    drilled_well_df = drilled_well_df.astype({'year': str})
    drilled_well_df.to_parquet(drilled_well_output_path)

    return None
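Note: each of these proxies converts NEI polygon records to points by projecting to a metric CRS before taking centroids; GeoPandas warns (and returns slightly skewed results) if .centroid is called in a geographic CRS such as EPSG:4326. A self-contained illustration with a toy polygon (not NEI data):

import geopandas as gpd
from shapely.geometry import Polygon

# 1-degree square in lon/lat; the centroid is taken in EPSG:3857, then converted back.
square = Polygon([(-100, 40), (-99, 40), (-99, 41), (-100, 41)])
gdf = gpd.GeoDataFrame(geometry=[square], crs=4326)
centroids = gdf.to_crs(3857).centroid.to_crs(4326)
print(centroids.iloc[0])  # POINT near (-99.5, 40.5)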
diff --git a/gch4i/proxy_processing/task_oil_hf_well_comp_proxy.py b/gch4i/proxy_processing/task_oil_hf_well_comp_proxy.py
new file mode 100644
index 0000000..5b4ab30
--- /dev/null
+++ b/gch4i/proxy_processing/task_oil_hf_well_comp_proxy.py
@@ -0,0 +1,180 @@
# %%
from pathlib import Path
import os
from typing import Annotated
from zipfile import ZipFile
import calendar
import datetime

from pyarrow import parquet
import pandas as pd
import osgeo
import geopandas as gpd
import numpy as np
import seaborn as sns
import shapefile as shp
from pytask import Product, task, mark

from gch4i.config import (
    V3_DATA_PATH,
    proxy_data_dir_path,
    global_data_dir_path,
    sector_data_dir_path,
    max_year,
    min_year,
    years,
)

from gch4i.utils import us_state_to_abbrev
from gch4i.proxy_processing.ng_oil_production_utils import (
    calc_enverus_rel_emi,
    enverus_df_to_gdf,
    nei_data_years,
    get_nei_file_name,
    oil_comp_count_file_names,
    get_raw_NEI_data,
)

# %%
@mark.persist
@task(id="oil_hf_well_comp_proxy")
def task_get_oil_hf_well_comp_proxy_data(
    state_path: Path = global_data_dir_path / "tl_2020_us_state.zip",
    enverus_production_path: Path = sector_data_dir_path / "enverus/production",
    intermediate_outputs_path: Path = enverus_production_path / "intermediate_outputs",
    nei_path: Path = sector_data_dir_path / "nei_og",
    hf_well_comp_output_path: Annotated[Path, Product] = proxy_data_dir_path / "oil_hf_well_comp_proxy.parquet",
    ):
    """
    Data come from Enverus, both Drilling Info (DI) and Prism. Two datasets are used
    because Prism does not include all states, so the remaining states, or those with
    better DI coverage, are taken from DI.

    DI: KS, MD, MI, MO, OK, TN

    Prism: AK, AL, AR, AZ, CA, CAO (California Offshore), CO, FL, KY, LA, MS, MT, ND,
    NE, NGOM (federal offshore waters in the Gulf of Mexico), NM, NV, NY, OH, OR, PA,
    SD, TX, UT, VA, WV, WY

    States with no Enverus Data: CT, DE, DC, GA, HI, ID, IL*, IN*, IA, ME, MA, MN, NH,
    NJ, NC, RI, SC, VT, WA, WI, US territories. These states are assumed to have no oil
    and gas production, with an exception for IL and IN.

    *IL and IN do not report to Enverus but do have oil and gas production. Production
    data is taken from the Energy Information Administration (EIA).
    """

    # Load in State ANSI data
    state_gdf = (
        gpd.read_file(state_path)
        .loc[:, ["NAME", "STATEFP", "STUSPS", "geometry"]]
        .rename(columns=str.lower)
        .rename(columns={"stusps": "state_code", "name": "state_name"})
        .astype({"statefp": int})
        # get only lower 48 + DC
        .query("(statefp < 60) & (statefp != 2) & (statefp != 15)")
        .reset_index(drop=True)
        .to_crs(4326)
    )

    # Make annual gridded arrays (maps) of well data (a well will be counted every month if there is any production that year)
    # Includes NA Oil Wells and Production onshore in the CONUS region
    # source emissions are related to the presence of a well and its production status (no emission if no production)
    # Details: ERG does not include a well in the national count if there is no (cumulative) oil or gas production from that well.
    # Wells are not considered active for a given year if there is no production data that year
    # This may cause wells that are completed but not yet producing to be dropped from the national count.
    # ERG has developed their own logic to determine if a well is an HF well or not and that result is included in the
    # HF variable in this dataset. This method does not rely on the Enverus well 'Producing Status'.
    # Well Type (e.g., non-associated oil well) is determined based on annual production GOR at that well (CUM OIL/CUM GAS),
    # but the presence of a well will only be included in maps in months where monthly oil production > 0

    # Proxy Data Dataframes:
    hf_well_comp_df = pd.DataFrame()  # HF well completions

    ## Enverus DI and Prism Data:
    # Read in and query formatted and corrected Enverus data to create dataframes of
    # proxy data (Enverus data is from task_enverus_di_prism_data_processing.py)
    for iyear in years:
        enverus_file_name_iyear = f"formatted_raw_enverus_tempoutput_{iyear}.csv"
        enverus_file_path_iyear = os.path.join(intermediate_outputs_path, enverus_file_name_iyear)
        oil_data_temp = (pd.read_csv(enverus_file_path_iyear, dtype={3: 'str', 'spud_year': str, 'first_prod_year': str})
                         .query("STATE_CODE.isin(@state_gdf['state_code'])")
                         .query("OFFSHORE == 'N'")
                         .query("CUM_OIL > 0")
                         .assign(gas_to_oil_ratio=lambda df: df['CUM_GAS'] / df['CUM_OIL'])
                         .assign(year=str(iyear))
                         .replace(np.inf, 0)
                         .astype({"spud_year": str, "first_prod_year": str})
                         .query("gas_to_oil_ratio <= 100")
                         .query("GOR_QUAL == 'Liq only' | GOR_QUAL == 'Liq+Gas'")
                         )

        # Include wells in map only for months where there is oil production (emissions ~ when production is occurring)
        for imonth in range(1, 13):
            imonth_str = f"{imonth:02}"  # convert to 2-digit months
            year_month_str = str(iyear) + '-' + imonth_str
            oil_prod_str = 'OILPROD_' + imonth_str
            # Onshore data for imonth
            oil_data_imonth_temp = (oil_data_temp
                                    .query(f"{oil_prod_str} > 0")
                                    .assign(year_month=year_month_str)
                                    )
            oil_data_imonth_temp = (oil_data_imonth_temp[[
                'year', 'year_month', 'STATE_CODE', 'AAPG_CODE_ERG', 'LATITUDE', 'LONGITUDE',
                'HF', 'WELL_COUNT', oil_prod_str,
                'comp_year_month', 'spud_year', 'first_prod_year']]
                )
            # HF Well Completions
            hf_well_comp_imonth = (oil_data_imonth_temp[['year', 'year_month', 'STATE_CODE', 'LATITUDE', 'LONGITUDE', 'WELL_COUNT', 'HF', 'comp_year_month']]
                                   .query("HF == 'Y'")
                                   .drop(columns=["HF"])
                                   .rename(columns=lambda x: str(x).lower())
                                   .rename(columns={"well_count": "proxy_data"})
                                   .query(f"comp_year_month == '{year_month_str}'")
                                   .drop(columns=["comp_year_month"])
                                   .reset_index(drop=True)
                                   )
            hf_well_comp_df = pd.concat([hf_well_comp_df, hf_well_comp_imonth])
    # Delete unused temp data
    del oil_data_temp
    del oil_data_imonth_temp
    del hf_well_comp_imonth

    # Calculate relative emissions and convert to a geodataframe
    hf_well_comp_df = calc_enverus_rel_emi(hf_well_comp_df)
    hf_well_comp_df = enverus_df_to_gdf(hf_well_comp_df)

    # NEI Data:
    nei_df = pd.DataFrame()

    for iyear in years:
        nei_data_year = nei_data_years[nei_data_years['year'] == iyear]['nei_data'].values[0]
        # Completion Count
        ifile_name = get_nei_file_name(nei_data_year, oil_comp_count_file_names)
        nei_iyear = get_raw_NEI_data(iyear, nei_data_year, ifile_name)
        nei_df = pd.concat([nei_df, nei_iyear])

    # Convert NEI data to a GeoDataFrame and convert polygons to centroid points
    nei_df = gpd.GeoDataFrame(nei_df, crs=4326)
    nei_df = nei_df.to_crs(3857)  # projected CRS for centroid calculation
    nei_df.loc[:, 'geometry'] = nei_df.loc[:, 'geometry'].centroid
    nei_df = nei_df.to_crs(4326)

    # Add NEI Data to Enverus Data
    hf_well_comp_df = pd.concat([hf_well_comp_df, nei_df]).reset_index(drop=True)

    # Delete unused temp data
    del nei_iyear
    del nei_df

    # Check that relative emissions sum to 1.0 for each state/year combination
    sums = hf_well_comp_df.groupby(["state_code", "year"])["rel_emi"].sum()  # get sums to check normalization
    assert np.isclose(sums, 1.0, atol=1e-8).all(), f"Relative emissions do not sum to 1 for each year and state; {sums}"  # assert that the sums are close to 1

    # Output Proxy Parquet Files
    hf_well_comp_df = hf_well_comp_df.astype({'year': str})
    hf_well_comp_df.to_parquet(hf_well_comp_output_path)

    return None
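Note: unlike the count proxies, the completion proxy keeps a well only in the single month matching its comp_year_month string, so each completion contributes once rather than in every producing month. A toy illustration of the same string filter (column values are made up):

import pandas as pd

wells = pd.DataFrame({
    "proxy_data": [1, 1, 1],
    "comp_year_month": ["2020-03", "2020-03", "2019-11"],
})
# Mirrors the .query(f"comp_year_month == '{year_month_str}'") step above.
year_month_str = "2020-03"
print(wells.query(f"comp_year_month == '{year_month_str}'"))  # keeps the first two rows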
diff --git a/gch4i/proxy_processing/task_oil_hf_well_count_proxy.py b/gch4i/proxy_processing/task_oil_hf_well_count_proxy.py
new file mode 100644
index 0000000..3d17a08
--- /dev/null
+++ b/gch4i/proxy_processing/task_oil_hf_well_count_proxy.py
@@ -0,0 +1,177 @@
# %%
from pathlib import Path
import os
from typing import Annotated
from zipfile import ZipFile
import calendar
import datetime

from pyarrow import parquet
import pandas as pd
import osgeo
import geopandas as gpd
import numpy as np
import seaborn as sns
import shapefile as shp
from pytask import Product, task, mark

from gch4i.config import (
    V3_DATA_PATH,
    proxy_data_dir_path,
    global_data_dir_path,
    sector_data_dir_path,
    max_year,
    min_year,
    years,
)

from gch4i.utils import us_state_to_abbrev
from gch4i.proxy_processing.ng_oil_production_utils import (
    calc_enverus_rel_emi,
    enverus_df_to_gdf,
    nei_data_years,
    get_nei_file_name,
    oil_well_count_file_names,
    get_raw_NEI_data,
)

# %%
@mark.persist
@task(id="oil_hf_well_count_proxy")
def task_get_oil_hf_well_count_proxy_data(
    state_path: Path = global_data_dir_path / "tl_2020_us_state.zip",
    enverus_production_path: Path = sector_data_dir_path / "enverus/production",
    intermediate_outputs_path: Path = enverus_production_path / "intermediate_outputs",
    nei_path: Path = sector_data_dir_path / "nei_og",
    hf_well_count_output_path: Annotated[Path, Product] = proxy_data_dir_path / "oil_hf_well_count_proxy.parquet",
    ):
    """
    Data come from Enverus, both Drilling Info (DI) and Prism. Two datasets are used
    because Prism does not include all states, so the remaining states, or those with
    better DI coverage, are taken from DI.

    DI: KS, MD, MI, MO, OK, TN

    Prism: AK, AL, AR, AZ, CA, CAO (California Offshore), CO, FL, KY, LA, MS, MT, ND,
    NE, NGOM (federal offshore waters in the Gulf of Mexico), NM, NV, NY, OH, OR, PA,
    SD, TX, UT, VA, WV, WY

    States with no Enverus Data: CT, DE, DC, GA, HI, ID, IL*, IN*, IA, ME, MA, MN, NH,
    NJ, NC, RI, SC, VT, WA, WI, US territories. These states are assumed to have no oil
    and gas production, with an exception for IL and IN.

    *IL and IN do not report to Enverus but do have oil and gas production. Production
    data is taken from the Energy Information Administration (EIA).
    """

    # Load in State ANSI data
    state_gdf = (
        gpd.read_file(state_path)
        .loc[:, ["NAME", "STATEFP", "STUSPS", "geometry"]]
        .rename(columns=str.lower)
        .rename(columns={"stusps": "state_code", "name": "state_name"})
        .astype({"statefp": int})
        # get only lower 48 + DC
        .query("(statefp < 60) & (statefp != 2) & (statefp != 15)")
        .reset_index(drop=True)
        .to_crs(4326)
    )

    # Make annual gridded arrays (maps) of well data (a well will be counted every month if there is any production that year)
    # Includes NA Wells and Production onshore in the CONUS region
    # source emissions are related to the presence of a well and its production status (no emission if no production)
    # Details: ERG does not include a well in the national count if there is no (cumulative) oil or gas production from that well.
    # Wells are not considered active for a given year if there is no production data that year
    # This may cause wells that are completed but not yet producing to be dropped from the national count.
    # ERG has developed their own logic to determine if a well is an HF well or not and that result is included in the
    # HF variable in this dataset. This method does not rely on the Enverus well 'Producing Status'.
    # Well Type (e.g., non-associated oil well) is determined based on annual production GOR at that well (CUM OIL/CUM GAS),
    # but the presence of a well will only be included in maps in months where monthly oil production > 0

    # Proxy Data Dataframes:
    hf_well_count_df = pd.DataFrame()  # Active HF well counts in a given month

    # Read in and query formatted and corrected Enverus data to create dataframes of
    # proxy data (Enverus data is from task_enverus_di_prism_data_processing.py)
    for iyear in years:
        enverus_file_name_iyear = f"formatted_raw_enverus_tempoutput_{iyear}.csv"
        enverus_file_path_iyear = os.path.join(intermediate_outputs_path, enverus_file_name_iyear)
        oil_data_temp = (pd.read_csv(enverus_file_path_iyear, dtype={3: 'str', 'spud_year': str, 'first_prod_year': str})
                         .query("STATE_CODE.isin(@state_gdf['state_code'])")
                         .query("OFFSHORE == 'N'")
                         .query("CUM_OIL > 0")
                         .assign(gas_to_oil_ratio=lambda df: df['CUM_GAS'] / df['CUM_OIL'])
                         .assign(year=str(iyear))
                         .replace(np.inf, 0)
                         .astype({"spud_year": str, "first_prod_year": str})
                         .query("gas_to_oil_ratio <= 100")
                         .query("GOR_QUAL == 'Liq only' | GOR_QUAL == 'Liq+Gas'")
                         )

        # Include wells in map only for months where there is oil production (emissions ~ when production is occurring)
        for imonth in range(1, 13):
            imonth_str = f"{imonth:02}"  # convert to 2-digit months
            year_month_str = str(iyear) + '-' + imonth_str
            oil_prod_str = 'OILPROD_' + imonth_str
            # Onshore data for imonth
            oil_data_imonth_temp = (oil_data_temp
                                    .query(f"{oil_prod_str} > 0")
                                    .assign(year_month=year_month_str)
                                    )
            oil_data_imonth_temp = (oil_data_imonth_temp[[
                'year', 'year_month', 'STATE_CODE', 'AAPG_CODE_ERG', 'LATITUDE', 'LONGITUDE',
                'HF', 'WELL_COUNT', oil_prod_str,
                'comp_year_month', 'spud_year', 'first_prod_year']]
                )
            # HF Well Count
            hf_well_count_imonth = (oil_data_imonth_temp[['year', 'year_month', 'STATE_CODE', 'LATITUDE', 'LONGITUDE', 'WELL_COUNT', 'HF']]
                                    .query("HF == 'Y'")
                                    .drop(columns=["HF"])
                                    .rename(columns=lambda x: str(x).lower())
                                    .rename(columns={"well_count": "proxy_data"})
                                    .reset_index(drop=True)
                                    )
            hf_well_count_df = pd.concat([hf_well_count_df, hf_well_count_imonth])

    # Delete unused temp data
    del oil_data_temp
    del oil_data_imonth_temp
    del hf_well_count_imonth

    # Calculate relative emissions and convert to a geodataframe
    hf_well_count_df = calc_enverus_rel_emi(hf_well_count_df)
    hf_well_count_df = enverus_df_to_gdf(hf_well_count_df)

    # NEI Data:
    nei_df = pd.DataFrame()

    for iyear in years:
        nei_data_year = nei_data_years[nei_data_years['year'] == iyear]['nei_data'].values[0]
        # Well Count
        ifile_name = get_nei_file_name(nei_data_year, oil_well_count_file_names)
        nei_iyear = get_raw_NEI_data(iyear, nei_data_year, ifile_name)
        nei_df = pd.concat([nei_df, nei_iyear])

    # Convert NEI data to a GeoDataFrame and convert polygons to centroid points
    nei_df = gpd.GeoDataFrame(nei_df, crs=4326)
    nei_df = nei_df.to_crs(3857)  # projected CRS for centroid calculation
    nei_df.loc[:, 'geometry'] = nei_df.loc[:, 'geometry'].centroid
    nei_df = nei_df.to_crs(4326)

    # Add NEI Data to Enverus Data
    hf_well_count_df = pd.concat([hf_well_count_df, nei_df]).reset_index(drop=True)

    # Delete unused temp data
    del nei_iyear
    del nei_df

    # Check that relative emissions sum to 1.0 for each state/year combination
    sums = hf_well_count_df.groupby(["state_code", "year"])["rel_emi"].sum()  # get sums to check normalization
    assert np.isclose(sums, 1.0, atol=1e-8).all(), f"Relative emissions do not sum to 1 for each year and state; {sums}"  # assert that the sums are close to 1

    # Output Proxy Parquet Files
    hf_well_count_df = hf_well_count_df.astype({'year': str})
    hf_well_count_df.to_parquet(hf_well_count_output_path)

    return None
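Note: every file in this patch screens wells the same way: cumulative gas-to-oil ratio (CUM_GAS / CUM_OIL) at or below 100. The CUM_OIL > 0 filter runs first, so the .replace(np.inf, 0) is a belt-and-braces guard against division by zero. A toy check of the same expression (the values below are made up):

import numpy as np
import pandas as pd

wells = pd.DataFrame({"CUM_OIL": [1000.0, 10.0], "CUM_GAS": [50_000.0, 5_000.0]})
wells["gas_to_oil_ratio"] = (wells["CUM_GAS"] / wells["CUM_OIL"]).replace(np.inf, 0)
print(wells.query("gas_to_oil_ratio <= 100"))  # keeps the first well (GOR 50), drops the second (GOR 500)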
diff --git a/gch4i/proxy_processing/task_oil_water_prod_proxy.py b/gch4i/proxy_processing/task_oil_water_prod_proxy.py
new file mode 100644
index 0000000..c9fb4a2
--- /dev/null
+++ b/gch4i/proxy_processing/task_oil_water_prod_proxy.py
@@ -0,0 +1,195 @@
# %%
from pathlib import Path
import os
from typing import Annotated
from zipfile import ZipFile
import calendar
import datetime

from pyarrow import parquet
import pandas as pd
import osgeo
import geopandas as gpd
import numpy as np
import seaborn as sns
import shapefile as shp
from pytask import Product, task, mark

from gch4i.config import (
    V3_DATA_PATH,
    proxy_data_dir_path,
    global_data_dir_path,
    sector_data_dir_path,
    max_year,
    min_year,
    years,
)

from gch4i.utils import us_state_to_abbrev
from gch4i.proxy_processing.ng_oil_production_utils import (
    calc_enverus_rel_emi,
    enverus_df_to_gdf,
    nei_data_years,
    get_nei_file_name,
    oil_water_prod_file_names,
    get_raw_NEI_data,
)

# %%
@mark.persist
@task(id="oil_water_prod_proxy")
def task_get_oil_water_prod_proxy_data(
    state_path: Path = global_data_dir_path / "tl_2020_us_state.zip",
    enverus_production_path: Path = sector_data_dir_path / "enverus/production",
    intermediate_outputs_path: Path = enverus_production_path / "intermediate_outputs",
    nei_path: Path = sector_data_dir_path / "nei_og",
    water_prod_output_path: Annotated[Path, Product] = proxy_data_dir_path / "oil_water_prod_proxy.parquet",
    ):
    """
    Data come from Enverus, both Drilling Info (DI) and Prism. Two datasets are used
    because Prism does not include all states, so the remaining states, or those with
    better DI coverage, are taken from DI.

    DI: KS, MD, MI, MO, OK, TN

    Prism: AK, AL, AR, AZ, CA, CAO (California Offshore), CO, FL, KY, LA, MS, MT, ND,
    NE, NGOM (federal offshore waters in the Gulf of Mexico), NM, NV, NY, OH, OR, PA,
    SD, TX, UT, VA, WV, WY

    States with no Enverus Data: CT, DE, DC, GA, HI, ID, IL*, IN*, IA, ME, MA, MN, NH,
    NJ, NC, RI, SC, VT, WA, WI, US territories. These states are assumed to have no oil
    and gas production, with an exception for IL and IN.

    *IL and IN do not report to Enverus but do have oil and gas production. Production
    data is taken from the Energy Information Administration (EIA).
    """

    # Load in State ANSI data
    state_gdf = (
        gpd.read_file(state_path)
        .loc[:, ["NAME", "STATEFP", "STUSPS", "geometry"]]
        .rename(columns=str.lower)
        .rename(columns={"stusps": "state_code", "name": "state_name"})
        .astype({"statefp": int})
        # get only lower 48 + DC
        .query("(statefp < 60) & (statefp != 2) & (statefp != 15)")
        .reset_index(drop=True)
        .to_crs(4326)
    )

    # Make annual gridded arrays (maps) of well data (a well will be counted every month if there is any production that year)
    # Includes NA Oil Wells and Production onshore in the CONUS region
    # source emissions are related to the presence of a well and its production status (no emission if no production)
    # Details: ERG does not include a well in the national count if there is no (cumulative) oil or gas production from that well.
    # Wells are not considered active for a given year if there is no production data that year
    # This may cause wells that are completed but not yet producing to be dropped from the national count.
    # ERG has developed their own logic to determine if a well is an HF well or not and that result is included in the
    # HF variable in this dataset. This method does not rely on the Enverus well 'Producing Status'.
    # Well Type (e.g., non-associated oil well) is determined based on annual production GOR at that well (CUM OIL/CUM GAS),
    # but the presence of a well will only be included in maps in months where monthly oil production > 0

    # Proxy Data Dataframes:
    water_prod_df = pd.DataFrame()

    ## Enverus DI and Prism Data:
    # Read in and query formatted and corrected Enverus data to create dataframes of
    # proxy data (Enverus data is from task_enverus_di_prism_data_processing.py)
    for iyear in years:
        enverus_file_name_iyear = f"formatted_raw_enverus_tempoutput_{iyear}.csv"
        enverus_file_path_iyear = os.path.join(intermediate_outputs_path, enverus_file_name_iyear)
        oil_data_temp = (pd.read_csv(enverus_file_path_iyear, dtype={3: 'str', 'spud_year': str, 'first_prod_year': str})
                         .query("STATE_CODE.isin(@state_gdf['state_code'])")
                         .query("OFFSHORE == 'N'")
                         .query("CUM_OIL > 0")
                         .assign(gas_to_oil_ratio=lambda df: df['CUM_GAS'] / df['CUM_OIL'])
                         .assign(year=str(iyear))
                         .replace(np.inf, 0)
                         .astype({"spud_year": str, "first_prod_year": str})
                         .query("gas_to_oil_ratio <= 100")
                         .query("GOR_QUAL == 'Liq only' | GOR_QUAL == 'Liq+Gas'")
                         )

        # Include wells in map only for months where there is oil production (emissions ~ when production is occurring)
        for imonth in range(1, 13):
            imonth_str = f"{imonth:02}"  # convert to 2-digit months
            year_month_str = str(iyear) + '-' + imonth_str
            oil_prod_str = 'OILPROD_' + imonth_str
            water_prod_str = 'WATERPROD_' + imonth_str
            # Onshore data for imonth
            oil_data_imonth_temp = (oil_data_temp
                                    .query(f"{oil_prod_str} > 0")
                                    .assign(year_month=year_month_str)
                                    )
            oil_data_imonth_temp = (oil_data_imonth_temp[[
                'year', 'year_month', 'STATE_CODE', 'AAPG_CODE_ERG', 'LATITUDE', 'LONGITUDE',
                'HF', 'WELL_COUNT', oil_prod_str, water_prod_str,
                'comp_year_month', 'spud_year', 'first_prod_year']]
                )
            # Water Production
            # Data source by state is defined in the Enverus DrillingInfo Processing - Produced
            # Water_2023-11-14_forGridding.xlsx file.
            if iyear < 2016:  # WV uses NEI data
                water_prod_enverus_states = ['AK', 'AL', 'AR', 'AZ', 'CA', 'CO', 'FL', 'LA',
                                             'MI', 'MO', 'MS', 'MT', 'ND', 'NE', 'NM', 'NV',
                                             'NY', 'OH', 'SD', 'TX', 'UT', 'VA', 'WY'
                                             ]
                # States using NEI for reference: ['IL','IN','KS','OK','PA','WV']
            else:  # 2016 and beyond; WV uses Enverus data
                water_prod_enverus_states = ['AK', 'AL', 'AR', 'AZ', 'CA', 'CO', 'FL', 'LA',
                                             'MI', 'MO', 'MS', 'MT', 'ND', 'NE', 'NM', 'NV',
                                             'NY', 'OH', 'SD', 'TX', 'UT', 'VA', 'WY', 'WV'
                                             ]  # WV uses Enverus
                # States using NEI for reference: ['IL','IN','KS','OK','PA']
            # Enverus water production for applicable states (NEI water production will
            # be added in the NEI section of the code below)
            water_prod_imonth = (oil_data_imonth_temp[['year', 'year_month', 'STATE_CODE', 'LATITUDE', 'LONGITUDE', water_prod_str]]
                                 .query("STATE_CODE.isin(@water_prod_enverus_states)")
                                 .assign(proxy_data=lambda df: df[water_prod_str])
                                 .drop(columns=[water_prod_str])
                                 .rename(columns=lambda x: str(x).lower())
                                 .reset_index(drop=True)
                                 )
            water_prod_df = pd.concat([water_prod_df, water_prod_imonth])

    # Delete unused temp data
    del oil_data_temp
    del oil_data_imonth_temp
    del water_prod_imonth

    # Calculate relative emissions and convert to a geodataframe
    water_prod_df = calc_enverus_rel_emi(water_prod_df)
    water_prod_df = enverus_df_to_gdf(water_prod_df)

    # NEI Data:
    nei_df = pd.DataFrame()

    for iyear in years:
        nei_data_year = nei_data_years[nei_data_years['year'] == iyear]['nei_data'].values[0]
        # Water Production
        ifile_name = get_nei_file_name(nei_data_year, oil_water_prod_file_names)
        nei_iyear = get_raw_NEI_data(iyear, nei_data_year, ifile_name)
        nei_df = pd.concat([nei_df, nei_iyear])

    # Convert NEI data to a GeoDataFrame and convert polygons to centroid points
    nei_df = gpd.GeoDataFrame(nei_df, crs=4326)
    nei_df = nei_df.to_crs(3857)  # projected CRS for centroid calculation
    nei_df.loc[:, 'geometry'] = nei_df.loc[:, 'geometry'].centroid
    nei_df = nei_df.to_crs(4326)

    # Add NEI Data to Enverus Data
    water_prod_df = pd.concat([water_prod_df, nei_df]).reset_index(drop=True)

    # Delete unused temp data
    del nei_iyear
    del nei_df

    # Check that relative emissions sum to 1.0 for each state/year combination
    sums = water_prod_df.groupby(["state_code", "year"])["rel_emi"].sum()  # get sums to check normalization
    assert np.isclose(sums, 1.0, atol=1e-8).all(), f"Relative emissions do not sum to 1 for each year and state; {sums}"  # assert that the sums are close to 1

    # Output Proxy Parquet Files
    water_prod_df = water_prod_df.astype({'year': str})
    water_prod_df.to_parquet(water_prod_output_path)

    return None
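Note: the pre-/post-2016 branch above differs only in whether WV appears in the Enverus list. A small helper expressing the same switch (the function name is illustrative; the state list and the 2016 cutover are copied from the code above):

def water_prod_enverus_states_for(year: int) -> list:
    # WV water production comes from NEI before 2016 and from Enverus from 2016 on.
    base = ['AK', 'AL', 'AR', 'AZ', 'CA', 'CO', 'FL', 'LA',
            'MI', 'MO', 'MS', 'MT', 'ND', 'NE', 'NM', 'NV',
            'NY', 'OH', 'SD', 'TX', 'UT', 'VA', 'WY']
    return base + ['WV'] if year >= 2016 else base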
diff --git a/gch4i/proxy_processing/task_oil_well_avg_proxy.py b/gch4i/proxy_processing/task_oil_well_avg_proxy.py
new file mode 100644
index 0000000..5336f4f
--- /dev/null
+++ b/gch4i/proxy_processing/task_oil_well_avg_proxy.py
@@ -0,0 +1,69 @@
# %%
from pathlib import Path
from typing import Annotated

from pyarrow import parquet
import pandas as pd
import osgeo
import geopandas as gpd
import numpy as np
import seaborn as sns
from pytask import Product, task, mark

from gch4i.config import (
    V3_DATA_PATH,
    proxy_data_dir_path,
)

# %%
@mark.persist
@task(id="oil_well_avg_proxy")
def task_get_oil_well_avg_proxy_data(
    oil_all_well_prod_proxy_path: Path = proxy_data_dir_path / "oil_all_well_prod_proxy.parquet",  # input, not a Product
    oil_well_avg_output_path: Annotated[Path, Product] = proxy_data_dir_path / "oil_well_avg_proxy.parquet",
    ):
    """
    This proxy is the weighted average of the well count proxy and the oil production
    proxy for all wells. 50% of the relative emission is assigned based on the
    individual well's well count, and 50% of the relative emission is assigned based
    on the individual well's oil production.

    This file takes the relative emissions based on oil production from the
    oil_all_well_prod_proxy, adds a new relative emission column for well count under
    the assumption that each location has WELL_COUNT = 1, and takes the weighted
    average of the two relative emission types to create a new average proxy.
    """

    # Read in the oil production proxy and assign well count
    well_avg_gdf = (gpd.read_parquet(oil_all_well_prod_proxy_path)
                    .rename(columns={'rel_emi': 'prod_rel_emi'})
                    .assign(well_count=1.0)
                    )

    # Convert well count into a relative emission where each state/year combination sums to 1
    well_avg_gdf['count_rel_emi'] = well_avg_gdf.groupby(['state_code', 'year'])['well_count'].transform(lambda x: x / x.sum() if x.sum() > 0 else 0)
    well_avg_gdf = well_avg_gdf.drop(columns='well_count')

    # Check that relative emissions sum to 1.0 for each state/year combination
    prod_sums = well_avg_gdf.groupby(["state_code", "year"])["prod_rel_emi"].sum()  # get sums to check normalization
    assert np.isclose(prod_sums, 1.0, atol=1e-8).all(), f"Relative emissions do not sum to 1 for each year and state; {prod_sums}"  # assert that the sums are close to 1

    count_sums = well_avg_gdf.groupby(["state_code", "year"])["count_rel_emi"].sum()  # get sums to check normalization
    assert np.isclose(count_sums, 1.0, atol=1e-8).all(), f"Relative emissions do not sum to 1 for each year and state; {count_sums}"  # assert that the sums are close to 1

    # Create the average relative emission with 50% weights
    well_avg_gdf = (well_avg_gdf
                    .assign(rel_emi=lambda df: 0.5 * (df['prod_rel_emi'] + df['count_rel_emi']))
                    .drop(columns=['prod_rel_emi', 'count_rel_emi'])
                    )

    # Check that relative emissions sum to 1.0 for each state/year combination
    avg_sums = well_avg_gdf.groupby(["state_code", "year"])["rel_emi"].sum()  # get sums to check normalization
    assert np.isclose(avg_sums, 1.0, atol=1e-8).all(), f"Relative emissions do not sum to 1 for each year and state; {avg_sums}"  # assert that the sums are close to 1

    # Output Proxy Parquet Files
    well_avg_gdf = well_avg_gdf.astype({'year': str})
    well_avg_gdf.to_parquet(oil_well_avg_output_path)

    return None
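Note: because prod_rel_emi and count_rel_emi each sum to 1.0 within a state/year, any convex combination w*a + (1-w)*b also sums to w + (1-w) = 1, so the final assert holds by construction. A toy verification (values made up):

import pandas as pd

df = pd.DataFrame({"prod_rel_emi": [0.7, 0.3], "count_rel_emi": [0.5, 0.5]})
df["rel_emi"] = 0.5 * (df["prod_rel_emi"] + df["count_rel_emi"])
print(df["rel_emi"].sum())  # 1.0 (up to floating point)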