From 81119f4195ee4414211693358532c4e63d2fab98 Mon Sep 17 00:00:00 2001 From: Hannah Lohman <68960449+haclohman@users.noreply.github.com> Date: Tue, 15 Oct 2024 15:02:52 -0400 Subject: [PATCH 1/6] ng_compressor_stations_proxy includes gb_stations_proxy, storage_comp_station_proxy, trans_comp_station_proxy --- environment.yml | 3 +- .../task_ng_compressor_stations_proxy.py | 101 ++++++++++++++++++ 2 files changed, 103 insertions(+), 1 deletion(-) create mode 100644 gch4i/proxy_processing/task_ng_compressor_stations_proxy.py diff --git a/environment.yml b/environment.yml index ef183a9..3babe75 100644 --- a/environment.yml +++ b/environment.yml @@ -44,7 +44,8 @@ dependencies: - pyogrio # use - rioxarray - pip - - osgeo + - gdal + # - osgeo - pip: - -e . # this will install the gch4i package in your environment in dev mode \ No newline at end of file diff --git a/gch4i/proxy_processing/task_ng_compressor_stations_proxy.py b/gch4i/proxy_processing/task_ng_compressor_stations_proxy.py new file mode 100644 index 0000000..cc88038 --- /dev/null +++ b/gch4i/proxy_processing/task_ng_compressor_stations_proxy.py @@ -0,0 +1,101 @@ +# %% +from pathlib import Path +from typing import Annotated +from zipfile import ZipFile +import calendar +import datetime + +from pyarrow import parquet +import pandas as pd +import osgeo +import geopandas as gpd +import numpy as np +import seaborn as sns +from pytask import Product, task, mark + +from gch4i.config import ( + V3_DATA_PATH, + proxy_data_dir_path, + global_data_dir_path, + sector_data_dir_path, + max_year, + min_year, +) + +from gch4i.utils import us_state_to_abbrev + +# %% +@mark.persist +@task(id="ng_compressor_stations_proxy") +def task_get_ng_compressor_stations_proxy_data( + state_path: Path = global_data_dir_path / "tl_2020_us_state.zip", + enverus_midstream_ng_path: Path = sector_data_dir_path / "enverus/midstream/Rextag_Natural_Gas.gdb", + gb_stations_output_path: Annotated[Path, Product] = proxy_data_dir_path / "gb_stations_proxy.parquet", + storage_comp_station_output_path: Annotated[Path, Product] = proxy_data_dir_path / "storage_comp_station_proxy.parquet", + trans_comp_station_output_path: Annotated[Path, Product] = proxy_data_dir_path / "trans_comp_station_proxy.parquet", +): + """ + Creation of the following proxies using Enverus Midstream Rextag_Natural_Gas.gdb: + - gb_stations_proxy - gathering compressor stations (NG Production) + - storage_comp_station_proxy - storage compressor stations (NG Storage) + - trans_comp_station_proxy - transmission compressor stations (NG Transmission) + """ + + state_gdf = ( + gpd.read_file(state_path) + .loc[:, ["NAME", "STATEFP", "STUSPS", "geometry"]] + .rename(columns=str.lower) + .rename(columns={"stusps": "state_code", "name": "state_name"}) + .astype({"statefp": int}) + # get only lower 48 + DC + .query("(statefp < 60) & (statefp != 2) & (statefp != 15)") + .to_crs(4326) + ) + + # Enverus Midstream Natural Gas Compressor Stations + compressor_stations_gdf = (gpd.read_file( + enverus_midstream_ng_path, + layer="CompressorStations", + columns=["NAME", "TYPE", "STATUS", "STATE_NAME", "CNTRY_NAME", "geometry"]) + + .query("STATUS == 'Operational'") + .query("CNTRY_NAME == 'United States'") + .query("STATE_NAME.isin(@state_gdf['state_name'])") + .drop(columns=["STATUS", "CNTRY_NAME"]) + .rename(columns={"NAME": "facility_name", + "TYPE": "type", + "STATE_NAME": "state_name", + }) + .assign(state_code='NaN') + .to_crs(4326) + .reset_index(drop=True) + ) + + for istation in np.arange(0, 
len(compressor_stations_gdf)): + compressor_stations_gdf.loc[istation, "state_code"] = us_state_to_abbrev(compressor_stations_gdf.loc[istation, "state_name"]) + + # gb_stations_proxy + gb_stations_proxy_gdf = (compressor_stations_gdf + .query("type == 'Gathering'") + .drop(columns=["type", "state_name"]) + .loc[:, ["facility_name", "state_code", "geometry"]] + .reset_index(drop=True)) + gb_stations_proxy_gdf.to_parquet(gb_stations_output_path) + + # storage_comp_station_proxy + storage_comp_station_proxy_gdf = (compressor_stations_gdf + .query("type == 'Storage'") + .drop(columns=["type", "state_name"]) + .loc[:, ["facility_name", "state_code", "geometry"]] + .reset_index(drop=True)) + storage_comp_station_proxy_gdf.to_parquet(storage_comp_station_output_path) + + # trans_comp_station_proxy + trans_comp_station_proxy_gdf = (compressor_stations_gdf + .query("type == 'Transmission'") + .drop(columns=["type", "state_name"]) + .loc[:, ["facility_name", "state_code", "geometry"]] + .reset_index(drop=True)) + trans_comp_station_proxy_gdf.to_parquet(trans_comp_station_output_path) + + return None From 9463c68bb154112f06ca9eb211f41ca37c0f1180 Mon Sep 17 00:00:00 2001 From: Hannah Lohman <68960449+haclohman@users.noreply.github.com> Date: Fri, 25 Oct 2024 10:23:29 -0400 Subject: [PATCH 2/6] start ng_production_proxy start of the natural gas production proxy using enverus prism and di data --- environment.yml | 2 +- .../federal_gom_offshore_proxy.py | 170 +++++++++ .../task_ng_production_proxy.py | 325 ++++++++++++++++++ 3 files changed, 496 insertions(+), 1 deletion(-) create mode 100644 gch4i/proxy_processing/federal_gom_offshore_proxy.py create mode 100644 gch4i/proxy_processing/task_ng_production_proxy.py diff --git a/environment.yml b/environment.yml index 3babe75..16dfe86 100644 --- a/environment.yml +++ b/environment.yml @@ -22,7 +22,7 @@ dependencies: # - plotly # - pyjanitor - pylint - # - pyodbc # added by Nathan -- needed in Wastewater + - pyodbc # added by Nathan -- needed in Wastewater - pyprojroot # - pyshp # added by Hannah -- needed in Petroleum Systems - python-duckdb diff --git a/gch4i/proxy_processing/federal_gom_offshore_proxy.py b/gch4i/proxy_processing/federal_gom_offshore_proxy.py new file mode 100644 index 0000000..92d5b02 --- /dev/null +++ b/gch4i/proxy_processing/federal_gom_offshore_proxy.py @@ -0,0 +1,170 @@ +# %% +from pathlib import Path +import os +from typing import Annotated +from zipfile import ZipFile +import calendar +import datetime + +from pyarrow import parquet +import pandas as pd +import osgeo +import geopandas as gpd +import numpy as np +import seaborn as sns +from pytask import Product, task, mark +import pyodbc + +from gch4i.config import ( + V3_DATA_PATH, + proxy_data_dir_path, + global_data_dir_path, + sector_data_dir_path, + max_year, + min_year, +) + +from gch4i.utils import us_state_to_abbrev + +# %% +@mark.persist +@task(id="federal_gom_offshore_proxy") +def task_get_federal_gom_offshore_proxy_data( + state_path: Path = global_data_dir_path / "tl_2020_us_state.zip", + boem_data_directory_path: Path = sector_data_dir_path / "boem", + ng_output_path: Annotated[Path, Product] = proxy_data_dir_path + / "federal_gom_offshore_proxy.parquet", + oil_output_path: Annotated[Path, Product] = proxy_data_dir_path + / "oil_gom_fed_proxy.parquet", +): + """ + # TODO: + """ + + state_gdf = ( + gpd.read_file(state_path) + .loc[:, ["NAME", "STATEFP", "STUSPS", "geometry"]] + .rename(columns=str.lower) + .rename(columns={"stusps": "state_code", "name": "state_name"}) + 
.astype({"statefp": int}) + # get only lower 48 + DC + .query("(statefp < 60) & (statefp != 2) & (statefp != 15)") + .to_crs(4326) + ) + + # get and format boem gom data for 2011, 2014, 2017, and 2021 + # NOTE: 2011 has tblPointER and tblPointEM but the rest of the years have one single table of data + gom_df = {} + gom_data_years = ['2011', '2014', '2017', '2021'] + for idatayear in gom_data_years: + gom_file_name = f"{idatayear}_Gulfwide_Platform_Inventory.accdb" + gom_file_path = os.path.join(boem_data_directory_path, gom_file_name) + driver_str = r'Driver={Microsoft Access Driver (*.mdb, *.accdb)};DBQ='+gom_file_path+';''' + conn = pyodbc.connect(driver_str) + GOADS_locations = pd.read_sql("SELECT * FROM tblPointER", conn) + GOADS_emissions = pd.read_sql("SELECT * FROM tblPointEM", conn) + conn.close() + + # Format Location Data + GOADS_locations = GOADS_locations[["strStateFacilityIdentifier","strEmissionReleasePointID","dblXCoordinate","dblYCoordinate"]] + #Create platform-by-platform file + GOADS_locations_Unique = pd.DataFrame({'strStateFacilityIdentifier':GOADS_locations['strStateFacilityIdentifier'].unique()}) + GOADS_locations_Unique['lon'] = 0.0 + GOADS_locations_Unique['lat'] = 0.0 + GOADS_locations_Unique['strEmissionReleasePointID'] = '' + + for iplatform in np.arange(len(GOADS_locations_Unique)): + match_platform = np.where(GOADS_locations['strStateFacilityIdentifier'] == GOADS_locations_Unique['strStateFacilityIdentifier'][iplatform])[0][0] + GOADS_locations_Unique.loc[iplatform,'lon',] = GOADS_locations['dblXCoordinate'][match_platform] + GOADS_locations_Unique.loc[iplatform,'lat',] = GOADS_locations['dblYCoordinate'][match_platform] + GOADS_locations_Unique.loc[iplatform,'strEmissionReleasePointID'] = GOADS_locations['strEmissionReleasePointID'][match_platform][:3] + + GOADS_locations_Unique.reset_index(inplace=True, drop=True) + #display(GOADS_locations_Unique) + + #print(GOADS_emissions.columns) + #Format Emissions Data (clean lease data string) + GOADS_emissions = GOADS_emissions[["strStateFacilityIdentifier","strPollutantCode","dblEmissionNumericValue","BOEM-MONTH", + "BOEM-LEASE_NUM","BOEM-COMPLEX_ID"]] + GOADS_emissions['BOEM-LEASE_NUM'] = GOADS_emissions['BOEM-LEASE_NUM'].str.replace('OCS','') + GOADS_emissions['BOEM-LEASE_NUM'] = GOADS_emissions['BOEM-LEASE_NUM'].str.replace('-','') + GOADS_emissions['BOEM-LEASE_NUM'] = GOADS_emissions['BOEM-LEASE_NUM'].str.replace(' ','') + GOADS_emissions['BOEM-LEASE_NUM'] = GOADS_emissions['BOEM-LEASE_NUM'].str.replace('G1477','G01477') + GOADS_emissions['BOEM-LEASE_NUM'] = GOADS_emissions['BOEM-LEASE_NUM'].str.replace('G73','00073') + GOADS_emissions['BOEM-LEASE_NUM'] = GOADS_emissions['BOEM-LEASE_NUM'].str.replace('G605','00605') + GOADS_emissions['BOEM-LEASE_NUM'] = GOADS_emissions['BOEM-LEASE_NUM'].str.replace('G72','00072') + GOADS_emissions['BOEM-LEASE_NUM'] = GOADS_emissions['BOEM-LEASE_NUM'].str.replace('G599','00599') + GOADS_emissions['BOEM-LEASE_NUM'] = GOADS_emissions['BOEM-LEASE_NUM'].str.replace('G7155','G07155') + GOADS_emissions['BOEM-LEASE_NUM'] = GOADS_emissions['BOEM-LEASE_NUM'].str.replace('G2357','G02357') + GOADS_emissions['BOEM-LEASE_NUM'] = GOADS_emissions['BOEM-LEASE_NUM'].str.replace('G4921','G04921') + GOADS_emissions['Emis_tg'] = 0.0 + GOADS_emissions['Emis_tg'] = 9.0718474E-7 * GOADS_emissions['dblEmissionNumericValue'] #convert short tons to Tg + GOADS_emissions = GOADS_emissions[GOADS_emissions['strPollutantCode'] == 'CH4'] + GOADS_emissions.reset_index(inplace=True, drop=True) + + 
#display(GOADS_emissions) + + # Use ERG Preprocessed data to determine if major or minor and oil or gas + ERG_complex_crosswalk = pd.read_excel(ERG_GOADSEmissions_inputfile, sheet_name = "Complex Emissions by Source", usecols = "AJ:AM", nrows = 11143) + + # add data to map array, for the closest year to 2011 + year_diff = [abs(x - 2011) for x in year_range] + iyear = year_diff.index(min(year_diff)) + + #assign oil vs gas by lease/complex ID + GOADS_emissions['LEASE_TYPE'] ='' + GOADS_emissions['MAJOR_STRUC'] ='' + for istruc in np.arange(0,len(GOADS_emissions)): + imatch = np.where(np.logical_and(ERG_complex_crosswalk['BOEM COMPLEX ID.2']==int(GOADS_emissions['BOEM-COMPLEX_ID'][istruc]),\ + ERG_complex_crosswalk['Year.2'] == 2011)) + if np.size(imatch) >0: + imatch = imatch[0][0] + GOADS_emissions.loc[istruc,'LEASE_TYPE'] = ERG_complex_crosswalk['Oil Gas Defn FINAL.1'][imatch] + GOADS_emissions.loc[istruc,'MAJOR_STRUC'] = ERG_complex_crosswalk['Major / Minor.1'][imatch] + else: + print(istruc, GOADS_emissions['BOEM-COMPLEX_ID'][istruc]) + + # for all gas platforms, match the platform to the emissions + if GOADS_emissions['LEASE_TYPE'][istruc] =='Gas': + match_platform = np.where(GOADS_locations_Unique.strStateFacilityIdentifier==GOADS_emissions['strStateFacilityIdentifier'][istruc])[0][0] + ilat = int((GOADS_locations_Unique['lat'][match_platform] - Lat_low)/Res01) + ilon = int((GOADS_locations_Unique['lon'][match_platform] - Lon_left)/Res01) + imonth = GOADS_emissions['BOEM-MONTH'][istruc]-1 #dict is 1-12, not 0-11 + if GOADS_emissions['MAJOR_STRUC'][istruc] =='Major': + Map_GOADSmajor_emissions[ilat,ilon,iyear,imonth] += GOADS_emissions['Emis_tg'][istruc] + else: + Map_GOADSminor_emissions[ilat,ilon,iyear,imonth] += GOADS_emissions['Emis_tg'][istruc] + + + # sum complexes and emissions for diagnostic + majcplx = GOADS_emissions[(GOADS_emissions['MAJOR_STRUC']=='Major')] + majcplx = majcplx[majcplx['LEASE_TYPE'] =='Gas'] + num_majcplx = majcplx['BOEM-COMPLEX_ID'].unique() + #print(np.shape(num_majcplx)) + mincplx = GOADS_emissions[GOADS_emissions['MAJOR_STRUC']=='Minor'] + mincplx = mincplx[mincplx['LEASE_TYPE'] =='Gas'] + num_mincplx = mincplx['BOEM-COMPLEX_ID'].unique() + #print(np.size(num_mincplx)) + del GOADS_emissions + print('Number of Major Gas Complexes: ',(np.size(num_majcplx))) + print('Emissions (Tg): ',np.sum(Map_GOADSmajor_emissions[:,:,iyear,:])) + print('Number of Minor Gas Complexes: ',(np.size(num_mincplx))) + print('Emissions (Tg): ',np.sum(Map_GOADSminor_emissions[:,:,iyear,:])) + + + # Create proxy gdf + proxy_gdf = ( + gpd.GeoDataFrame( + gb_stations_df, + geometry=gpd.points_from_xy( + gb_stations_df["lon"], + gb_stations_df["lat"], + crs=4326, + ), + ) + .drop(columns=["lat", "lon"]) + .loc[:, ["facility_name", "state_code", "geometry"]] + ) + + proxy_gdf.to_parquet(output_path) + return None diff --git a/gch4i/proxy_processing/task_ng_production_proxy.py b/gch4i/proxy_processing/task_ng_production_proxy.py new file mode 100644 index 0000000..634bbff --- /dev/null +++ b/gch4i/proxy_processing/task_ng_production_proxy.py @@ -0,0 +1,325 @@ +# %% +from pathlib import Path +import os +from typing import Annotated +from zipfile import ZipFile +import calendar +import datetime + +from pyarrow import parquet +import pandas as pd +import osgeo +import geopandas as gpd +import numpy as np +import seaborn as sns +from pytask import Product, task, mark + +from gch4i.config import ( + V3_DATA_PATH, + proxy_data_dir_path, + global_data_dir_path, + sector_data_dir_path, + 
max_year,
+    min_year,
+    years,
+)
+
+from gch4i.utils import us_state_to_abbrev
+
+# %%
+@mark.persist
+@task(id="ng_production_proxy")
+def task_get_ng_production_proxy_data(
+    state_path: Path = global_data_dir_path / "tl_2020_us_state.zip",
+    nems_region_dict_path: Path = sector_data_dir_path / "enverus/NEMS_Region_Dictionary.xlsx",
+    enverus_production_path: Path = sector_data_dir_path / "enverus/production",
+    enverus_well_counts_path: Path = sector_data_dir_path / "enverus/production/temp_data_v2/Enverus DrillingInfo Processing - Well Counts_2021-03-17.xlsx",
+    output_path: Annotated[Path, Product] = proxy_data_dir_path / "gb_stations_proxy.parquet",
+    ):
+    """
+    Data come from Enverus, both Drilling Info and Prism
+    The reason 2 datasets are used is because Prism does not include all states
+    So remaining states, or those with more DI coverage, are taken from DI
+
+    DI: KS, MD, MI, MO, OK, TN
+
+    Prism: AK, AL, AR, AZ, CA, CAO (California Offshore), CO, FL, KY, LA, MS, MT, ND,
+    NE, NGOM (federal offshore waters in the Gulf of Mexico), NM, NV, NY, OH, OR, PA,
+    SD, TX, UT, VA, WV, WY
+
+    States with no Enverus Data: CT, DE, DC, GA, HI, ID, IL*, IN*, IA, ME, MA, MN, NH,
+    NJ, NC, RI, SC, VT, WA, WI, US territories. These states are assumed to have no oil
+    and gas production, with an exception for IL and IN.
+
+    *IL and IN do not report to Enverus, but do have oil and gas production. Production
+    data is taken from the Energy Information Administration (EIA).
+
+    TODO: Update enverus_well_counts_path with v3 data (currently using v2 data)
+    """
+
+    # STEP 1: Load in State ANSI data and NEMS definitions
+
+    state_gdf = (
+        gpd.read_file(state_path)
+        .loc[:, ["NAME", "STATEFP", "STUSPS", "geometry"]]
+        .rename(columns=str.lower)
+        .rename(columns={"stusps": "state_code", "name": "state_name"})
+        .astype({"statefp": int})
+        # get only lower 48 + DC
+        .query("(statefp < 60) & (statefp != 2) & (statefp != 15)")
+        .reset_index(drop=True)
+        .to_crs(4326)
+    )
+
+    # Make NEMS State classifications
+    # Treat NM and TX separately since these states cover multiple NEMS regions
+
+    # 0 = NE, 1 = MC, 2 = RM, 3 = SW, 4 = WC, 5 = GC, 6 = offshore
+    NEMS_State = pd.read_excel(nems_region_dict_path)
+    NEMS_State = NEMS_State.fillna(0)
+    NM_idx = NEMS_State.index[NEMS_State['State_Name'].str.contains('New Mexico')].tolist()
+    TX_idx = NEMS_State.index[NEMS_State['State_Name'].str.contains('Texas')].tolist()
+    idx = NM_idx + TX_idx
+    NEMS_State = NEMS_State.drop(NEMS_State.index[idx])
+    NEMS_State.reset_index(drop=True, inplace=True)
+
+    NEMS_dict = {'North East': 0, 'Midcontinent': 1, 'Rocky Mountain': 2, 'South West': 3, 'West Coast': 4, 'Gulf Coast': 5}
+
+    # STEP 2: Read-in and Format Proxy Data
+
+    # STEP 2.1: State Condensate Data
+
+    # TODO: state condensate data code
+
+    # STEP 2.2: GOADS Emissions Data
+
+    # TODO: GOADS emissions data code
+
+    # STEP 2.3: Well and Production Data (from Enverus)
+
+    # STEP 2.3.1: Read In & Combine Each Year of Prism & DI Monthly Data (from Enverus)
+
+    # Data come from Enverus, both Drilling Info and Prism
+    # The reason 2 datasets are used is because Prism does not include all states
+    # So remaining states, or those with more DI coverage, are taken from DI
+
+    # Read In and Format the Prism and DI data
+    # 1. Read Data
+    # 2. Drop unused columns, rename columns to match between DI and Prism
+    # 3. Combine DI and Prism into one data array
+    # 4. Calculate annual cumulative production totals
+    # 5. 
Save the data as a year-specific variable + + # Based on ERGs logic, active wells are determined based on their production levels and not producing status + Enverus_data_dict = {} + for iyear in years: + #DI data + DI_file_name = f"didsk_monthly_{iyear}.csv" + DI_file_path = os.path.join(enverus_production_path, DI_file_name) + DI_data = (pd.read_csv( + DI_file_path, + usecols=['WELL_COUNT_ID','STATE','COUNTY','BASIN','AAPG_CODE_ERG', + 'NEMS_REGION_ERG','LATITUDE','LONGITUDE','STATUS','COMPDATE', + 'SPUDDATE','FIRSTPRODDATE','HF','OFFSHORE','GOR', + 'GOR_QUAL','PROD_FLAG','PRODYEAR', + 'LIQ_01','GAS_01','WTR_01','LIQ_02','GAS_02','WTR_02', + 'LIQ_03','GAS_03','WTR_03','LIQ_04','GAS_04','WTR_04', + 'LIQ_05','GAS_05','WTR_05','LIQ_06','GAS_06','WTR_06', + 'LIQ_07','GAS_07','WTR_07','LIQ_08','GAS_08','WTR_08', + 'LIQ_09','GAS_09','WTR_09','LIQ_10','GAS_10','WTR_10', + 'LIQ_11','GAS_11','WTR_11','LIQ_12','GAS_12','WTR_12',], + dtype={7:'str'}) + .rename(columns={'WELL_COUNT_ID':'WELL_COUNT','STATE':'STATE_CODE', + 'NEMS_REGION_ERG':'NEMS_REGION', 'STATUS':'PRODUCING_STATUS', + 'LIQ_01':'OILPROD_01','GAS_01':'GASPROD_01','WTR_01':'WATERPROD_01', + 'LIQ_02':'OILPROD_02','GAS_02':'GASPROD_02','WTR_02':'WATERPROD_02', + 'LIQ_03':'OILPROD_03','GAS_03':'GASPROD_03','WTR_03':'WATERPROD_03', + 'LIQ_04':'OILPROD_04','GAS_04':'GASPROD_04','WTR_04':'WATERPROD_04', + 'LIQ_05':'OILPROD_05','GAS_05':'GASPROD_05','WTR_05':'WATERPROD_05', + 'LIQ_06':'OILPROD_06','GAS_06':'GASPROD_06','WTR_06':'WATERPROD_06', + 'LIQ_07':'OILPROD_07','GAS_07':'GASPROD_07','WTR_07':'WATERPROD_07', + 'LIQ_08':'OILPROD_08','GAS_08':'GASPROD_08','WTR_08':'WATERPROD_08', + 'LIQ_09':'OILPROD_09','GAS_09':'GASPROD_09','WTR_09':'WATERPROD_09', + 'LIQ_10':'OILPROD_10','GAS_10':'GASPROD_10','WTR_10':'WATERPROD_10', + 'LIQ_11':'OILPROD_11','GAS_11':'GASPROD_11','WTR_11':'WATERPROD_11', + 'LIQ_12':'OILPROD_12','GAS_12':'GASPROD_12','WTR_12':'WATERPROD_12',}) + .assign(WELL_COUNT=1) + ) + + # Prism Data + Prism_file_name = f"prism_monthly_{iyear}.csv" + Prism_file_path = os.path.join(enverus_production_path, Prism_file_name) + Prism_data = (pd.read_csv( + Prism_file_path, + usecols=['STATE','COUNTY','ENVBASIN','AAPG_CODE_ERG', + 'NEMS_REGION_ERG','LATITUDE','LONGITUDE','ENVWELLSTATUS','COMPLETIONDATE', + 'SPUDDATE','FIRSTPRODDATE','HF','OFFSHORE','GOR', + 'GOR_QUAL','PROD_FLAG','PRODYEAR', + 'LIQUIDSPROD_BBL_01','GASPROD_MCF_01','WATERPROD_BBL_01', + 'LIQUIDSPROD_BBL_02','GASPROD_MCF_02','WATERPROD_BBL_02', + 'LIQUIDSPROD_BBL_03','GASPROD_MCF_03','WATERPROD_BBL_03', + 'LIQUIDSPROD_BBL_04','GASPROD_MCF_04','WATERPROD_BBL_04', + 'LIQUIDSPROD_BBL_05','GASPROD_MCF_05','WATERPROD_BBL_05', + 'LIQUIDSPROD_BBL_06','GASPROD_MCF_06','WATERPROD_BBL_06', + 'LIQUIDSPROD_BBL_07','GASPROD_MCF_07','WATERPROD_BBL_07', + 'LIQUIDSPROD_BBL_08','GASPROD_MCF_08','WATERPROD_BBL_08', + 'LIQUIDSPROD_BBL_09','GASPROD_MCF_09','WATERPROD_BBL_09', + 'LIQUIDSPROD_BBL_10','GASPROD_MCF_10','WATERPROD_BBL_10', + 'LIQUIDSPROD_BBL_11','GASPROD_MCF_11','WATERPROD_BBL_11', + 'LIQUIDSPROD_BBL_12','GASPROD_MCF_12','WATERPROD_BBL_12',], + dtype={7:'str'}) + .rename(columns={'STATE':'STATE_CODE', 'ENVBASIN':'BASIN', + 'NEMS_REGION_ERG':'NEMS_REGION', 'ENVWELLSTATUS':'PRODUCING_STATUS', + 'COMPLETIONDATE':'COMPDATE', + 'LIQUIDSPROD_BBL_01':'OILPROD_01','GASPROD_MCF_01':'GASPROD_01','WATERPROD_BBL_01':'WATERPROD_01', + 'LIQUIDSPROD_BBL_02':'OILPROD_02','GASPROD_MCF_02':'GASPROD_02','WATERPROD_BBL_02':'WATERPROD_02', + 
'LIQUIDSPROD_BBL_03':'OILPROD_03','GASPROD_MCF_03':'GASPROD_03','WATERPROD_BBL_03':'WATERPROD_03', + 'LIQUIDSPROD_BBL_04':'OILPROD_04','GASPROD_MCF_04':'GASPROD_04','WATERPROD_BBL_04':'WATERPROD_04', + 'LIQUIDSPROD_BBL_05':'OILPROD_05','GASPROD_MCF_05':'GASPROD_05','WATERPROD_BBL_05':'WATERPROD_05', + 'LIQUIDSPROD_BBL_06':'OILPROD_06','GASPROD_MCF_06':'GASPROD_06','WATERPROD_BBL_06':'WATERPROD_06', + 'LIQUIDSPROD_BBL_07':'OILPROD_07','GASPROD_MCF_07':'GASPROD_07','WATERPROD_BBL_07':'WATERPROD_07', + 'LIQUIDSPROD_BBL_08':'OILPROD_08','GASPROD_MCF_08':'GASPROD_08','WATERPROD_BBL_08':'WATERPROD_08', + 'LIQUIDSPROD_BBL_09':'OILPROD_09','GASPROD_MCF_09':'GASPROD_09','WATERPROD_BBL_09':'WATERPROD_09', + 'LIQUIDSPROD_BBL_10':'OILPROD_10','GASPROD_MCF_10':'GASPROD_10','WATERPROD_BBL_10':'WATERPROD_10', + 'LIQUIDSPROD_BBL_11':'OILPROD_11','GASPROD_MCF_11':'GASPROD_11','WATERPROD_BBL_11':'WATERPROD_11', + 'LIQUIDSPROD_BBL_12':'OILPROD_12','GASPROD_MCF_12':'GASPROD_12','WATERPROD_BBL_12':'WATERPROD_12',}) + .assign(WELL_COUNT=1) + ) + + # Combine into one array with common column names, replace nans with zeros, and sum annual production + Enverus_data = pd.concat([DI_data, Prism_data], ignore_index=True) + Enverus_data.loc[:, Enverus_data.columns.str.contains('GASPROD_')] = Enverus_data.loc[:, Enverus_data.columns.str.contains('GASPROD_')].fillna(0) + Enverus_data.loc[:, Enverus_data.columns.str.contains('OILPROD_')] = Enverus_data.loc[:, Enverus_data.columns.str.contains('OILPROD_')].fillna(0) + Enverus_data.loc[:, Enverus_data.columns.str.contains('WATERPROD_')] = Enverus_data.loc[:, Enverus_data.columns.str.contains('WATERPROD_')].fillna(0) + + # Calculate cummulative annual production totals for Gas, Oil, Water + Enverus_data['CUM_GAS'] = Enverus_data.loc[:,Enverus_data.columns.str.contains('GASPROD_')].sum(1) + Enverus_data['CUM_OIL'] = Enverus_data.loc[:,Enverus_data.columns.str.contains('OILPROD_')].sum(1) + Enverus_data['CUM_WATER'] = Enverus_data.loc[:,Enverus_data.columns.str.contains('WATERPROD_')].sum(1) + + Enverus_data['NEMS_CODE'] = Enverus_data['NEMS_REGION'].map(NEMS_dict) + + # Save out the data for that year + Enverus_data_dict[f'{iyear}'] = Enverus_data + + del Prism_data + del DI_data #save memory space + + #define default values for a new row in this table (to be used later during data corrections) + default = {'WELL_COUNT': 0, 'STATE_CODE':'','COUNTY':'','NEMS_REGION':'UNK', + 'AAPG_CODE_ERG':'UNK','LATITUDE':0,'LONGITUDE':0, + 'PRODUCING_STATUS':'','BASIN':'','SPUDDATE':'','COMPDATE':'', + 'FIRSTPRODDATE':'','HF':'', 'OFFSHORE':'','GOR':-99, + 'GOR_QUAL':'','PROD_FLAG':'','PRODYEAR':'', + 'OILPROD_01':0, 'GASPROD_01':0, 'WATERPROD_01':0,'OILPROD_02':0, 'GASPROD_02':0, 'WATERPROD_02':0, + 'OILPROD_03':0, 'GASPROD_03':0, 'WATERPROD_03':0,'OILPROD_04':0, 'GASPROD_04':0, 'WATERPROD_04':0,\ + 'OILPROD_05':0, 'GASPROD_05':0, 'WATERPROD_05':0,'OILPROD_06':0, 'GASPROD_06':0, 'WATERPROD_06':0,\ + 'OILPROD_07':0, 'GASPROD_07':0, 'WATERPROD_07':0,'OILPROD_08':0, 'GASPROD_08':0, 'WATERPROD_08':0,\ + 'OILPROD_09':0, 'GASPROD_09':0, 'WATERPROD_09':0,'OILPROD_10':0, 'GASPROD_10':0, 'WATERPROD_10':0,\ + 'OILPROD_11':0, 'GASPROD_11':0, 'WATERPROD_11':0,'OILPROD_12':0, 'GASPROD_12':0, 'WATERPROD_12':0, + 'CUM_GAS':0, 'CUM_OIL':0, 'CUM_WATER':0, 'NEMS_CODE':99} + + # Correct the NEMS Code for missing NEMS_REGIONS + # Note OFFSHORE regions will have NaN as NEMS_Code + for iyear in years: + enverus_data_temp = Enverus_data_dict[f'{iyear}'] + list_well = 
enverus_data_temp.index[pd.isna(enverus_data_temp.loc[:, 'NEMS_REGION'])].tolist()
+        if np.size(list_well) > 0:
+            for irow in list_well:
+                match_state = np.where(NEMS_State['State_Code'] == enverus_data_temp['STATE_CODE'][irow])[0][0]
+                enverus_data_temp.loc[irow, 'NEMS_CODE'] = NEMS_State['NEMS'][match_state].astype(int)
+        Enverus_data_dict[f'{iyear}'] = enverus_data_temp.copy()
+
+    # STEP 2.3.2: Correct Enverus Data for Select States
+
+    # 1) Read In Coverage Table from State Well Counts File from ERG
+    # (specifies the first year with bad data and which years need to be corrected;
+    # all years including and after the first bad year of data need to be corrected)
+
+    ERG_StateWellCounts_LastGoodDataYear = (pd.read_excel(
+        enverus_well_counts_path,
+        sheet_name="2021 - Coverage",
+        usecols={"State", "Last Good Year"},
+        skiprows=2,
+        nrows=40)
+    )
+
+    # 2) Loop through each state and year in Enverus to determine if the data for that
+    # particular year needs to be corrected. At the moment, the only correction ERG
+    # makes to the data is to use the prior year of data if there is no new Enverus
+    # data reported for that state. If a particular state is not included for any
+    # years in the Enverus dataset, then a row of zeros is added to the Enverus table
+    # for that year.
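+    # NOTE (illustrative sketch, not part of the task; `last_good` is a
+    # hypothetical name): the nested loop below is a carry-forward correction.
+    # For a single state whose last good year is `last_good`, the same result
+    # could be written as:
+    #
+    #     good_rows = Enverus_data_dict[f'{last_good}'].query(
+    #         "STATE_CODE == @istate_code")
+    #     for yr in (y for y in years if y > last_good):
+    #         frame = Enverus_data_dict[f'{yr}']
+    #         Enverus_data_dict[f'{yr}'] = pd.concat(
+    #             [frame.query("STATE_CODE != @istate_code"), good_rows],
+    #             ignore_index=True)
+    #
+    # The explicit loop is kept because it also reports states that have no
+    # Enverus data at all.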
+    for istate in np.arange(0, len(state_gdf)):
+        correctdata = 0
+        istate_code = state_gdf['state_code'][istate]
+        lastgoodyear = ERG_StateWellCounts_LastGoodDataYear['Last Good Year'][ERG_StateWellCounts_LastGoodDataYear['State'] == istate_code].values
+        if lastgoodyear == max_year:
+            lastgoodyear = max_year + 5  # if state isn't included in correction list, don't correct any data
+
+        for iyear in years:
+            enverus_data_temp = Enverus_data_dict[f'{iyear}'].copy()
+            state_list = np.unique(enverus_data_temp['STATE_CODE'])
+            if istate_code in state_list:
+                inlist = 1
+            else:
+                inlist = 0
+            if inlist == 1 or correctdata == 1:  # if the state is included in Enverus data, or had data for at least one good year
+                # if first year, correctdata will be zero, but inlist will also be zero if no Enverus data
+                # check to see whether corrections are necessary for the given year/state
+                if iyear == (lastgoodyear):
+                    print(istate_code, iyear, 'last good year')
+                    # This is the last year of good data. Do not correct the data,
+                    # but save it so that it can be used for all following years for that state
+                    temp_data = enverus_data_temp[enverus_data_temp['STATE_CODE'] == istate_code]
+                    correctdata = 1
+                elif iyear > lastgoodyear:
+                    print(istate_code, iyear)
+                    # correct data for all years equal to and after the first bad year (remove old data first if necessary)
+                    if inlist == 1:
+                        enverus_data_temp = enverus_data_temp[enverus_data_temp['STATE_CODE'] != istate_code]
+                    enverus_data_temp = pd.concat([enverus_data_temp, temp_data], ignore_index=True)
+                    print(istate_code + ' data for ' + str(iyear) + ' were corrected with ' + str(lastgoodyear) + ' data')
+                else:
+                    # no data corrections if the current year is before the first bad year
+                    no_corrections = 1
+
+            if inlist == 0 and correctdata == 0:
+                # if there is no Enverus data for a given state, and there was no good data, add a row with default values
+                # temp_row = {'STATE': istate_code}
+                # enverus_data_temp = enverus_data_temp.append({**default, **temp_row}, ignore_index=True)
+                print(istate_code + ' has no Enverus data in the year ' + str(iyear))
+
+            # resave that year of Enverus data
+            enverus_data_temp.reset_index(drop=True, inplace=True)
+            Enverus_data_dict[f'{iyear}'] = enverus_data_temp.copy()
+
+    # STEP 2.4: Calculate Fractional Monthly Condensate Arrays
+    # (EIA condensate production (bbl) relative to producing Enverus gas wells by month
+    # in each state and region)
+
+    # TODO: fractional monthly condensate array code
+
+    # STEP 2.5: Convert Enverus Well Production Arrays and Condensate Array into Gridded
+    # Location Arrays
+
+    # clear variables
+    # del ERG_StateWellCounts_FirstBadDataYear
+    # del Prism_data
+    # del colnames
+    # del names
+    # del state_condensates
+    # del temp_data
+
+    # Make Annual gridded arrays (maps) of well data (a well will be counted every month if there is any production that year)
+    # Includes NA Gas Wells and Production onshore in the CONUS region
+    # source emissions are related to the presence of a well and its production status (no emission if no production)
+    # Details: ERG does not include a well in the national count if there is no (cumulative) oil or gas production from that well.
+    # Wells are not considered active for a given year if there is no production data that year
+    # This may cause wells that are completed but not yet producing to be dropped from the national count.
+    # ERG has developed their own logic to determine if a well is an HF well or not and that result is included in the
+    # HF variable in this dataset. 
This method does not rely on the Enverus well 'Producing Status' + # Well Type (e.g., non-associated gas well) is determined based on annual production GOR at that well (CUM OIL/ CUM GAS), + # but the prsence of a well will only be included in maps in months where monthly gas prod > From d092b2351f2746293c2b2048313239146814a038 Mon Sep 17 00:00:00 2001 From: Hannah Lohman <68960449+haclohman@users.noreply.github.com> Date: Mon, 2 Dec 2024 19:30:41 -0500 Subject: [PATCH 3/6] natural gas proxies --- .../federal_gom_offshore_proxy.py | 105 ++- .../task_ng_compressor_stations_proxy.py | 6 +- .../task_ng_production_proxy.py | 688 +++++++++++++++++- 3 files changed, 792 insertions(+), 7 deletions(-) diff --git a/gch4i/proxy_processing/federal_gom_offshore_proxy.py b/gch4i/proxy_processing/federal_gom_offshore_proxy.py index 92d5b02..3eb351d 100644 --- a/gch4i/proxy_processing/federal_gom_offshore_proxy.py +++ b/gch4i/proxy_processing/federal_gom_offshore_proxy.py @@ -49,12 +49,115 @@ def task_get_federal_gom_offshore_proxy_data( .astype({"statefp": int}) # get only lower 48 + DC .query("(statefp < 60) & (statefp != 2) & (statefp != 15)") + .reset_index(drop=True) .to_crs(4326) ) # get and format boem gom data for 2011, 2014, 2017, and 2021 # NOTE: 2011 has tblPointER and tblPointEM but the rest of the years have one single table of data - gom_df = {} + gom_df = pd.DataFrame() + + # 2011 GOADS Data + + # Read In and Format 2011 BEOM Data + gom_file_name = f"2011_Gulfwide_Platform_Inventory.accdb" + gom_file_path = os.path.join(boem_data_directory_path, gom_file_name) + driver_str = r'Driver={Microsoft Access Driver (*.mdb, *.accdb)};DBQ='+gom_file_path+';''' + conn = pyodbc.connect(driver_str) + GOADS_locations = pd.read_sql("SELECT * FROM tblPointER", conn) + GOADS_emissions = pd.read_sql("SELECT * FROM tblPointEM", conn) + conn.close() + + # Format Location Data + GOADS_locations = GOADS_locations[["strStateFacilityIdentifier","strEmissionReleasePointID","dblXCoordinate","dblYCoordinate"]] + #Create platform-by-platform file + GOADS_locations_Unique = pd.DataFrame({'strStateFacilityIdentifier':GOADS_locations['strStateFacilityIdentifier'].unique()}) + GOADS_locations_Unique['lon'] = 0.0 + GOADS_locations_Unique['lat'] = 0.0 + GOADS_locations_Unique['strEmissionReleasePointID'] = '' + + for iplatform in np.arange(len(GOADS_locations_Unique)): + match_platform = np.where(GOADS_locations['strStateFacilityIdentifier'] == GOADS_locations_Unique['strStateFacilityIdentifier'][iplatform])[0][0] + GOADS_locations_Unique.loc[iplatform,'lon',] = GOADS_locations['dblXCoordinate'][match_platform] + GOADS_locations_Unique.loc[iplatform,'lat',] = GOADS_locations['dblYCoordinate'][match_platform] + GOADS_locations_Unique.loc[iplatform,'strEmissionReleasePointID'] = GOADS_locations['strEmissionReleasePointID'][match_platform][:3] + + GOADS_locations_Unique.reset_index(inplace=True, drop=True) + #display(GOADS_locations_Unique) + + #print(GOADS_emissions.columns) + #Format Emissions Data (clean lease data string) + GOADS_emissions = GOADS_emissions[["strStateFacilityIdentifier","strPollutantCode","dblEmissionNumericValue","BOEM-MONTH", + "BOEM-LEASE_NUM","BOEM-COMPLEX_ID"]] + GOADS_emissions['BOEM-LEASE_NUM'] = GOADS_emissions['BOEM-LEASE_NUM'].str.replace('OCS','') + GOADS_emissions['BOEM-LEASE_NUM'] = GOADS_emissions['BOEM-LEASE_NUM'].str.replace('-','') + GOADS_emissions['BOEM-LEASE_NUM'] = GOADS_emissions['BOEM-LEASE_NUM'].str.replace(' ','') + GOADS_emissions['BOEM-LEASE_NUM'] = 
GOADS_emissions['BOEM-LEASE_NUM'].str.replace('G1477','G01477') + GOADS_emissions['BOEM-LEASE_NUM'] = GOADS_emissions['BOEM-LEASE_NUM'].str.replace('G73','00073') + GOADS_emissions['BOEM-LEASE_NUM'] = GOADS_emissions['BOEM-LEASE_NUM'].str.replace('G605','00605') + GOADS_emissions['BOEM-LEASE_NUM'] = GOADS_emissions['BOEM-LEASE_NUM'].str.replace('G72','00072') + GOADS_emissions['BOEM-LEASE_NUM'] = GOADS_emissions['BOEM-LEASE_NUM'].str.replace('G599','00599') + GOADS_emissions['BOEM-LEASE_NUM'] = GOADS_emissions['BOEM-LEASE_NUM'].str.replace('G7155','G07155') + GOADS_emissions['BOEM-LEASE_NUM'] = GOADS_emissions['BOEM-LEASE_NUM'].str.replace('G2357','G02357') + GOADS_emissions['BOEM-LEASE_NUM'] = GOADS_emissions['BOEM-LEASE_NUM'].str.replace('G4921','G04921') + GOADS_emissions['Emis_tg'] = 0.0 + GOADS_emissions['Emis_tg'] = 9.0718474E-7 * GOADS_emissions['dblEmissionNumericValue'] #convert short tons to Tg + GOADS_emissions = GOADS_emissions[GOADS_emissions['strPollutantCode'] == 'CH4'] + GOADS_emissions.reset_index(inplace=True, drop=True) + + #display(GOADS_emissions) + + # Use ERG Preprocessed data to determine if major or minor and oil or gas + ERG_complex_crosswalk = pd.read_excel(ERG_GOADSEmissions_inputfile, sheet_name = "Complex Emissions by Source", usecols = "AJ:AM", nrows = 11143) + #display(ERG_complex_crosswalk) + + # add data to map array, for the closest year to 2011 + year_diff = [abs(x - 2011) for x in year_range] + iyear = year_diff.index(min(year_diff)) + + #assign oil vs gas by lease/complex ID + GOADS_emissions['LEASE_TYPE'] ='' + GOADS_emissions['MAJOR_STRUC'] ='' + for istruc in np.arange(0,len(GOADS_emissions)): + imatch = np.where(np.logical_and(ERG_complex_crosswalk['BOEM COMPLEX ID.2']==int(GOADS_emissions['BOEM-COMPLEX_ID'][istruc]),\ + ERG_complex_crosswalk['Year.2'] == 2011)) + if np.size(imatch) >0: + imatch = imatch[0][0] + GOADS_emissions.loc[istruc,'LEASE_TYPE'] = ERG_complex_crosswalk['Oil Gas Defn FINAL.1'][imatch] + GOADS_emissions.loc[istruc,'MAJOR_STRUC'] = ERG_complex_crosswalk['Major / Minor.1'][imatch] + else: + print(istruc, GOADS_emissions['BOEM-COMPLEX_ID'][istruc]) + + # for all gas platforms, match the platform to the emissions + if GOADS_emissions['LEASE_TYPE'][istruc] =='Gas': + match_platform = np.where(GOADS_locations_Unique.strStateFacilityIdentifier==GOADS_emissions['strStateFacilityIdentifier'][istruc])[0][0] + ilat = int((GOADS_locations_Unique['lat'][match_platform] - Lat_low)/Res01) + ilon = int((GOADS_locations_Unique['lon'][match_platform] - Lon_left)/Res01) + imonth = GOADS_emissions['BOEM-MONTH'][istruc]-1 #dict is 1-12, not 0-11 + if GOADS_emissions['MAJOR_STRUC'][istruc] =='Major': + Map_GOADSmajor_emissions[ilat,ilon,iyear,imonth] += GOADS_emissions['Emis_tg'][istruc] + else: + Map_GOADSminor_emissions[ilat,ilon,iyear,imonth] += GOADS_emissions['Emis_tg'][istruc] + + + # sum complexes and emissions for diagnostic + majcplx = GOADS_emissions[(GOADS_emissions['MAJOR_STRUC']=='Major')] + majcplx = majcplx[majcplx['LEASE_TYPE'] =='Gas'] + num_majcplx = majcplx['BOEM-COMPLEX_ID'].unique() + #print(np.shape(num_majcplx)) + mincplx = GOADS_emissions[GOADS_emissions['MAJOR_STRUC']=='Minor'] + mincplx = mincplx[mincplx['LEASE_TYPE'] =='Gas'] + num_mincplx = mincplx['BOEM-COMPLEX_ID'].unique() + #print(np.size(num_mincplx)) + del GOADS_emissions + print('Number of Major Gas Complexes: ',(np.size(num_majcplx))) + print('Emissions (Tg): ',np.sum(Map_GOADSmajor_emissions[:,:,iyear,:])) + print('Number of Minor Gas Complexes: 
',(np.size(num_mincplx))) + print('Emissions (Tg): ',np.sum(Map_GOADSminor_emissions[:,:,iyear,:])) + + + + gom_data_years = ['2011', '2014', '2017', '2021'] for idatayear in gom_data_years: gom_file_name = f"{idatayear}_Gulfwide_Platform_Inventory.accdb" diff --git a/gch4i/proxy_processing/task_ng_compressor_stations_proxy.py b/gch4i/proxy_processing/task_ng_compressor_stations_proxy.py index cc88038..e0d4bb7 100644 --- a/gch4i/proxy_processing/task_ng_compressor_stations_proxy.py +++ b/gch4i/proxy_processing/task_ng_compressor_stations_proxy.py @@ -30,9 +30,9 @@ def task_get_ng_compressor_stations_proxy_data( state_path: Path = global_data_dir_path / "tl_2020_us_state.zip", enverus_midstream_ng_path: Path = sector_data_dir_path / "enverus/midstream/Rextag_Natural_Gas.gdb", - gb_stations_output_path: Annotated[Path, Product] = proxy_data_dir_path / "gb_stations_proxy.parquet", - storage_comp_station_output_path: Annotated[Path, Product] = proxy_data_dir_path / "storage_comp_station_proxy.parquet", - trans_comp_station_output_path: Annotated[Path, Product] = proxy_data_dir_path / "trans_comp_station_proxy.parquet", + gb_stations_output_path: Annotated[Path, Product] = proxy_data_dir_path / "ng_gb_stations_proxy.parquet", + storage_comp_station_output_path: Annotated[Path, Product] = proxy_data_dir_path / "ng_storage_comp_station_proxy.parquet", + trans_comp_station_output_path: Annotated[Path, Product] = proxy_data_dir_path / "ng_trans_comp_station_proxy.parquet", ): """ Creation of the following proxies using Enverus Midstream Rextag_Natural_Gas.gdb: diff --git a/gch4i/proxy_processing/task_ng_production_proxy.py b/gch4i/proxy_processing/task_ng_production_proxy.py index 634bbff..09fff7d 100644 --- a/gch4i/proxy_processing/task_ng_production_proxy.py +++ b/gch4i/proxy_processing/task_ng_production_proxy.py @@ -12,6 +12,7 @@ import geopandas as gpd import numpy as np import seaborn as sns +import shapefile as shp from pytask import Product, task, mark from gch4i.config import ( @@ -34,7 +35,21 @@ def task_get_ng_production_proxy_data( nems_region_dict_path: Path = sector_data_dir_path / "enverus/NEMS_Region_Dictionary.xlsx", enverus_production_path: Path = sector_data_dir_path / "enverus/production", enverus_well_counts_path: Path = sector_data_dir_path / "enverus/production/temp_data_v2/Enverus DrillingInfo Processing - Well Counts_2021-03-17.xlsx", - output_path: Annotated[Path, Product] = proxy_data_dir_path / "gb_stations_proxy.parquet", + nei_path: Path = sector_data_dir_path / "nei_og", + all_well_count_output_path: Annotated[Path, Product] = proxy_data_dir_path / "ng_all_well_count_proxy.parquet", + conv_well_count_output_path: Annotated[Path, Product] = proxy_data_dir_path / "ng_conv_well_count_proxy.parquet", + hf_well_count_output_path: Annotated[Path, Product] = proxy_data_dir_path / "ng_hf_well_count_proxy.parquet", + all_well_prod_output_path: Annotated[Path, Product] = proxy_data_dir_path / "ng_all_well_prod_proxy.parquet", + basin_220_prod_output_path: Annotated[Path, Product] = proxy_data_dir_path / "ng_basin_220_prod_proxy.parquet", + basin_395_prod_output_path: Annotated[Path, Product] = proxy_data_dir_path / "ng_basin_395_prod_proxy.parquet", + basin_430_prod_output_path: Annotated[Path, Product] = proxy_data_dir_path / "ng_basin_430_prod_proxy.parquet", + basin_other_prod_output_path: Annotated[Path, Product] = proxy_data_dir_path / "ng_basin_other_prod_proxy.parquet", + water_prod_output_path: Annotated[Path, Product] = proxy_data_dir_path / 
"ng_water_prod_proxy.parquet", + conv_well_comp_output_path: Annotated[Path, Product] = proxy_data_dir_path / "ng_conv_well_comp_proxy.parquet", + hf_well_comp_output_path: Annotated[Path, Product] = proxy_data_dir_path / "ng_hf_well_comp_proxy.parquet", + drilled_well_output_path: Annotated[Path, Product] = proxy_data_dir_path / "ng_drilled_well_proxy.parquet", + state_gom_offshore_well_count_output_path: Annotated[Path, Product] = proxy_data_dir_path / "ng_state_gom_offshore_well_count_proxy.parquet", + state_gom_offshore_well_prod_output_path: Annotated[Path, Product] = proxy_data_dir_path / "ng_state_gom_offshore_well_prod_proxy.parquet", ): """ Data come from Enverus, both Drilling Info and Prism @@ -57,6 +72,13 @@ def task_get_ng_production_proxy_data( TODO: Update enverus_well_counts_path with v3 data (currently using v2 data) """ + # Functions: + # Define safe devide to set result to zero if denominator is zero + def safe_div(x,y): + if y == 0: + return 0 + return x / y + # STEP 1: Load in State ANSI data and NEMS definitions state_gdf = ( @@ -112,6 +134,8 @@ def task_get_ng_production_proxy_data( # Based on ERGs logic, active wells are determined based on their production levels and not producing status Enverus_data_dict = {} + DI_data_dict = {} + Prism_data_dict = {} for iyear in years: #DI data DI_file_name = f"didsk_monthly_{iyear}.csv" @@ -143,8 +167,43 @@ def task_get_ng_production_proxy_data( 'LIQ_10':'OILPROD_10','GAS_10':'GASPROD_10','WTR_10':'WATERPROD_10', 'LIQ_11':'OILPROD_11','GAS_11':'GASPROD_11','WTR_11':'WATERPROD_11', 'LIQ_12':'OILPROD_12','GAS_12':'GASPROD_12','WTR_12':'WATERPROD_12',}) - .assign(WELL_COUNT=1) + .assign(WELL_COUNT=1) # TODO: Check to see if this should actually be set to 1 ) + # Format completion date (YYYY-MM) + for iwell in range(0,len(DI_data)): + comp_date = str(DI_data.loc[iwell, 'COMPDATE']) + if comp_date == 'NaN': + comp_year_month = 'NaN' + elif comp_date == 'nan': + comp_year_month = 'NaN' + else: # date format M/DD/YYYY + comp_month = f"{int(comp_date.split('/')[0]):02}" + comp_year = f"{int(comp_date.split('/')[2])}" + comp_year_month = str(comp_year)+'-'+str(comp_month) + DI_data.loc[iwell, 'comp_year_month'] = comp_year_month + # Format spud date (YYYY) + for iwell in range(0,len(DI_data)): + spud_date = str(DI_data.loc[iwell, 'SPUDDATE']) + if spud_date == 'NaN': + spud_year = 'NaN' + elif spud_date == 'nan': + spud_year = 'NaN' + else: # date format M/DD/YYYY + spud_year = f"{int(spud_date.split('/')[2])}" + spud_year = str(spud_year) + DI_data.loc[iwell, 'spud_year'] = spud_year + # Format first production date (YYYY) + for iwell in range(0,len(DI_data)): + first_prod_date = str(DI_data.loc[iwell, 'FIRSTPRODDATE']) + if first_prod_date == 'NaN': + first_prod_year = 'NaN' + elif first_prod_date == 'nan': + first_prod_year = 'NaN' + else: # date format M/DD/YYYY + first_prod_year = f"{int(first_prod_date.split('/')[2])}" + first_prod_year = str(first_prod_year) + DI_data.loc[iwell, 'first_prod_year'] = first_prod_year + DI_data_dict[f'{iyear}'] = DI_data # Prism Data Prism_file_name = f"prism_monthly_{iyear}.csv" @@ -185,6 +244,41 @@ def task_get_ng_production_proxy_data( 'LIQUIDSPROD_BBL_12':'OILPROD_12','GASPROD_MCF_12':'GASPROD_12','WATERPROD_BBL_12':'WATERPROD_12',}) .assign(WELL_COUNT=1) ) + # Format completion date (YYYY-MM) + for iwell in range(0,len(Prism_data)): + comp_date = str(Prism_data.loc[iwell, 'COMPDATE']) + if comp_date == 'NaN': + comp_year_month = 'NaN' + elif comp_date == 'nan': + comp_year_month = 'NaN' + else: 
# date format YYYY-MM-DD + comp_month = f"{int(comp_date.split('-')[1]):02}" + comp_year = f"{int(comp_date.split('-')[0])}" + comp_year_month = str(comp_year)+'-'+str(comp_month) + Prism_data.loc[iwell, 'comp_year_month'] = comp_year_month + # Format spud date (YYYY) + for iwell in range(0,len(Prism_data)): + spud_date = str(Prism_data.loc[iwell, 'SPUDDATE']) + if spud_date == 'NaN': + spud_year = 'NaN' + elif spud_date == 'nan': + spud_year = 'NaN' + else: # date format YYYY-MM-DD + spud_year = f"{int(spud_date.split('-')[0])}" + spud_year = str(spud_year) + Prism_data.loc[iwell, 'spud_year'] = spud_year + # Format first production date (YYYY) + for iwell in range(0,len(Prism_data)): + first_prod_date = str(Prism_data.loc[iwell, 'FIRSTPRODDATE']) + if first_prod_date == 'NaN': + first_prod_year = 'NaN' + elif first_prod_date == 'nan': + first_prod_year = 'NaN' + else: # date format YYYY-MM-DD + first_prod_year = f"{int(first_prod_date.split('-')[0])}" + first_prod_year = str(first_prod_year) + Prism_data.loc[iwell, 'first_prod_year'] = first_prod_year + Prism_data_dict[f'{iyear}'] = Prism_data # Combine into one array with common column names, replace nans with zeros, and sum annual production Enverus_data = pd.concat([DI_data, Prism_data], ignore_index=True) @@ -322,4 +416,592 @@ def task_get_ng_production_proxy_data( # ERG has developed their own logic to determine if a well is an HF well or not and that result is included in the # HF variable in this dataset. This method does not rely on the Enverus well 'Producing Status' # Well Type (e.g., non-associated gas well) is determined based on annual production GOR at that well (CUM OIL/ CUM GAS), - # but the prsence of a well will only be included in maps in months where monthly gas prod > + # but the presence of a well will only be included in maps in months where monthly gas prod > 0 + + # Proxy Data Dataframes: + # Well Counts + all_well_count_df = pd.DataFrame() # Active gas well (conventional + HF) counts in a given month + conv_well_count_df = pd.DataFrame() # Active conventional gas well counts in a given month + hf_well_count_df = pd.DataFrame() # Active HF gas well counts in a given month + # Well-Level Production Volumes + all_well_prod_df = pd.DataFrame() # Active gas well (conventional + HF) gas production in a given month + basin_220_prod_df = pd.DataFrame() # Gas well gas production in Basin 220 in a given month + basin_395_prod_df = pd.DataFrame() # Gas well gas production in Basin 395 in a given month + basin_430_prod_df = pd.DataFrame() # Gas well gas production in Basin 430 in a given month + basin_other_prod_df = pd.DataFrame() # Gas well gas production in Other Basins in a given month + # Water Production Volumes + water_prod_df = pd.DataFrame() + # Well Completions + conv_well_comp_df = pd.DataFrame() # Conventional gas well completions + hf_well_comp_df = pd.DataFrame() # HF gas well completions + # Drilled Gas Wells + drilled_well_df = pd.DataFrame() # Gas wells drilled + # Offshore Well Counts and Production Volumes in State Waters in the Gulf of Mexico + state_gom_offshore_well_count_df = pd.DataFrame() # Offshore state GOM gas well counts + state_gom_offshore_well_prod_df = pd.DataFrame() # Offshore state GOM gas production + + + # Query Enverus data to create dictionaries of proxy data + for iyear in years: + enverus_data_temp = Enverus_data_dict[f'{iyear}'].copy() + + # Onshore Natural Gas + ng_data_temp = (enverus_data_temp + .query("STATE_CODE.isin(@state_gdf['state_code'])") + .query("OFFSHORE == 'N'") + 
.query("CUM_GAS > 0") + .assign(gas_to_oil_ratio=lambda df: df['CUM_GAS']/df['CUM_OIL']) + .assign(year=str(iyear)) + .replace(np.inf, 0) + .query("gas_to_oil_ratio > 100 | GOR_QUAL == 'Gas only'") + ) + # Offshore Natural Gas Wells + ng_offshore_data_temp = (enverus_data_temp + .query("STATE_CODE.isin(@state_gdf['state_code'])") + .query("OFFSHORE == 'Y'") + .query("CUM_GAS > 0") + .assign(gas_to_oil_ratio=lambda df: df['CUM_GAS']/df['CUM_OIL']) + .assign(year=str(iyear)) + .replace(np.inf, 0) + .query("gas_to_oil_ratio > 100 | GOR_QUAL == 'Gas only'") + ) + + # Include wells in map only for months where there is gas production (emissions ~ when production is occuring) + for imonth in range(1,13): + imonth_str = f"{imonth:02}" # convert to 2-digit months + year_month_str = str(iyear)+'-'+imonth_str + gas_prod_str = 'GASPROD_'+imonth_str + water_prod_str = 'WATERPROD_'+imonth_str + # onshore data for imonth + ng_data_imonth_temp = (ng_data_temp + .query(f"{prod_str} > 0") + .assign(year_month=str(iyear)+'-'+imonth_str) + ) + ng_data_imonth_temp = (ng_data_imonth_temp[[ + 'year', 'year_month','STATE_CODE','AAPG_CODE_ERG','LATITUDE','LONGITUDE', + 'HF','WELL_COUNT',gas_prod_str,water_prod_str, + 'comp_year_month','spud_year','first_prod_year']] + ) + # offshore data for imonth + ng_offshore_data_imonth_temp = (ng_offshore_data_temp + .query(f"{prod_str} > 0") + .assign(year_month=str(iyear)+'-'+imonth_str) + ) + ng_data_imonth_temp = (ng_offshore_data_imonth_temp[[ + 'year','year_month','STATE_CODE','AAPG_CODE_ERG','LATITUDE','LONGITUDE', + 'HF','WELL_COUNT',gas_prod_str,water_prod_str, + 'comp_year_month','spud_year','first_prod_year']] + ) + # Well Counts + # All Gas Well Count + all_well_count_imonth = (ng_data_imonth_temp[['year','year_month','STATE_CODE','LATITUDE','LONGITUDE','WELL_COUNT']] + .rename(columns=lambda x: str(x).lower()) + .rename(columns={"well_count":"proxy_data"}) + .reset_index(drop=True) + ) + all_well_count_df = pd.concat([all_well_count_df,all_well_count_imonth]) + # Conventional Gas Well Count + conv_well_count_imonth = (ng_data_imonth_temp[['year','year_month','STATE_CODE','LATITUDE','LONGITUDE','WELL_COUNT','HF']] + .query("HF != 'Y'") + .drop(columns=["HF"]) + .rename(columns=lambda x: str(x).lower()) + .rename(columns={"well_count":"proxy_data"}) + .reset_index(drop=True) + ) + conv_well_count_df = pd.concat([conv_well_count_df,conv_well_count_imonth]) + # HF Gas Well Count + hf_well_count_imonth = (ng_data_imonth_temp[['year','year_month','STATE_CODE','LATITUDE','LONGITUDE','WELL_COUNT','HF']] + .query("HF == 'Y'") + .drop(columns=["HF"]) + .rename(columns=lambda x: str(x).lower()) + .rename(columns={"well_count":"proxy_data"}) + .reset_index(drop=True) + ) + hf_well_count_df = pd.concat([hf_well_count_df,hf_well_count_imonth]) + + # Gas Production + # All Gas Well Gas Production + all_well_prod_imonth = (ng_data_imonth_temp[['year','year_month','STATE_CODE','LATITUDE','LONGITUDE',gas_prod_str]] + .assign(proxy_data=lambda df: df[gas_prod_str]) + .drop(columns=[gas_prod_str]) + .rename(columns=lambda x: str(x).lower()) + .reset_index(drop=True) + ) + all_well_prod_df = pd.concat([all_well_prod_df,all_well_prod_imonth]) + # Basin 220 Gas Well Gas Production + basin_220_prod_imonth = (ng_data_imonth_temp[['year','year_month','STATE_CODE','LATITUDE','LONGITUDE','AAPG_CODE_ERG',gas_prod_str]] + .query("AAPG_CODE_ERG == '220'") + .assign(proxy_data=lambda df: df[gas_prod_str]) + .drop(columns=[gas_prod_str, 'AAPG_CODE_ERG']) + .rename(columns=lambda x: 
+            # Well Counts
+            # All Gas Well Count
+            all_well_count_imonth = (ng_data_imonth_temp[['year', 'year_month', 'STATE_CODE', 'LATITUDE', 'LONGITUDE', 'WELL_COUNT']]
+                                     .rename(columns=lambda x: str(x).lower())
+                                     .rename(columns={"well_count": "proxy_data"})
+                                     .reset_index(drop=True)
+                                     )
+            all_well_count_df = pd.concat([all_well_count_df, all_well_count_imonth])
+            # Conventional Gas Well Count
+            conv_well_count_imonth = (ng_data_imonth_temp[['year', 'year_month', 'STATE_CODE', 'LATITUDE', 'LONGITUDE', 'WELL_COUNT', 'HF']]
+                                      .query("HF != 'Y'")
+                                      .drop(columns=["HF"])
+                                      .rename(columns=lambda x: str(x).lower())
+                                      .rename(columns={"well_count": "proxy_data"})
+                                      .reset_index(drop=True)
+                                      )
+            conv_well_count_df = pd.concat([conv_well_count_df, conv_well_count_imonth])
+            # HF Gas Well Count
+            hf_well_count_imonth = (ng_data_imonth_temp[['year', 'year_month', 'STATE_CODE', 'LATITUDE', 'LONGITUDE', 'WELL_COUNT', 'HF']]
+                                    .query("HF == 'Y'")
+                                    .drop(columns=["HF"])
+                                    .rename(columns=lambda x: str(x).lower())
+                                    .rename(columns={"well_count": "proxy_data"})
+                                    .reset_index(drop=True)
+                                    )
+            hf_well_count_df = pd.concat([hf_well_count_df, hf_well_count_imonth])
+
+            # Gas Production
+            # All Gas Well Gas Production
+            all_well_prod_imonth = (ng_data_imonth_temp[['year', 'year_month', 'STATE_CODE', 'LATITUDE', 'LONGITUDE', gas_prod_str]]
+                                    .assign(proxy_data=lambda df: df[gas_prod_str])
+                                    .drop(columns=[gas_prod_str])
+                                    .rename(columns=lambda x: str(x).lower())
+                                    .reset_index(drop=True)
+                                    )
+            all_well_prod_df = pd.concat([all_well_prod_df, all_well_prod_imonth])
+            # Basin 220 Gas Well Gas Production
+            basin_220_prod_imonth = (ng_data_imonth_temp[['year', 'year_month', 'STATE_CODE', 'LATITUDE', 'LONGITUDE', 'AAPG_CODE_ERG', gas_prod_str]]
+                                     .query("AAPG_CODE_ERG == '220'")
+                                     .assign(proxy_data=lambda df: df[gas_prod_str])
+                                     .drop(columns=[gas_prod_str, 'AAPG_CODE_ERG'])
+                                     .rename(columns=lambda x: str(x).lower())
+                                     .reset_index(drop=True)
+                                     )
+            basin_220_prod_df = pd.concat([basin_220_prod_df, basin_220_prod_imonth])
+            # Basin 395 Gas Well Gas Production
+            basin_395_prod_imonth = (ng_data_imonth_temp[['year', 'year_month', 'STATE_CODE', 'LATITUDE', 'LONGITUDE', 'AAPG_CODE_ERG', gas_prod_str]]
+                                     .query("AAPG_CODE_ERG == '395'")
+                                     .assign(proxy_data=lambda df: df[gas_prod_str])
+                                     .drop(columns=[gas_prod_str, 'AAPG_CODE_ERG'])
+                                     .rename(columns=lambda x: str(x).lower())
+                                     .reset_index(drop=True)
+                                     )
+            basin_395_prod_df = pd.concat([basin_395_prod_df, basin_395_prod_imonth])
+            # Basin 430 Gas Well Gas Production
+            basin_430_prod_imonth = (ng_data_imonth_temp[['year', 'year_month', 'STATE_CODE', 'LATITUDE', 'LONGITUDE', 'AAPG_CODE_ERG', gas_prod_str]]
+                                     .query("AAPG_CODE_ERG == '430'")
+                                     .assign(proxy_data=lambda df: df[gas_prod_str])
+                                     .drop(columns=[gas_prod_str, 'AAPG_CODE_ERG'])
+                                     .rename(columns=lambda x: str(x).lower())
+                                     .reset_index(drop=True)
+                                     )
+            basin_430_prod_df = pd.concat([basin_430_prod_df, basin_430_prod_imonth])
+            # Other Basins Gas Well Gas Production
+            basin_other_prod_imonth = (ng_data_imonth_temp[['year', 'year_month', 'STATE_CODE', 'LATITUDE', 'LONGITUDE', 'AAPG_CODE_ERG', gas_prod_str]]
+                                       .query("AAPG_CODE_ERG != '220' & AAPG_CODE_ERG != '395' & AAPG_CODE_ERG != '430'")
+                                       .assign(proxy_data=lambda df: df[gas_prod_str])
+                                       .drop(columns=[gas_prod_str, 'AAPG_CODE_ERG'])
+                                       .rename(columns=lambda x: str(x).lower())
+                                       .reset_index(drop=True)
+                                       )
+            basin_other_prod_df = pd.concat([basin_other_prod_df, basin_other_prod_imonth])
+
+            # Water Production
+            # Data source by state is defined in the Enverus DrillingInfo Processing - Produced
+            # Water_2023-11-14_forGridding.xlsx file.
+            if iyear < 2016:  # WV uses NEI data
+                water_prod_enverus_states = ['AK', 'AL', 'AR', 'AZ', 'CA', 'CO', 'FL', 'LA',
+                                             'MI', 'MO', 'MS', 'MT', 'ND', 'NE', 'NM', 'NV',
+                                             'NY', 'OH', 'SD', 'TX', 'UT', 'VA', 'WY'
+                                             ]
+                # States using NEI for reference: ['IL','IN','KS','OK','PA','WV']
+            else:  # 2016 and beyond; WV uses Enverus data
+                water_prod_enverus_states = ['AK', 'AL', 'AR', 'AZ', 'CA', 'CO', 'FL', 'LA',
+                                             'MI', 'MO', 'MS', 'MT', 'ND', 'NE', 'NM', 'NV',
+                                             'NY', 'OH', 'SD', 'TX', 'UT', 'VA', 'WY', 'WV'
+                                             ]  # WV uses Enverus
+                # States using NEI for reference: ['IL','IN','KS','OK','PA']
+            # Enverus water production for applicable states (NEI water produced will
+            # be added in the NEI section of the code below)
+            water_prod_imonth = (ng_data_imonth_temp[['year', 'year_month', 'STATE_CODE', 'LATITUDE', 'LONGITUDE', water_prod_str]]
+                                 .query("STATE_CODE.isin(@water_prod_enverus_states)")
+                                 .assign(proxy_data=lambda df: df[water_prod_str])
+                                 .drop(columns=[water_prod_str])
+                                 .rename(columns=lambda x: str(x).lower())
+                                 .reset_index(drop=True)
+                                 )
+            water_prod_df = pd.concat([water_prod_df, water_prod_imonth])
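+            # NOTE: comp_year_month holds 'YYYY-MM' strings, so the completion
+            # filters below must quote the interpolated value; e.g. for March
+            # 2020 the query must read comp_year_month == '2020-03', not
+            # comp_year_month == 2020-03 (which pandas would parse as
+            # arithmetic). A quick check (hypothetical frame `df`):
+            #
+            #     df = pd.DataFrame({'comp_year_month': ['2020-03', '2020-04']})
+            #     df.query("comp_year_month == '2020-03'")  # returns 1 row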
+            # Well Completions
+            # Conventional Gas Well Completions
+            conv_well_comp_imonth = (ng_data_imonth_temp[['year', 'year_month', 'STATE_CODE', 'LATITUDE', 'LONGITUDE', 'WELL_COUNT', 'HF', 'comp_year_month']]
+                                     .query("HF != 'Y'")
+                                     .drop(columns=["HF"])
+                                     .rename(columns=lambda x: str(x).lower())
+                                     .rename(columns={"well_count": "proxy_data"})
+                                     .query(f"comp_year_month == '{year_month_str}'")
+                                     .drop(columns=["comp_year_month"])
+                                     .reset_index(drop=True)
+                                     )
+            conv_well_comp_df = pd.concat([conv_well_comp_df, conv_well_comp_imonth])
+
+            # HF Gas Well Completions
+            hf_well_comp_imonth = (ng_data_imonth_temp[['year', 'year_month', 'STATE_CODE', 'LATITUDE', 'LONGITUDE', 'WELL_COUNT', 'HF', 'comp_year_month']]
+                                   .query("HF == 'Y'")
+                                   .drop(columns=["HF"])
+                                   .rename(columns=lambda x: str(x).lower())
+                                   .rename(columns={"well_count": "proxy_data"})
+                                   .query(f"comp_year_month == '{year_month_str}'")
+                                   .drop(columns=["comp_year_month"])
+                                   .reset_index(drop=True)
+                                   )
+            hf_well_comp_df = pd.concat([hf_well_comp_df, hf_well_comp_imonth])
+
+            # Drilled Gas Wells
+            drilled_well_imonth = (ng_data_imonth_temp[['year', 'year_month', 'STATE_CODE', 'LATITUDE', 'LONGITUDE', 'WELL_COUNT', 'HF', 'spud_year', 'first_prod_year']]
+                                   .rename(columns=lambda x: str(x).lower())
+                                   .rename(columns={"well_count": "proxy_data"})
+                                   # wells with a spud date or first production date in the current year
+                                   .query(f"spud_year == '{iyear}' | first_prod_year == '{iyear}'")
+                                   # wells with spud_year == iyear or, if no spud date, first_prod_year == iyear
+                                   .query(f"spud_year == '{iyear}' | spud_year == 'NaN'")
+                                   .drop(columns=['hf', 'spud_year', 'first_prod_year'])
+                                   .reset_index(drop=True)
+                                   )
+            drilled_well_df = pd.concat([drilled_well_df, drilled_well_imonth])
+
+            # Offshore Well Counts and Production Volumes in State Waters in the Gulf of Mexico
+            state_gom_offshore_states = ['AL', 'FL', 'LA', 'MS', 'TX']
+            # Offshore State GOM Gas Well Counts (drawn from the offshore subset
+            # and GOM states, mirroring the production block below)
+            state_gom_offshore_well_count_imonth = (ng_offshore_data_imonth_temp[['year', 'year_month', 'STATE_CODE', 'LATITUDE', 'LONGITUDE', 'WELL_COUNT']]
+                                                    .query("STATE_CODE.isin(@state_gom_offshore_states)")
+                                                    .rename(columns=lambda x: str(x).lower())
+                                                    .rename(columns={"well_count": "proxy_data"})
+                                                    .reset_index(drop=True)
+                                                    )
+            state_gom_offshore_well_count_df = pd.concat([state_gom_offshore_well_count_df, state_gom_offshore_well_count_imonth])
+            # Offshore State GOM Gas Well Gas Production
+            state_gom_offshore_well_prod_imonth = (ng_offshore_data_imonth_temp[['year', 'year_month', 'STATE_CODE', 'LATITUDE', 'LONGITUDE', gas_prod_str]]
+                                                   .query("STATE_CODE.isin(@state_gom_offshore_states)")
+                                                   .assign(proxy_data=lambda df: df[gas_prod_str])
+                                                   .drop(columns=[gas_prod_str])
+                                                   .rename(columns=lambda x: str(x).lower())
+                                                   .reset_index(drop=True)
+                                                   )
+            state_gom_offshore_well_prod_df = pd.concat([state_gom_offshore_well_prod_df, state_gom_offshore_well_prod_imonth])
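+    # NOTE (added explanation): calc_enverus_rel_emi below normalizes each proxy
+    # within a state-year, i.e. for record i in state s and year y:
+    # rel_emi_i = x_i / sum(x_j over all j in state s, year y), so the monthly
+    # rel_emi values for a state sum to 1 over the year (or 0 if the state-year
+    # has no activity). For example:
+    #
+    #     df = pd.DataFrame({'state_code': ['TX', 'TX'], 'year': ['2020'] * 2,
+    #                        'proxy_data': [3.0, 1.0]})
+    #     calc_enverus_rel_emi(df)['rel_emi'].tolist()  # [0.75, 0.25]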
df["latitude"], + crs=4326 + ) + ) + .drop(columns=["latitude", "longitude"]) + .loc[:, ["year", "year_month", "state_code", "rel_emi", "geometry"]] + ) + return gdf + + # Well Counts + all_well_count_gdf = enverus_df_to_gdf(all_well_count_df) + conv_well_count_gdf = enverus_df_to_gdf(conv_well_count_df) + hf_well_count_gdf = enverus_df_to_gdf(hf_well_count_df) + # Well-Level Production Volumes + all_well_prod_gdf = enverus_df_to_gdf(all_well_prod_df) + basin_220_prod_gdf = enverus_df_to_gdf(basin_220_prod_df) + basin_395_prod_gdf = enverus_df_to_gdf(basin_395_prod_df) + basin_430_prod_gdf = enverus_df_to_gdf(basin_430_prod_df) + basin_other_prod_gdf = enverus_df_to_gdf(basin_other_prod_df) + # Water Production Volumes + water_prod_gdf = enverus_df_to_gdf(water_prod_df) + # Well Completions + conv_well_comp_gdf = enverus_df_to_gdf(conv_well_comp_df) + hf_well_comp_gdf = enverus_df_to_gdf(hf_well_comp_df) + # Drilled Gas Wells + drilled_well_gdf = enverus_df_to_gdf(drilled_well_df) + # Offshore Well Counts and Production Volumes in State Waters in the Gulf of Mexico + state_gom_offshore_well_count_gdf = enverus_df_to_gdf(state_gom_offshore_well_count_df) + state_gom_offshore_well_prod_gdf = enverus_df_to_gdf(state_gom_offshore_well_prod_df) + + # STEP 2.4: Well and Production Data (from NEI) + + # NEI data is used for well counts, gas well completion counts, + # gas well drilled counts, and gas production volumes for IL and IN. + + # NEI data is used for water production volumes for IL, IN, KS, OK, and PA + # as well as WV for years less than 2016. + + # FIPS codes for relevant states (each code starts with 2 distinct characters): + # IL: 17; IN: 18; KS: 20; OK: 40; PA: 42; WV: 54 + + fips_codes_df = pd.DataFrame({'state_code': ['IL', 'IN', 'KS', 'OK', 'PA', 'WV'], + 'fips_code': ['17', '18', '20', '40', '42', '54']}) + + # Function to get NEI textfile and shapefile data + def get_NEI_data(ghgi_year, data_year, file_name): + if data_year <= 2017: + # NEI textfile data (data_year <= 2017) (2011, 2014, 2016, 2017) + nei_textfile_name = f"CONUS_SA_FILES_{data_year}/{file_name}" + nei_textfile_path = os.path.join(nei_path, nei_textfile_name) + data_temp = pd.read_csv(nei_textfile_path, sep='\t', skiprows = 25) + data_temp = data_temp.drop(["!"], axis=1) + data_temp.columns = ['Code','FIPS','COL','ROW','Frac','Abs','FIPS_Total','FIPS_Running_Sum'] + data_temp = data_temp.astype({"FIPS": str}) + # if water production data (gas: 6832, oil: 6833) + if file_name == 'USA_6832_NOFILL.txt' or file_name == 'USA_6833_NOFILL.txt': + if data_year < 2016: + data_temp = (data_temp + # query states: IL, IN, KS, OK, PA, WV + .query("FIPS.str.startswith('17') | FIPS.str.startswith('18') | FIPS.str.startswith('20') | FIPS.str.startswith('40') | FIPS.str.startswith('42') | FIPS.str.startswith('54')") + .reset_index(drop=True) + ) + colmax = data_temp['COL'].max() + colmin = data_temp['COL'].min() + rowmax = data_temp['ROW'].max() + rowmin = data_temp['ROW'].min() + else: + data_temp = (data_temp + # query states: IL, IN, KS, OK, PA + .query("FIPS.str.startswith('17') | FIPS.str.startswith('18') | FIPS.str.startswith('20') | FIPS.str.startswith('40') | FIPS.str.startswith('42')") + .reset_index(drop=True) + ) + colmax = data_temp['COL'].max() + colmin = data_temp['COL'].min() + rowmax = data_temp['ROW'].max() + rowmin = data_temp['ROW'].min() + # non-water production proxies (IL, IN) + else: + data_temp = (data_temp + # query states: IL, IN + .query("FIPS.str.startswith('17') | FIPS.str.startswith('18')") + 
.reset_index(drop=True) + ) + colmax = data_temp['COL'].max() + colmin = data_temp['COL'].min() + rowmax = data_temp['ROW'].max() + rowmin = data_temp['ROW'].min() + # NEI reference grid shapefile with lat/lon locations + nei_reference_grid_path = os.path.join(nei_path, "NEI_Reference_Grid_LCC_to_WGS84_latlon.shp") + nei_reference_grid = (gpd.read_file(nei_reference_grid_path) + .to_crs(4326)) + nei_reference_grid = (nei_reference_grid + .assign(cellid_column = nei_reference_grid.cellid.astype(str).str[0:4].astype(int)) + .assign(cellid_row = nei_reference_grid.cellid.astype(str).str[5:].astype(int)) + .query(f"cellid_column <= {colmax} & cellid_column >= {colmin}") + .query(f"cellid_row <= {rowmax} & cellid_row >= {rowmin}") + .reset_index(drop=True) + ) + # Match lat/lon locations from reference grid to nei data + for idx in np.arange(0,len(data_temp)): + # Add in lat/lon + icol = data_temp['COL'][idx] + irow = data_temp['ROW'][idx] + match = np.where((icol == nei_reference_grid.loc[:,'cellid_column']) & (irow == nei_reference_grid.loc[:,'cellid_row']))[0][0] + match = int(match) + # data_temp.loc[idx,'Lat'] = nei_reference_grid.loc[match, 'Latitude'] + # data_temp.loc[idx,'Lon'] = nei_reference_grid.loc[match, 'Longitude'] + data_temp.loc[idx,'geometry'] = nei_reference_grid.loc[match, 'geometry'] + # Add in state_code + ifips = data_temp.loc[idx,'FIPS'][0:2] + data_temp.loc[idx,'state_code'] = fips_codes_df.loc[np.where(ifips == fips_codes_df.loc[:, 'fips_code'])[0][0],'state_code'] + data_temp = data_temp[['state_code', 'Abs', 'geometry']] + data_temp = data_temp.rename(columns={'Abs':'activity_data'}) + + else: + # NEI shapefile data (data_year > 2017) (2018, 2019, 2021, 2022) + state_geometries = state_gdf[["state_code","geometry"]] + nei_file_name = f"CONUS_SA_FILES_{data_year}" + nei_file_path = os.path.join(nei_path, nei_file_name) + data_temp = gpd.read_file(nei_file_path, layer=file_name) + data_temp = data_temp.to_crs(4326) + data_temp = gpd.tools.sjoin(data_temp, state_gdf, how="left") + + # water production data (IL, IN, KS, OK, PA) + if file_name == 'PRODUCED_WATER_GAS' or file_name == '_6832' or file_name == 'ProducedWaterGasWells': + states_to_query = ['IL', 'IN', 'KS', 'OK', 'PA'] + # non-water production proxies (IL, IN) + else: + states_to_query = ['IL', 'IN'] + + # query relevant states + data_temp = data_temp.query('state_code.isin(@states_to_query)') + + # grab activity data depending on column name (changes by year) + if data_year == 2018 or data_year == 2019 or data_year == 2020: + data_temp = data_temp[['state_code', 'ACTIVITY', 'geometry']] + data_temp = data_temp.rename(columns={'ACTIVITY':'activity_data'}) + if data_year == 2021: + data_temp = data_temp[['state_code', 'GRID_AC', 'geometry']] + data_temp = data_temp.rename(columns={'GRID_AC':'activity_data'}) + if data_year == 2022: + data_temp = data_temp[['state_code', 'GRID_ACTIV', 'geometry']] + data_temp = data_temp.rename(columns={'GRID_ACTIV':'activity_data'}) + + # convert activity data to relative emissions (idata / sum(state data)) + data_temp['rel_emi'] = data_temp.groupby(["state_code"])['activity_data'].transform(lambda x: x / x.sum() if x.sum() > 0 else 0) + monthly_data_temp = data_temp.copy() + monthly_data_temp['rel_emi'] = monthly_data_temp['rel_emi'] * 1/12 + monthly_data_temp = monthly_data_temp.drop(columns='activity_data') + + # convert proxy data to monthly (assume 1/12 of annual proxy is assigned to each month) + nei_proxy_data = pd.DataFrame() + for imonth in range(1,13): + imonth_str = 
f"{imonth:02}" # convert to 2-digit months + data_temp_imonth = monthly_data_temp.copy() + data_temp_imonth = data_temp_imonth.assign(year_month=str(ghgi_year)+'-'+imonth_str) + nei_proxy_data = pd.concat([nei_proxy_data,data_temp_imonth]) + nei_proxy_data = nei_proxy_data.assign(year=ghgi_year) + nei_proxy_data = (nei_proxy_data[['year', 'year_month', 'state_code', 'rel_emi', 'geometry']] + .reset_index(drop=True) + ) + return nei_proxy_data + + # NEI data year assignments + # All years use the data affiliated with their year except the following exceptions: + # 2012: use 2011 data + # 2013: use 2014 data + # 2015: use 2014 data + # 2016: use 2017 data + nei_data_years = pd.DataFrame({'year': [2012, + 2013, + 2014, + 2015, + 2016, + 2017, + 2018, + 2019, + 2020, + 2021, + 2022], + 'nei_data': [2011, + 2014, + 2014, + 2014, + 2017, + 2017, + 2018, + 2019, + 2020, + 2021, + 2022]}) + + # NEI Data Dataframes: + # Well Counts + nei_all_well_count_df = pd.DataFrame() # Active gas well (conventional + HF) counts in a given month + nei_conv_well_count_df = pd.DataFrame() # Active conventional gas well counts in a given month + nei_hf_well_count_df = pd.DataFrame() # Active HF gas well counts in a given month + # Well-Level Production Volumes + nei_all_well_prod_df = pd.DataFrame() # Active gas well (conventional + HF) gas production in a given month + nei_basin_other_prod_df = pd.DataFrame() # Gas well gas production in Other Basins in a given month + # Water Production Volumes + nei_water_prod_df = pd.DataFrame() + # Well Completions + nei_conv_well_comp_df = pd.DataFrame() # Conventional gas well completions + nei_hf_well_comp_df = pd.DataFrame() # HF gas well completions + # Drilled Gas Wells + nei_drilled_well_df = pd.DataFrame() # Gas wells drilled + + # NEI text file and shapefile names: + # Well Counts + well_count_file_names = pd.DataFrame({ + 'data_year': [2011, 2014, 2017, + 2018, 2019, 2020, 2021, 2022], + 'file_name': ['USA_698_NOFILL.txt', 'USA_698_NOFILL.txt', 'USA_698_NOFILL.txt', + 'GAS_WELLS', 'GAS_WELLS', 'GAS_WELL', '_698', 'GasWells'], + }) + # Well-Level Production Volumes + gas_prod_file_names = pd.DataFrame({ + 'data_year': [2011, 2014, 2017, + 2018, 2019, 2020, 2021, 2022], + 'file_name': ['USA_696_NOFILL.txt', 'USA_696_NOFILL.txt', 'USA_696_NOFILL.txt', + 'GAS_PRODUCTION', 'GAS_PRODUCTION', 'GAS_PRODUCTION', '_696', 'GasProduction'], + }) + # Water Production Volumes + water_prod_file_names = pd.DataFrame({ + 'data_year': [2011, 2014, 2017, + 2018, 2019, 2020, 2021, 2022], + 'file_name': ['USA_6832_NOFILL.txt', 'USA_6832_NOFILL.txt', 'USA_6832698_NOFILL.txt', + 'PRODUCED_WATER_GAS', 'PRODUCED_WATER_GAS', 'PRODUCED_WATER_GAS', '_6832', 'ProducedWaterGasWells'], + }) + # Well Completions + comp_count_file_names = pd.DataFrame({ + 'data_year': [2011, 2014, 2017, + 2018, 2019, 2020, 2021, 2022], + 'file_name': ['USA_678_NOFILL.txt', 'USA_678_NOFILL.txt', 'USA_678_NOFILL.txt', + 'COMPLETIONS_GAS', 'COMPLETIONS_GAS', 'COMPLETIONS_GAS', '_678', 'GasWellCompletions'], + }) + # Drilled Gas Wells + spud_count_file_names = pd.DataFrame({ + 'data_year': [2011, 2014, 2017, + 2018, 2019, 2020, 2021, 2022], + 'file_name': ['USA_671_NOFILL.txt', 'USA_671_NOFILL.txt', 'USA_671_NOFILL.txt', + 'SPUD_GAS', 'SPUD_GAS', 'SPUD_GAS', '_671', 'SpudCountGasWells'], + }) + + + def get_nei_file_name(nei_data_year, nei_file_names): + nei_file_name = nei_file_names[nei_file_names['data_year'] == nei_data_year]['file_name'].values[0] + return nei_file_name + + + for iyear in years: + nei_data_year = 
+        # Well Count
+        ifile_name = get_nei_file_name(nei_data_year, well_count_file_names)
+        nei_all_well_count_iyear = get_NEI_data(iyear, nei_data_year, ifile_name)
+        nei_all_well_count_df = pd.concat([nei_all_well_count_df, nei_all_well_count_iyear])
+        # Gas Production
+        ifile_name = get_nei_file_name(nei_data_year, gas_prod_file_names)
+        nei_all_well_prod_iyear = get_NEI_data(iyear, nei_data_year, ifile_name)
+        nei_all_well_prod_df = pd.concat([nei_all_well_prod_df, nei_all_well_prod_iyear])
+        # Water Production
+        ifile_name = get_nei_file_name(nei_data_year, water_prod_file_names)
+        nei_water_prod_iyear = get_NEI_data(iyear, nei_data_year, ifile_name)
+        nei_water_prod_df = pd.concat([nei_water_prod_df, nei_water_prod_iyear])
+        # Completions Count
+        ifile_name = get_nei_file_name(nei_data_year, comp_count_file_names)
+        nei_conv_well_comp_iyear = get_NEI_data(iyear, nei_data_year, ifile_name)
+        nei_conv_well_comp_df = pd.concat([nei_conv_well_comp_df, nei_conv_well_comp_iyear])
+        # Spud Count
+        ifile_name = get_nei_file_name(nei_data_year, spud_count_file_names)
+        nei_drilled_well_iyear = get_NEI_data(iyear, nei_data_year, ifile_name)
+        nei_drilled_well_df = pd.concat([nei_drilled_well_df, nei_drilled_well_iyear])
+
+    # Copy Data to Other Dataframes
+    nei_conv_well_count_df = nei_all_well_count_df.copy()
+    nei_hf_well_count_df = nei_all_well_count_df.copy()
+    nei_basin_other_prod_df = nei_all_well_prod_df.copy()
+    nei_hf_well_comp_df = nei_conv_well_comp_df.copy()
+
+    # Add NEI Data to Enverus Data
+    # Well Counts
+    all_well_count_gdf = pd.concat([all_well_count_gdf, nei_all_well_count_df]).reset_index(drop=True)
+    conv_well_count_gdf = pd.concat([conv_well_count_gdf, nei_conv_well_count_df]).reset_index(drop=True)
+    hf_well_count_gdf = pd.concat([hf_well_count_gdf, nei_hf_well_count_df]).reset_index(drop=True)
+    # Well-Level Production Volumes
+    all_well_prod_gdf = pd.concat([all_well_prod_gdf, nei_all_well_prod_df]).reset_index(drop=True)
+    basin_220_prod_gdf = basin_220_prod_gdf.reset_index(drop=True)  # No IL/IN data to add
+    basin_395_prod_gdf = basin_395_prod_gdf.reset_index(drop=True)  # No IL/IN data to add
+    basin_430_prod_gdf = basin_430_prod_gdf.reset_index(drop=True)  # No IL/IN data to add
+    basin_other_prod_gdf = pd.concat([basin_other_prod_gdf, nei_basin_other_prod_df]).reset_index(drop=True)
+    # Water Production Volumes
+    water_prod_gdf = pd.concat([water_prod_gdf, nei_water_prod_df]).reset_index(drop=True)
+    # Well Completions
+    conv_well_comp_gdf = pd.concat([conv_well_comp_gdf, nei_conv_well_comp_df]).reset_index(drop=True)
+    hf_well_comp_gdf = pd.concat([hf_well_comp_gdf, nei_hf_well_comp_df]).reset_index(drop=True)
+    # Drilled Gas Wells
+    drilled_well_gdf = pd.concat([drilled_well_gdf, nei_drilled_well_df]).reset_index(drop=True)
+    # Offshore Well Counts and Production Volumes in State Waters in the Gulf of Mexico
+    state_gom_offshore_well_count_gdf = state_gom_offshore_well_count_gdf.reset_index(drop=True)  # No IL/IN data to add
+    state_gom_offshore_well_prod_gdf = state_gom_offshore_well_prod_gdf.reset_index(drop=True)  # No IL/IN data to add
+
+    # Output Proxy Parquet Files
+    all_well_count_gdf.to_parquet(all_well_count_output_path)
+    conv_well_count_gdf.to_parquet(conv_well_count_output_path)
+    hf_well_count_gdf.to_parquet(hf_well_count_output_path)
+    all_well_prod_gdf.to_parquet(all_well_prod_output_path)
+    basin_220_prod_gdf.to_parquet(basin_220_prod_output_path)
+    basin_395_prod_gdf.to_parquet(basin_395_prod_output_path)
+    basin_430_prod_gdf.to_parquet(basin_430_prod_output_path)
+    basin_other_prod_gdf.to_parquet(basin_other_prod_output_path)
+    water_prod_gdf.to_parquet(water_prod_output_path)
+    conv_well_comp_gdf.to_parquet(conv_well_comp_output_path)
+    hf_well_comp_gdf.to_parquet(hf_well_comp_output_path)
+    drilled_well_gdf.to_parquet(drilled_well_output_path)
+    state_gom_offshore_well_count_gdf.to_parquet(state_gom_offshore_well_count_output_path)
+    state_gom_offshore_well_prod_gdf.to_parquet(state_gom_offshore_well_prod_output_path)
+    return None
+
+
+
+

From 062cafb3e19c7fa34ca3cc4657ff6209ae9b91fe Mon Sep 17 00:00:00 2001
From: Hannah Lohman <68960449+haclohman@users.noreply.github.com>
Date: Wed, 4 Dec 2024 18:47:17 -0500
Subject: [PATCH 4/6] Updating industrial landfills proxy to rel_emi

---
 .../task_industrial_landfills_proxy.py | 11 +++++++++++
 1 file changed, 11 insertions(+)

diff --git a/gch4i/proxy_processing/task_industrial_landfills_proxy.py b/gch4i/proxy_processing/task_industrial_landfills_proxy.py
index 5e75ffa..0e9e88b 100644
--- a/gch4i/proxy_processing/task_industrial_landfills_proxy.py
+++ b/gch4i/proxy_processing/task_industrial_landfills_proxy.py
@@ -99,6 +99,9 @@ def task_get_reporting_industrial_landfills_pulp_paper_proxy_data(
         .loc[:, ["facility_name", "state_code", "geometry", "year", "ch4_kt"]]
     )
 
+    reporting_pulp_paper_gdf['rel_emi'] = reporting_pulp_paper_gdf.groupby(["state_code", "year"])['ch4_kt'].transform(lambda x: x / x.sum() if x.sum() > 0 else 0)
+    reporting_pulp_paper_gdf = reporting_pulp_paper_gdf.drop(columns='ch4_kt')
+
     reporting_pulp_paper_gdf.to_parquet(reporting_pulp_paper_proxy_output_path)
 
     return None
@@ -270,6 +273,9 @@ def task_get_nonreporting_industrial_landfills_pulp_paper_proxy_data(
         .loc[:, ["state_code", "geometry", "ch4_kt"]]
     )
 
+    nonreporting_pulp_paper_gdf['rel_emi'] = nonreporting_pulp_paper_gdf.groupby(["state_code"])['ch4_kt'].transform(lambda x: x / x.sum() if x.sum() > 0 else 0)
+    nonreporting_pulp_paper_gdf = nonreporting_pulp_paper_gdf.drop(columns='ch4_kt')
+
     nonreporting_pulp_paper_gdf.to_parquet(nonreporting_pulp_paper_proxy_output_path)
 
     return None
@@ -340,6 +346,9 @@ def task_get_reporting_industrial_landfills_food_beverage_proxy_data(
         .loc[:, ["facility_id", "facility_name", "state_code", "geometry", "year", "ch4_kt"]]
     )
 
+    reporting_food_beverage_gdf['rel_emi'] = reporting_food_beverage_gdf.groupby(["state_code", "year"])['ch4_kt'].transform(lambda x: x / x.sum() if x.sum() > 0 else 0)
+    reporting_food_beverage_gdf = reporting_food_beverage_gdf.drop(columns='ch4_kt')
+
     reporting_food_beverage_gdf.to_parquet(reporting_food_beverage_proxy_output_path)
 
     return None
@@ -606,6 +615,8 @@ def task_get_nonreporting_industrial_landfills_food_beverage_proxy_data(
                       "ghgrp_match", "FRS_match", "geo_match"])
         .loc[:, ["facility_id", "state_code", "geometry", "avg_waste_t"]]
     )
+    nonreporting_food_beverage_gdf['rel_emi'] = nonreporting_food_beverage_gdf.groupby(["state_code"])['avg_waste_t'].transform(lambda x: x / x.sum() if x.sum() > 0 else 0)
+    nonreporting_food_beverage_gdf = nonreporting_food_beverage_gdf.drop(columns='avg_waste_t')
 
     nonreporting_food_beverage_gdf.to_parquet(nonreporting_food_beverage_proxy_output_path)
 
     return None
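All of the rel_emi columns introduced in the patch above follow the same groupby/transform idiom used throughout the proxy code: each facility gets its share of its group's total, and an all-zero group maps to 0 rather than NaN. A minimal sketch of the pattern, with invented sample values:

    import pandas as pd

    df = pd.DataFrame({
        "state_code": ["GA", "GA", "AL"],
        "year": [2020, 2020, 2020],
        "ch4_kt": [2.0, 6.0, 0.0],
    })

    # facility share of the state-year total; a zero-sum group yields 0, not NaN,
    # so state-years with no reported emissions drop out of the gridding cleanly
    df["rel_emi"] = (
        df.groupby(["state_code", "year"])["ch4_kt"]
        .transform(lambda x: x / x.sum() if x.sum() > 0 else 0)
    )
    df = df.drop(columns="ch4_kt")
    # GA rows become 0.25 and 0.75; the lone AL row becomes 0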
From 0ff005b19f88b56008426e8cf2af4084c0f3c154 Mon Sep 17 00:00:00 2001
From: Hannah Lohman <68960449+haclohman@users.noreply.github.com>
Date: Wed, 4 Dec 2024 18:47:50 -0500
Subject: [PATCH 5/6] Final federal gom offshore proxy for oil and gas
 production

---
 .../federal_gom_offshore_proxy.py | 486 +++++++++++-------
 1 file changed, 293 insertions(+), 193 deletions(-)

diff --git a/gch4i/proxy_processing/federal_gom_offshore_proxy.py b/gch4i/proxy_processing/federal_gom_offshore_proxy.py
index 3eb351d..31468c8 100644
--- a/gch4i/proxy_processing/federal_gom_offshore_proxy.py
+++ b/gch4i/proxy_processing/federal_gom_offshore_proxy.py
@@ -22,6 +22,7 @@
     sector_data_dir_path,
     max_year,
     min_year,
+    years,
 )
 
 from gch4i.utils import us_state_to_abbrev
@@ -31,38 +32,53 @@
 @task(id="federal_gom_offshore_proxy")
 def task_get_federal_gom_offshore_proxy_data(
     state_path: Path = global_data_dir_path / "tl_2020_us_state.zip",
-    boem_data_directory_path: Path = sector_data_dir_path / "boem",
-    ng_output_path: Annotated[Path, Product] = proxy_data_dir_path
-    / "federal_gom_offshore_proxy.parquet",
-    oil_output_path: Annotated[Path, Product] = proxy_data_dir_path
-    / "oil_gom_fed_proxy.parquet",
+    GOADS_11_path: Path = sector_data_dir_path / "boem" / "2011_Gulfwide_Platform_Inventory.accdb",
+    GOADS_14_path: Path = sector_data_dir_path / "boem" / "2014_Gulfwide_Platform_Inventory.accdb",
+    GOADS_17_path: Path = sector_data_dir_path / "boem" / "2017_Gulfwide_Platform_Inventory.accdb",
+    ERG_GOADSEmissions_path: Path = sector_data_dir_path / "boem" / "BOEM GEI Emissions Data_EmissionSource_2020-03-11.xlsx",
+    ng_output_path: Annotated[Path, Product] = proxy_data_dir_path / "ng_federal_gom_offshore_proxy.parquet",
+    oil_output_path: Annotated[Path, Product] = proxy_data_dir_path / "oil_federal_gom_offshore_proxy.parquet",
 ):
     """
     # TODO:
     """
 
-    state_gdf = (
-        gpd.read_file(state_path)
-        .loc[:, ["NAME", "STATEFP", "STUSPS", "geometry"]]
-        .rename(columns=str.lower)
-        .rename(columns={"stusps": "state_code", "name": "state_name"})
-        .astype({"statefp": int})
-        # get only lower 48 + DC
-        .query("(statefp < 60) & (statefp != 2) & (statefp != 15)")
-        .reset_index(drop=True)
-        .to_crs(4326)
-    )
+    # Get and format BOEM GOM data for 2011, 2014, and 2017
 
-    # get and format boem gom data for 2011, 2014, 2017, and 2021
-    # NOTE: 2011 has tblPointER and tblPointEM but the rest of the years have one single table of data
-    gom_df = pd.DataFrame()
+    # GOADS data year assignments
+    # 2011 data: 2012
+    # 2014 data: 2013, 2014, 2015
+    # 2017 data: 2016-2022
+    # 2021 data: NOT USED BY GHGI TEAM YET - CHECK FOR V4
 
-    # 2011 GOADS Data
+    federal_gom_offshore_data_years = pd.DataFrame(
+        {'year': [2012, 2013, 2014, 2015, 2016, 2017, 2018, 2019, 2020, 2021, 2022],
+         'goads_data': [2011, 2014, 2014, 2014, 2017, 2017, 2017, 2017, 2017, 2017, 2017]
+         })
+
+    # Use ERG Preprocessed data to determine if oil or gas
+    ERG_complex_crosswalk = (pd.read_excel(
+        ERG_GOADSEmissions_path,
+        sheet_name = "Complex Emissions by Source",
+        usecols = "AJ:AM",
+        nrows = 11143)
+        .rename(columns={"Year.2": "year",
+                         "BOEM COMPLEX ID.2": "boem_complex_id",
+                         "Oil Gas Defn FINAL.1": "oil_gas_defn",
+                         "Major / Minor.1": "major_minor"})
+        .query("year == 2011 | year == 2014 | year == 2017")
+        .astype({"boem_complex_id": int})
+        .drop(columns="major_minor") # no longer separating major vs.
minor in v3 + .replace('', np.nan) + .dropna() + .reset_index(drop=True) + ) + # 2011 GOADS Data # Read In and Format 2011 BEOM Data - gom_file_name = f"2011_Gulfwide_Platform_Inventory.accdb" - gom_file_path = os.path.join(boem_data_directory_path, gom_file_name) - driver_str = r'Driver={Microsoft Access Driver (*.mdb, *.accdb)};DBQ='+gom_file_path+';''' + GOADS_11_inputfile = str(GOADS_11_path) + driver_str = r'Driver={Microsoft Access Driver (*.mdb, *.accdb)};DBQ='+GOADS_11_inputfile+';''' conn = pyodbc.connect(driver_str) GOADS_locations = pd.read_sql("SELECT * FROM tblPointER", conn) GOADS_emissions = pd.read_sql("SELECT * FROM tblPointEM", conn) @@ -70,7 +86,7 @@ def task_get_federal_gom_offshore_proxy_data( # Format Location Data GOADS_locations = GOADS_locations[["strStateFacilityIdentifier","strEmissionReleasePointID","dblXCoordinate","dblYCoordinate"]] - #Create platform-by-platform file + # Create platform-by-platform file GOADS_locations_Unique = pd.DataFrame({'strStateFacilityIdentifier':GOADS_locations['strStateFacilityIdentifier'].unique()}) GOADS_locations_Unique['lon'] = 0.0 GOADS_locations_Unique['lat'] = 0.0 @@ -82,192 +98,276 @@ def task_get_federal_gom_offshore_proxy_data( GOADS_locations_Unique.loc[iplatform,'lat',] = GOADS_locations['dblYCoordinate'][match_platform] GOADS_locations_Unique.loc[iplatform,'strEmissionReleasePointID'] = GOADS_locations['strEmissionReleasePointID'][match_platform][:3] - GOADS_locations_Unique.reset_index(inplace=True, drop=True) - #display(GOADS_locations_Unique) - - #print(GOADS_emissions.columns) - #Format Emissions Data (clean lease data string) - GOADS_emissions = GOADS_emissions[["strStateFacilityIdentifier","strPollutantCode","dblEmissionNumericValue","BOEM-MONTH", - "BOEM-LEASE_NUM","BOEM-COMPLEX_ID"]] - GOADS_emissions['BOEM-LEASE_NUM'] = GOADS_emissions['BOEM-LEASE_NUM'].str.replace('OCS','') - GOADS_emissions['BOEM-LEASE_NUM'] = GOADS_emissions['BOEM-LEASE_NUM'].str.replace('-','') - GOADS_emissions['BOEM-LEASE_NUM'] = GOADS_emissions['BOEM-LEASE_NUM'].str.replace(' ','') - GOADS_emissions['BOEM-LEASE_NUM'] = GOADS_emissions['BOEM-LEASE_NUM'].str.replace('G1477','G01477') - GOADS_emissions['BOEM-LEASE_NUM'] = GOADS_emissions['BOEM-LEASE_NUM'].str.replace('G73','00073') - GOADS_emissions['BOEM-LEASE_NUM'] = GOADS_emissions['BOEM-LEASE_NUM'].str.replace('G605','00605') - GOADS_emissions['BOEM-LEASE_NUM'] = GOADS_emissions['BOEM-LEASE_NUM'].str.replace('G72','00072') - GOADS_emissions['BOEM-LEASE_NUM'] = GOADS_emissions['BOEM-LEASE_NUM'].str.replace('G599','00599') - GOADS_emissions['BOEM-LEASE_NUM'] = GOADS_emissions['BOEM-LEASE_NUM'].str.replace('G7155','G07155') - GOADS_emissions['BOEM-LEASE_NUM'] = GOADS_emissions['BOEM-LEASE_NUM'].str.replace('G2357','G02357') - GOADS_emissions['BOEM-LEASE_NUM'] = GOADS_emissions['BOEM-LEASE_NUM'].str.replace('G4921','G04921') - GOADS_emissions['Emis_tg'] = 0.0 - GOADS_emissions['Emis_tg'] = 9.0718474E-7 * GOADS_emissions['dblEmissionNumericValue'] #convert short tons to Tg - GOADS_emissions = GOADS_emissions[GOADS_emissions['strPollutantCode'] == 'CH4'] - GOADS_emissions.reset_index(inplace=True, drop=True) - - #display(GOADS_emissions) + GOADS_locations_Unique = (GOADS_locations_Unique + .drop(columns='strEmissionReleasePointID') + .replace('', np.nan) + .dropna() + .reset_index(drop=True)) - # Use ERG Preprocessed data to determine if major or minor and oil or gas - ERG_complex_crosswalk = pd.read_excel(ERG_GOADSEmissions_inputfile, sheet_name = "Complex Emissions by Source", usecols = 
"AJ:AM", nrows = 11143) - #display(ERG_complex_crosswalk) + # Format Emissions Data (clean lease data string) + GOADS_emissions = GOADS_emissions[["strStateFacilityIdentifier","strPollutantCode", + "dblEmissionNumericValue","BOEM-MONTH", + "BOEM-COMPLEX_ID"]] + GOADS_emissions = (GOADS_emissions + .query("strPollutantCode == 'CH4'") + .assign(Emis_tg = 0.0) + .assign(Emis_tg = lambda df: 9.0718474E-7 * df['dblEmissionNumericValue']) #convert short tons to Tg + .rename(columns={"BOEM-COMPLEX_ID": "boem_complex_id"}) + .astype({"boem_complex_id": int}) + .drop(columns={"strPollutantCode", "dblEmissionNumericValue"}) + .replace('', np.nan) + .dropna() + .reset_index(drop=True) + ) - # add data to map array, for the closest year to 2011 - year_diff = [abs(x - 2011) for x in year_range] - iyear = year_diff.index(min(year_diff)) + # Select 2011 data from ERG complex crosswalk + ERG_complex_crosswalk_2011 = ERG_complex_crosswalk.copy().query('year == 2011').reset_index(drop=True) - #assign oil vs gas by lease/complex ID - GOADS_emissions['LEASE_TYPE'] ='' - GOADS_emissions['MAJOR_STRUC'] ='' - for istruc in np.arange(0,len(GOADS_emissions)): - imatch = np.where(np.logical_and(ERG_complex_crosswalk['BOEM COMPLEX ID.2']==int(GOADS_emissions['BOEM-COMPLEX_ID'][istruc]),\ - ERG_complex_crosswalk['Year.2'] == 2011)) - if np.size(imatch) >0: - imatch = imatch[0][0] - GOADS_emissions.loc[istruc,'LEASE_TYPE'] = ERG_complex_crosswalk['Oil Gas Defn FINAL.1'][imatch] - GOADS_emissions.loc[istruc,'MAJOR_STRUC'] = ERG_complex_crosswalk['Major / Minor.1'][imatch] - else: - print(istruc, GOADS_emissions['BOEM-COMPLEX_ID'][istruc]) + # Join locations, emissions, and complex types together + federal_gom_offshore_2011 = (GOADS_emissions + .set_index("boem_complex_id") + .join(ERG_complex_crosswalk_2011.set_index("boem_complex_id")) + .reset_index() + .set_index("strStateFacilityIdentifier") + .join(GOADS_locations_Unique.set_index("strStateFacilityIdentifier")) + .reset_index() + .astype({"BOEM-MONTH": str}) + .assign(month=lambda df: df['BOEM-MONTH'].astype(str).str.zfill(2)) + .assign(state_code='FO') + .drop(columns={'strStateFacilityIdentifier', 'BOEM-MONTH'}) + ) + federal_gom_offshore_2011_gdf = ( + gpd.GeoDataFrame( + federal_gom_offshore_2011, + geometry=gpd.points_from_xy( + federal_gom_offshore_2011["lon"], + federal_gom_offshore_2011["lat"], + crs=4326 + ) + ) + .drop(columns=["lat", "lon"]) + .loc[:, ["boem_complex_id", "year", "month", "state_code", "Emis_tg", "geometry", "oil_gas_defn"]] + ) - # for all gas platforms, match the platform to the emissions - if GOADS_emissions['LEASE_TYPE'][istruc] =='Gas': - match_platform = np.where(GOADS_locations_Unique.strStateFacilityIdentifier==GOADS_emissions['strStateFacilityIdentifier'][istruc])[0][0] - ilat = int((GOADS_locations_Unique['lat'][match_platform] - Lat_low)/Res01) - ilon = int((GOADS_locations_Unique['lon'][match_platform] - Lon_left)/Res01) - imonth = GOADS_emissions['BOEM-MONTH'][istruc]-1 #dict is 1-12, not 0-11 - if GOADS_emissions['MAJOR_STRUC'][istruc] =='Major': - Map_GOADSmajor_emissions[ilat,ilon,iyear,imonth] += GOADS_emissions['Emis_tg'][istruc] - else: - Map_GOADSminor_emissions[ilat,ilon,iyear,imonth] += GOADS_emissions['Emis_tg'][istruc] - - - # sum complexes and emissions for diagnostic - majcplx = GOADS_emissions[(GOADS_emissions['MAJOR_STRUC']=='Major')] - majcplx = majcplx[majcplx['LEASE_TYPE'] =='Gas'] - num_majcplx = majcplx['BOEM-COMPLEX_ID'].unique() - #print(np.shape(num_majcplx)) - mincplx = 
GOADS_emissions[GOADS_emissions['MAJOR_STRUC']=='Minor'] - mincplx = mincplx[mincplx['LEASE_TYPE'] =='Gas'] - num_mincplx = mincplx['BOEM-COMPLEX_ID'].unique() - #print(np.size(num_mincplx)) - del GOADS_emissions - print('Number of Major Gas Complexes: ',(np.size(num_majcplx))) - print('Emissions (Tg): ',np.sum(Map_GOADSmajor_emissions[:,:,iyear,:])) - print('Number of Minor Gas Complexes: ',(np.size(num_mincplx))) - print('Emissions (Tg): ',np.sum(Map_GOADSminor_emissions[:,:,iyear,:])) + # Separate out ng and oil + ng_federal_gom_offshore_2011_gdf = (federal_gom_offshore_2011_gdf + .query("oil_gas_defn == 'Gas'") + .assign(rel_emi=lambda df: df.groupby(["state_code", "year"])['Emis_tg'].transform(lambda x: x / x.sum() if x.sum() > 0 else 0)) + .drop(columns={'Emis_tg', 'oil_gas_defn'}) + .reset_index(drop=True) + ) + oil_federal_gom_offshore_2011_gdf = (federal_gom_offshore_2011_gdf + .query("oil_gas_defn == 'Oil'") + .assign(rel_emi=lambda df: df.groupby(["state_code", "year"])['Emis_tg'].transform(lambda x: x / x.sum() if x.sum() > 0 else 0)) + .drop(columns={'Emis_tg', 'oil_gas_defn'}) + .reset_index(drop=True) + ) + # 2014 GOADS Data + # Read In and Format 2014 BEOM Data + GOADS_14_inputfile = str(GOADS_14_path) + driver_str = r'Driver={Microsoft Access Driver (*.mdb, *.accdb)};DBQ='+GOADS_14_inputfile+';''' + conn = pyodbc.connect(driver_str) + GOADS_emissions = pd.read_sql("SELECT * FROM 2014_Gulfwide_Platform_20161102", conn) + conn.close() + # Format Emissions Data (clean lease data string) + GOADS_emissions = GOADS_emissions[["X_COORDINATE", "Y_COORDINATE", "POLLUTANT_CODE", + "EMISSIONS_VALUE", "MONTH", "COMPLEX_ID"]] + GOADS_emissions = (GOADS_emissions + .query("POLLUTANT_CODE == 'CH4'") + .assign(Emis_tg = 0.0) + .assign(Emis_tg = lambda df: 9.0718474E-7 * df['EMISSIONS_VALUE']) #convert short tons to Tg + .rename(columns={"COMPLEX_ID": "boem_complex_id"}) + .astype({"boem_complex_id": int}) + .drop(columns={"POLLUTANT_CODE", "EMISSIONS_VALUE"}) + .replace('', np.nan) + .dropna() + .reset_index(drop=True) + ) + # Select 2014 data from ERG complex crosswalk + ERG_complex_crosswalk_2014 = ERG_complex_crosswalk.copy().query('year == 2014').reset_index(drop=True) - gom_data_years = ['2011', '2014', '2017', '2021'] - for idatayear in gom_data_years: - gom_file_name = f"{idatayear}_Gulfwide_Platform_Inventory.accdb" - gom_file_path = os.path.join(boem_data_directory_path, gom_file_name) - driver_str = r'Driver={Microsoft Access Driver (*.mdb, *.accdb)};DBQ='+gom_file_path+';''' - conn = pyodbc.connect(driver_str) - GOADS_locations = pd.read_sql("SELECT * FROM tblPointER", conn) - GOADS_emissions = pd.read_sql("SELECT * FROM tblPointEM", conn) - conn.close() - - # Format Location Data - GOADS_locations = GOADS_locations[["strStateFacilityIdentifier","strEmissionReleasePointID","dblXCoordinate","dblYCoordinate"]] - #Create platform-by-platform file - GOADS_locations_Unique = pd.DataFrame({'strStateFacilityIdentifier':GOADS_locations['strStateFacilityIdentifier'].unique()}) - GOADS_locations_Unique['lon'] = 0.0 - GOADS_locations_Unique['lat'] = 0.0 - GOADS_locations_Unique['strEmissionReleasePointID'] = '' + # Join locations, emissions, and complex types together + federal_gom_offshore_2014 = (GOADS_emissions + .set_index("boem_complex_id") + .join(ERG_complex_crosswalk_2014.set_index("boem_complex_id")) + .reset_index() + .astype({"MONTH": str}) + .assign(state_code='FO') + .rename(columns={'X_COORDINATE': 'lon', 'Y_COORDINATE': 'lat', 'MONTH': 'month'}) + ) + + # Correct months to 
be numeric digits + month_to_mm_df = pd.DataFrame( + {'month': ['January', 'February', 'March', 'April', 'May', 'June', 'July', + 'August', 'September', 'October', 'November', 'December'], + 'mm': ['01', '02', '03', '04', '05', '06', '07', '08', '09', '10', '11', '12'] + }) + federal_gom_offshore_2014 = (federal_gom_offshore_2014 + .merge(month_to_mm_df, how='left') + .drop(columns='month') + .rename(columns={'mm': 'month'}) + ) - for iplatform in np.arange(len(GOADS_locations_Unique)): - match_platform = np.where(GOADS_locations['strStateFacilityIdentifier'] == GOADS_locations_Unique['strStateFacilityIdentifier'][iplatform])[0][0] - GOADS_locations_Unique.loc[iplatform,'lon',] = GOADS_locations['dblXCoordinate'][match_platform] - GOADS_locations_Unique.loc[iplatform,'lat',] = GOADS_locations['dblYCoordinate'][match_platform] - GOADS_locations_Unique.loc[iplatform,'strEmissionReleasePointID'] = GOADS_locations['strEmissionReleasePointID'][match_platform][:3] + federal_gom_offshore_2014_gdf = ( + gpd.GeoDataFrame( + federal_gom_offshore_2014, + geometry=gpd.points_from_xy( + federal_gom_offshore_2014["lon"], + federal_gom_offshore_2014["lat"], + crs=4326 + ) + ) + .drop(columns=["lat", "lon"]) + .loc[:, ["boem_complex_id", "year", "month", "state_code", "Emis_tg", "geometry", "oil_gas_defn"]] + ) - GOADS_locations_Unique.reset_index(inplace=True, drop=True) - #display(GOADS_locations_Unique) + # Separate out ng and oil + ng_federal_gom_offshore_2014_gdf = (federal_gom_offshore_2014_gdf + .query("oil_gas_defn == 'Gas'") + .assign(rel_emi=lambda df: df.groupby(["state_code", "year"])['Emis_tg'].transform(lambda x: x / x.sum() if x.sum() > 0 else 0)) + .drop(columns={'Emis_tg', 'oil_gas_defn'}) + .reset_index(drop=True) + ) + oil_federal_gom_offshore_2014_gdf = (federal_gom_offshore_2014_gdf + .query("oil_gas_defn == 'Oil'") + .assign(rel_emi=lambda df: df.groupby(["state_code", "year"])['Emis_tg'].transform(lambda x: x / x.sum() if x.sum() > 0 else 0)) + .drop(columns={'Emis_tg', 'oil_gas_defn'}) + .reset_index(drop=True) + ) - #print(GOADS_emissions.columns) - #Format Emissions Data (clean lease data string) - GOADS_emissions = GOADS_emissions[["strStateFacilityIdentifier","strPollutantCode","dblEmissionNumericValue","BOEM-MONTH", - "BOEM-LEASE_NUM","BOEM-COMPLEX_ID"]] - GOADS_emissions['BOEM-LEASE_NUM'] = GOADS_emissions['BOEM-LEASE_NUM'].str.replace('OCS','') - GOADS_emissions['BOEM-LEASE_NUM'] = GOADS_emissions['BOEM-LEASE_NUM'].str.replace('-','') - GOADS_emissions['BOEM-LEASE_NUM'] = GOADS_emissions['BOEM-LEASE_NUM'].str.replace(' ','') - GOADS_emissions['BOEM-LEASE_NUM'] = GOADS_emissions['BOEM-LEASE_NUM'].str.replace('G1477','G01477') - GOADS_emissions['BOEM-LEASE_NUM'] = GOADS_emissions['BOEM-LEASE_NUM'].str.replace('G73','00073') - GOADS_emissions['BOEM-LEASE_NUM'] = GOADS_emissions['BOEM-LEASE_NUM'].str.replace('G605','00605') - GOADS_emissions['BOEM-LEASE_NUM'] = GOADS_emissions['BOEM-LEASE_NUM'].str.replace('G72','00072') - GOADS_emissions['BOEM-LEASE_NUM'] = GOADS_emissions['BOEM-LEASE_NUM'].str.replace('G599','00599') - GOADS_emissions['BOEM-LEASE_NUM'] = GOADS_emissions['BOEM-LEASE_NUM'].str.replace('G7155','G07155') - GOADS_emissions['BOEM-LEASE_NUM'] = GOADS_emissions['BOEM-LEASE_NUM'].str.replace('G2357','G02357') - GOADS_emissions['BOEM-LEASE_NUM'] = GOADS_emissions['BOEM-LEASE_NUM'].str.replace('G4921','G04921') - GOADS_emissions['Emis_tg'] = 0.0 - GOADS_emissions['Emis_tg'] = 9.0718474E-7 * GOADS_emissions['dblEmissionNumericValue'] #convert short tons to Tg - 
GOADS_emissions = GOADS_emissions[GOADS_emissions['strPollutantCode'] == 'CH4'] - GOADS_emissions.reset_index(inplace=True, drop=True) + # 2017 GOADS Data + # Read In and Format 2017 BEOM Data + GOADS_17_inputfile = str(GOADS_17_path) + driver_str = r'Driver={Microsoft Access Driver (*.mdb, *.accdb)};DBQ='+GOADS_17_inputfile+';''' + conn = pyodbc.connect(driver_str) + GOADS_emissions = pd.read_sql("SELECT * FROM 2017_Gulfwide_Platform_20190705_CAP_GHG", conn) + conn.close() - #display(GOADS_emissions) + # Format Emissions Data (clean lease data string) + GOADS_emissions = GOADS_emissions[["X_COORDINATE", "Y_COORDINATE", "POLLUTANT_CODE", + "EMISSIONS_VALUE", "Month", "COMPLEX_ID"]] + GOADS_emissions = (GOADS_emissions + .query("POLLUTANT_CODE == 'CH4'") + .assign(Emis_tg = 0.0) + .assign(Emis_tg = lambda df: 9.0718474E-7 * df['EMISSIONS_VALUE']) #convert short tons to Tg + .rename(columns={"COMPLEX_ID": "boem_complex_id"}) + .astype({"boem_complex_id": int}) + .drop(columns={"POLLUTANT_CODE", "EMISSIONS_VALUE"}) + .replace('', np.nan) + .dropna() + .reset_index(drop=True) + ) - # Use ERG Preprocessed data to determine if major or minor and oil or gas - ERG_complex_crosswalk = pd.read_excel(ERG_GOADSEmissions_inputfile, sheet_name = "Complex Emissions by Source", usecols = "AJ:AM", nrows = 11143) + # Select 2017 data from ERG complex crosswalk + ERG_complex_crosswalk_2017 = ERG_complex_crosswalk.copy().query('year == 2017').reset_index(drop=True) - # add data to map array, for the closest year to 2011 - year_diff = [abs(x - 2011) for x in year_range] - iyear = year_diff.index(min(year_diff)) + # Join locations, emissions, and complex types together + federal_gom_offshore_2017 = (GOADS_emissions + .set_index("boem_complex_id") + .join(ERG_complex_crosswalk_2017.set_index("boem_complex_id")) + .reset_index() + .astype({"Month": str}) + .assign(state_code='FO') + .rename(columns={'X_COORDINATE': 'lon', 'Y_COORDINATE': 'lat', 'Month': 'month'}) + ) + + # Correct months to be numeric digits + month_to_mm_df = pd.DataFrame( + {'month': ['January', 'February', 'March', 'April', 'May', 'June', 'July', + 'August', 'September', 'October', 'November', 'December'], + 'mm': ['01', '02', '03', '04', '05', '06', '07', '08', '09', '10', '11', '12'] + }) + federal_gom_offshore_2017 = (federal_gom_offshore_2017 + .merge(month_to_mm_df, how='left') + .drop(columns='month') + .rename(columns={'mm': 'month'}) + ) - #assign oil vs gas by lease/complex ID - GOADS_emissions['LEASE_TYPE'] ='' - GOADS_emissions['MAJOR_STRUC'] ='' - for istruc in np.arange(0,len(GOADS_emissions)): - imatch = np.where(np.logical_and(ERG_complex_crosswalk['BOEM COMPLEX ID.2']==int(GOADS_emissions['BOEM-COMPLEX_ID'][istruc]),\ - ERG_complex_crosswalk['Year.2'] == 2011)) - if np.size(imatch) >0: - imatch = imatch[0][0] - GOADS_emissions.loc[istruc,'LEASE_TYPE'] = ERG_complex_crosswalk['Oil Gas Defn FINAL.1'][imatch] - GOADS_emissions.loc[istruc,'MAJOR_STRUC'] = ERG_complex_crosswalk['Major / Minor.1'][imatch] - else: - print(istruc, GOADS_emissions['BOEM-COMPLEX_ID'][istruc]) + federal_gom_offshore_2017_gdf = ( + gpd.GeoDataFrame( + federal_gom_offshore_2017, + geometry=gpd.points_from_xy( + federal_gom_offshore_2017["lon"], + federal_gom_offshore_2017["lat"], + crs=4326 + ) + ) + .drop(columns=["lat", "lon"]) + .loc[:, ["boem_complex_id", "year", "month", "state_code", "Emis_tg", "geometry", "oil_gas_defn"]] + ) - # for all gas platforms, match the platform to the emissions - if GOADS_emissions['LEASE_TYPE'][istruc] =='Gas': - 
match_platform = np.where(GOADS_locations_Unique.strStateFacilityIdentifier==GOADS_emissions['strStateFacilityIdentifier'][istruc])[0][0]
-            ilat = int((GOADS_locations_Unique['lat'][match_platform] - Lat_low)/Res01)
-            ilon = int((GOADS_locations_Unique['lon'][match_platform] - Lon_left)/Res01)
-            imonth = GOADS_emissions['BOEM-MONTH'][istruc]-1 #dict is 1-12, not 0-11
-            if GOADS_emissions['MAJOR_STRUC'][istruc] =='Major':
-                Map_GOADSmajor_emissions[ilat,ilon,iyear,imonth] += GOADS_emissions['Emis_tg'][istruc]
-            else:
-                Map_GOADSminor_emissions[ilat,ilon,iyear,imonth] += GOADS_emissions['Emis_tg'][istruc]
-
-
-    # sum complexes and emissions for diagnostic
-    majcplx = GOADS_emissions[(GOADS_emissions['MAJOR_STRUC']=='Major')]
-    majcplx = majcplx[majcplx['LEASE_TYPE'] =='Gas']
-    num_majcplx = majcplx['BOEM-COMPLEX_ID'].unique()
-    #print(np.shape(num_majcplx))
-    mincplx = GOADS_emissions[GOADS_emissions['MAJOR_STRUC']=='Minor']
-    mincplx = mincplx[mincplx['LEASE_TYPE'] =='Gas']
-    num_mincplx = mincplx['BOEM-COMPLEX_ID'].unique()
-    #print(np.size(num_mincplx))
-    del GOADS_emissions
-    print('Number of Major Gas Complexes: ',(np.size(num_majcplx)))
-    print('Emissions (Tg): ',np.sum(Map_GOADSmajor_emissions[:,:,iyear,:]))
-    print('Number of Minor Gas Complexes: ',(np.size(num_mincplx)))
-    print('Emissions (Tg): ',np.sum(Map_GOADSminor_emissions[:,:,iyear,:]))
+    # Separate out ng and oil
+    ng_federal_gom_offshore_2017_gdf = (federal_gom_offshore_2017_gdf
+                                        .query("oil_gas_defn == 'Gas'")
+                                        .assign(rel_emi=lambda df: df.groupby(["state_code", "year"])['Emis_tg'].transform(lambda x: x / x.sum() if x.sum() > 0 else 0))
+                                        .drop(columns={'Emis_tg', 'oil_gas_defn'})
+                                        .reset_index(drop=True)
+                                        )
+    oil_federal_gom_offshore_2017_gdf = (federal_gom_offshore_2017_gdf
+                                         .query("oil_gas_defn == 'Oil'")
+                                         .assign(rel_emi=lambda df: df.groupby(["state_code", "year"])['Emis_tg'].transform(lambda x: x / x.sum() if x.sum() > 0 else 0))
+                                         .drop(columns={'Emis_tg', 'oil_gas_defn'})
+                                         .reset_index(drop=True)
+                                         )
+
+    # Build complete proxy (2012-2022)
+    ng_federal_gom_offshore_gdf = gpd.GeoDataFrame()
+    oil_federal_gom_offshore_gdf = gpd.GeoDataFrame()
+    for iyear in years:
+        data_year = federal_gom_offshore_data_years[federal_gom_offshore_data_years['year'] == iyear]['goads_data'].values[0]
+        if data_year == 2011:
+            ng_temp_data = (ng_federal_gom_offshore_2011_gdf
+                            .copy()
+                            .assign(year = iyear)
+                            .assign(year_month=lambda df: df['year'].astype(str)+'_'+df['month'])
+                            )
+            oil_temp_data = (oil_federal_gom_offshore_2011_gdf
+                             .copy()
+                             .assign(year = iyear)
+                             .assign(year_month=lambda df: df['year'].astype(str)+'_'+df['month'])
+                             )
+        if data_year == 2014:
+            ng_temp_data = (ng_federal_gom_offshore_2014_gdf
+                            .copy()
+                            .assign(year = iyear)
+                            .assign(year_month=lambda df: df['year'].astype(str)+'_'+df['month'])
+                            )
+            oil_temp_data = (oil_federal_gom_offshore_2014_gdf
+                             .copy()
+                             .assign(year = iyear)
+                             .assign(year_month=lambda df: df['year'].astype(str)+'_'+df['month'])
+                             )
+        if data_year == 2017:
+            ng_temp_data = (ng_federal_gom_offshore_2017_gdf
+                            .copy()
+                            .assign(year = iyear)
+                            .assign(year_month=lambda df: df['year'].astype(str)+'_'+df['month'])
+                            )
+            oil_temp_data = (oil_federal_gom_offshore_2017_gdf
+                             .copy()
+                             .assign(year = iyear)
+                             .assign(year_month=lambda df: df['year'].astype(str)+'_'+df['month'])
+                             )
+        ng_federal_gom_offshore_gdf = pd.concat([ng_federal_gom_offshore_gdf, ng_temp_data])
+        oil_federal_gom_offshore_gdf = pd.concat([oil_federal_gom_offshore_gdf, oil_temp_data])
-    # Create proxy gdf
-    proxy_gdf = (
-        gpd.GeoDataFrame(
-            gb_stations_df,
-
geometry=gpd.points_from_xy( - gb_stations_df["lon"], - gb_stations_df["lat"], - crs=4326, - ), - ) - .drop(columns=["lat", "lon"]) - .loc[:, ["facility_name", "state_code", "geometry"]] - ) + ng_federal_gom_offshore_gdf = (ng_federal_gom_offshore_gdf + .loc[:, ["boem_complex_id", "year", "month", + "year_month", "state_code", "geometry", + "rel_emi"]] + .reset_index(drop=True) + ) + oil_federal_gom_offshore_gdf = (oil_federal_gom_offshore_gdf + .loc[:, ["boem_complex_id", "year", "month", + "year_month", "state_code", "geometry", + "rel_emi"]] + .reset_index(drop=True) + ) + + ng_federal_gom_offshore_gdf.to_parquet(ng_output_path) + oil_federal_gom_offshore_gdf.to_parquet(oil_output_path) - proxy_gdf.to_parquet(output_path) return None From e6a8aa3f484985638d369e0003f9447e76b2d2d9 Mon Sep 17 00:00:00 2001 From: Hannah Lohman <68960449+haclohman@users.noreply.github.com> Date: Fri, 20 Dec 2024 13:55:32 -0500 Subject: [PATCH 6/6] Oil and NG exploration and production proxies Complete code for oil and natural gas exploration and production proxies --- .../ng_oil_production_utils.py | 275 +++++ .../task_enverus_di_prism_data_processing.py | 305 +++++ .../task_ng_all_well_count_proxy.py | 175 +++ .../task_ng_all_well_prod_proxy.py | 176 +++ .../task_ng_basin_220_prod_proxy.py | 153 +++ .../task_ng_basin_395_prod_proxy.py | 153 +++ .../task_ng_basin_430_prod_proxy.py | 153 +++ .../task_ng_basin_other_prod_proxy.py | 177 +++ .../task_ng_conv_well_comp_proxy.py | 179 +++ .../task_ng_conv_well_count_proxy.py | 176 +++ .../task_ng_drilled_well_proxy.py | 180 +++ .../task_ng_hf_well_comp_proxy.py | 179 +++ .../task_ng_hf_well_count_proxy.py | 177 +++ ...task_ng_oil_federal_gom_offshore_proxy.py} | 4 +- .../task_ng_oil_state_gom_offshore_proxy.py | 278 +++++ .../task_ng_production_proxy.py | 1007 ----------------- .../task_ng_water_prod_proxy.py | 194 ++++ .../task_ng_well_blowout_proxy.py | 75 ++ .../task_oil_all_well_count_proxy.py | 176 +++ .../task_oil_all_well_prod_proxy.py | 177 +++ .../task_oil_basin_220_prod_proxy.py | 154 +++ .../task_oil_basin_360_prod_proxy.py | 154 +++ .../task_oil_basin_395_prod_proxy.py | 154 +++ .../task_oil_basin_430_prod_proxy.py | 154 +++ .../task_oil_basin_other_prod_proxy.py | 178 +++ .../task_oil_conv_well_comp_proxy.py | 180 +++ .../task_oil_conv_well_count_proxy.py | 177 +++ .../task_oil_drilled_well_proxy.py | 181 +++ .../task_oil_hf_well_comp_proxy.py | 180 +++ .../task_oil_hf_well_count_proxy.py | 177 +++ .../task_oil_water_prod_proxy.py | 195 ++++ .../task_oil_well_avg_proxy.py | 69 ++ 32 files changed, 5313 insertions(+), 1009 deletions(-) create mode 100644 gch4i/proxy_processing/ng_oil_production_utils.py create mode 100644 gch4i/proxy_processing/task_enverus_di_prism_data_processing.py create mode 100644 gch4i/proxy_processing/task_ng_all_well_count_proxy.py create mode 100644 gch4i/proxy_processing/task_ng_all_well_prod_proxy.py create mode 100644 gch4i/proxy_processing/task_ng_basin_220_prod_proxy.py create mode 100644 gch4i/proxy_processing/task_ng_basin_395_prod_proxy.py create mode 100644 gch4i/proxy_processing/task_ng_basin_430_prod_proxy.py create mode 100644 gch4i/proxy_processing/task_ng_basin_other_prod_proxy.py create mode 100644 gch4i/proxy_processing/task_ng_conv_well_comp_proxy.py create mode 100644 gch4i/proxy_processing/task_ng_conv_well_count_proxy.py create mode 100644 gch4i/proxy_processing/task_ng_drilled_well_proxy.py create mode 100644 gch4i/proxy_processing/task_ng_hf_well_comp_proxy.py create mode 100644 
gch4i/proxy_processing/task_ng_hf_well_count_proxy.py rename gch4i/proxy_processing/{federal_gom_offshore_proxy.py => task_ng_oil_federal_gom_offshore_proxy.py} (99%) create mode 100644 gch4i/proxy_processing/task_ng_oil_state_gom_offshore_proxy.py delete mode 100644 gch4i/proxy_processing/task_ng_production_proxy.py create mode 100644 gch4i/proxy_processing/task_ng_water_prod_proxy.py create mode 100644 gch4i/proxy_processing/task_ng_well_blowout_proxy.py create mode 100644 gch4i/proxy_processing/task_oil_all_well_count_proxy.py create mode 100644 gch4i/proxy_processing/task_oil_all_well_prod_proxy.py create mode 100644 gch4i/proxy_processing/task_oil_basin_220_prod_proxy.py create mode 100644 gch4i/proxy_processing/task_oil_basin_360_prod_proxy.py create mode 100644 gch4i/proxy_processing/task_oil_basin_395_prod_proxy.py create mode 100644 gch4i/proxy_processing/task_oil_basin_430_prod_proxy.py create mode 100644 gch4i/proxy_processing/task_oil_basin_other_prod_proxy.py create mode 100644 gch4i/proxy_processing/task_oil_conv_well_comp_proxy.py create mode 100644 gch4i/proxy_processing/task_oil_conv_well_count_proxy.py create mode 100644 gch4i/proxy_processing/task_oil_drilled_well_proxy.py create mode 100644 gch4i/proxy_processing/task_oil_hf_well_comp_proxy.py create mode 100644 gch4i/proxy_processing/task_oil_hf_well_count_proxy.py create mode 100644 gch4i/proxy_processing/task_oil_water_prod_proxy.py create mode 100644 gch4i/proxy_processing/task_oil_well_avg_proxy.py diff --git a/gch4i/proxy_processing/ng_oil_production_utils.py b/gch4i/proxy_processing/ng_oil_production_utils.py new file mode 100644 index 0000000..2b89ed8 --- /dev/null +++ b/gch4i/proxy_processing/ng_oil_production_utils.py @@ -0,0 +1,275 @@ +# %% +from pathlib import Path +import os +from typing import Annotated +from zipfile import ZipFile +import calendar +import datetime + +from pyarrow import parquet +import pandas as pd +import osgeo +import geopandas as gpd +import numpy as np +import seaborn as sns +import shapefile as shp +from pytask import Product, task, mark + +from gch4i.config import ( + V3_DATA_PATH, + proxy_data_dir_path, + global_data_dir_path, + sector_data_dir_path, + max_year, + min_year, + years, +) + +from gch4i.utils import us_state_to_abbrev + +# File Paths +state_path: Path = global_data_dir_path / "tl_2020_us_state.zip" +enverus_production_path: Path = sector_data_dir_path / "enverus/production" +intermediate_outputs_path: Path = enverus_production_path / "intermediate_outputs" +nei_path: Path = sector_data_dir_path / "nei_og" + +# State ANSI data +state_gdf = ( + gpd.read_file(state_path) + .loc[:, ["NAME", "STATEFP", "STUSPS", "geometry"]] + .rename(columns=str.lower) + .rename(columns={"stusps": "state_code", "name": "state_name"}) + .astype({"statefp": int}) + # get only lower 48 + DC + .query("(statefp < 60) & (statefp != 2) & (statefp != 15)") + .reset_index(drop=True) + .to_crs(4326) +) + + +# Function to calculate relative emissions for Enverus data +def calc_enverus_rel_emi(df): + df['rel_emi'] = df.groupby(["state_code", "year"])['proxy_data'].transform(lambda x: x / x.sum() if x.sum() > 0 else 0) + df = df.drop(columns='proxy_data') + return df + + +# function to format proxy data into geodataframes +def enverus_df_to_gdf(df): + gdf = ( + gpd.GeoDataFrame( + df, + geometry=gpd.points_from_xy( + df["longitude"], + df["latitude"], + crs=4326 + ) + ) + .drop(columns=["latitude", "longitude"]) + .loc[:, ["year", "year_month", "state_code", "rel_emi", "geometry"]] + ) + return gdf + + 
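A quick usage sketch of the two helpers above (illustrative only: the sample frame and its values are invented, and calc_enverus_rel_emi / enverus_df_to_gdf are assumed to be in scope as defined in this module):

    import pandas as pd

    sample = pd.DataFrame({
        "year": [2020, 2020, 2020],
        "year_month": ["2020-01", "2020-01", "2020-01"],
        "state_code": ["TX", "TX", "NM"],
        "latitude": [31.9, 32.1, 32.6],
        "longitude": [-102.1, -102.3, -103.4],
        "proxy_data": [50.0, 150.0, 75.0],  # e.g. monthly gas production (invented)
    })

    # normalize proxy_data to shares that sum to 1 within each state-year
    rel = calc_enverus_rel_emi(sample)  # TX rows -> 0.25 / 0.75, NM row -> 1.0
    # attach point geometries and keep the standard proxy schema
    gdf = enverus_df_to_gdf(rel)        # year, year_month, state_code, rel_emi, geometry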
+# NEI FIPS codes
+fips_codes_df = pd.DataFrame({'state_code': ['IL', 'IN', 'KS', 'OK', 'PA', 'WV'],
+                              'fips_code': ['17', '18', '20', '40', '42', '54']})
+
+# NEI data year assignments
+# All years use the data affiliated with their year except the following exceptions:
+    # 2012: use 2011 data
+    # 2013: use 2014 data
+    # 2015: use 2014 data
+    # 2016: use 2017 data
+nei_data_years = pd.DataFrame(
+    {'year': [2012, 2013, 2014, 2015, 2016, 2017, 2018, 2019, 2020, 2021, 2022],
+     'nei_data': [2011, 2014, 2014, 2014, 2017, 2017, 2018, 2019, 2020, 2021, 2022]
+     })
+
+# NEI text file and shapefile names:
+# Natural Gas Well Counts
+ng_well_count_file_names = pd.DataFrame({
+    'data_year': [2011, 2014, 2017, 2018, 2019, 2020, 2021, 2022],
+    'file_name': ['USA_698_NOFILL.txt', 'USA_698_NOFILL.txt', 'USA_698_NOFILL.txt',
+                  'GAS_WELLS', 'GAS_WELLS', 'GAS_WELL', '_698', 'GasWells'],
+    })
+# Natural Gas Well-Level Production Volumes
+ng_gas_prod_file_names = pd.DataFrame({
+    'data_year': [2011, 2014, 2017, 2018, 2019, 2020, 2021, 2022],
+    'file_name': ['USA_696_NOFILL.txt', 'USA_696_NOFILL.txt', 'USA_696_NOFILL.txt',
+                  'GAS_PRODUCTION', 'GAS_PRODUCTION', 'GAS_PRODUCTION', '_696', 'GasProduction'],
+    })
+# Natural Gas Water Production Volumes
+ng_water_prod_file_names = pd.DataFrame({
+    'data_year': [2011, 2014, 2017, 2018, 2019, 2020, 2021, 2022],
+    'file_name': ['USA_6832_NOFILL.txt', 'USA_6832_NOFILL.txt', 'USA_6832_NOFILL.txt',
+                  'PRODUCED_WATER_GAS', 'PRODUCED_WATER_GAS', 'PRODUCED_WATER_GAS', '_6832', 'ProducedWaterGasWells'],
+    })
+# Natural Gas Well Completions
+ng_comp_count_file_names = pd.DataFrame({
+    'data_year': [2011, 2014, 2017, 2018, 2019, 2020, 2021, 2022],
+    'file_name': ['USA_678_NOFILL.txt', 'USA_678_NOFILL.txt', 'USA_678_NOFILL.txt',
+                  'COMPLETIONS_GAS', 'COMPLETIONS_GAS', 'COMPLETIONS_GAS', '_678', 'GasWellCompletions'],
+    })
+# Drilled Natural Gas Wells
+ng_spud_count_file_names = pd.DataFrame({
+    'data_year': [2011, 2014, 2017, 2018, 2019, 2020, 2021, 2022],
+    'file_name': ['USA_671_NOFILL.txt', 'USA_671_NOFILL.txt', 'USA_671_NOFILL.txt',
+                  'SPUD_GAS', 'SPUD_GAS', 'SPUD_GAS', '_671', 'SpudCountGasWells'],
+    })
+# Oil Well Counts
+oil_well_count_file_names = pd.DataFrame({
+    'data_year': [2011, 2014, 2017, 2018, 2019, 2020, 2021, 2022],
+    'file_name': ['USA_695_NOFILL.txt', 'USA_695_NOFILL.txt', 'USA_695_NOFILL.txt',
+                  'OIL_WELLS', 'OIL_WELLS', 'OIL_WELL', '_695', 'OILWells'],
+    })
+# Oil Well-Level Production Volumes
+oil_oil_prod_file_names = pd.DataFrame({
+    'data_year': [2011, 2014, 2017, 2018, 2019, 2020, 2021, 2022],
+    'file_name': ['USA_694_NOFILL.txt', 'USA_694_NOFILL.txt', 'USA_694_NOFILL.txt',
+                  'OIL_PRODUCTION', 'OIL_PRODUCTION', 'OIL_PRODUCTION', '_694', 'OilProduction'],
+    })
+# Oil Water Production Volumes
+oil_water_prod_file_names = pd.DataFrame({
+    'data_year': [2011, 2014, 2017, 2018, 2019, 2020, 2021, 2022],
+    'file_name': ['USA_6833_NOFILL.txt', 'USA_6833_NOFILL.txt', 'USA_6833_NOFILL.txt',
+                  'PRODUCED_WATER_OIL', 'PRODUCED_WATER_OIL', 'PRODUCED_WATER_OIL', '_6833', 'ProducedWaterOilWells'],
+    })
+# Oil Well Completions
+oil_comp_count_file_names = pd.DataFrame({
+    'data_year': [2011, 2014, 2017, 2018, 2019, 2020, 2021, 2022],
+    'file_name': ['USA_685_NOFILL.txt', 'USA_685_NOFILL.txt', 'USA_685_NOFILL.txt',
+                  'COMPLETIONS_OIL', 'COMPLETIONS_OIL', 'COMPLETIONS_OIL', '_685', 'OilWellCompletions'],
+    })
+# Drilled Oil Wells
+oil_spud_count_file_names = pd.DataFrame({
+    'data_year': [2011, 2014, 2017, 2018, 2019, 2020, 2021, 2022],
+    'file_name': ['USA_681_NOFILL.txt', 'USA_681_NOFILL.txt', 'USA_681_NOFILL.txt',
+                  'SPUD_OIL', 'SPUD_OIL', 'SPUD_OIL', '_681', 'SpudCountOilWells'],
+    })
+
+
+# Function to get the specific file name for a given year
+def get_nei_file_name(nei_data_year, nei_file_names):
+    nei_file_name = nei_file_names[nei_file_names['data_year'] == nei_data_year]['file_name'].values[0]
+    return nei_file_name
+
+
+# Function to get raw NEI textfile and shapefile data for the specific proxy of interest
+def get_raw_NEI_data(ghgi_year, data_year, file_name):
+    if data_year <= 2017:
+        # NEI textfile data (data_year <= 2017: the 2011, 2014, and 2017 files)
+        nei_textfile_name = f"CONUS_SA_FILES_{data_year}/{file_name}"
+        nei_textfile_path = os.path.join(nei_path, nei_textfile_name)
+        data_temp = pd.read_csv(nei_textfile_path, sep='\t', skiprows = 25)
+        data_temp = data_temp.drop(["!"], axis=1)
+        data_temp.columns = ['Code','FIPS','COL','ROW','Frac','Abs','FIPS_Total','FIPS_Running_Sum']
+        data_temp = data_temp.astype({"FIPS": str})
+        # if water production data (gas: 6832, oil: 6833)
+        if file_name == 'USA_6832_NOFILL.txt' or file_name == 'USA_6833_NOFILL.txt':
+            if data_year < 2016:
+                data_temp = (data_temp
+                             # query states: IL, IN, KS, OK, PA, WV
+                             .query("FIPS.str.startswith('17') | FIPS.str.startswith('18') | FIPS.str.startswith('20') | FIPS.str.startswith('40') | FIPS.str.startswith('42') | FIPS.str.startswith('54')")
+                             .reset_index(drop=True)
+                             )
+                colmax = data_temp['COL'].max()
+                colmin = data_temp['COL'].min()
+                rowmax = data_temp['ROW'].max()
+                rowmin = data_temp['ROW'].min()
+            else:
+                data_temp = (data_temp
+                             # query states: IL, IN, KS, OK, PA
+                             .query("FIPS.str.startswith('17') | FIPS.str.startswith('18') | FIPS.str.startswith('20') | FIPS.str.startswith('40') | FIPS.str.startswith('42')")
+                             .reset_index(drop=True)
+                             )
+                colmax = data_temp['COL'].max()
+                colmin = data_temp['COL'].min()
+                rowmax = data_temp['ROW'].max()
+                rowmin = data_temp['ROW'].min()
+        # non-water production proxies (IL, IN)
+        else:
+            data_temp = (data_temp
+                         # query states: IL, IN
+                         .query("FIPS.str.startswith('17') | FIPS.str.startswith('18')")
+                         .reset_index(drop=True)
+                         )
+            colmax = data_temp['COL'].max()
+            colmin = data_temp['COL'].min()
+            rowmax = data_temp['ROW'].max()
+            rowmin = data_temp['ROW'].min()
+        # NEI reference grid shapefile with lat/lon locations
+        nei_reference_grid_path = os.path.join(nei_path, "NEI_Reference_Grid_LCC_to_WGS84_latlon.shp")
+        nei_reference_grid = (gpd.read_file(nei_reference_grid_path)
+                              .to_crs(4326))
+        nei_reference_grid = (nei_reference_grid
+                              .assign(cellid_column = nei_reference_grid.cellid.astype(str).str[0:4].astype(int))
+                              .assign(cellid_row = nei_reference_grid.cellid.astype(str).str[5:].astype(int))
+                              .query(f"cellid_column <= {colmax} & cellid_column >= {colmin}")
+                              .query(f"cellid_row <= {rowmax} & cellid_row >= {rowmin}")
+                              .reset_index(drop=True)
+                              )
+        # Match lat/lon locations from reference grid to nei data
+        for idx in np.arange(0, len(data_temp)):
+            # Add in lat/lon
+            icol = data_temp['COL'][idx]
+            irow = data_temp['ROW'][idx]
+            match = np.where((icol == nei_reference_grid.loc[:,'cellid_column']) & (irow == nei_reference_grid.loc[:,'cellid_row']))[0][0]
+            match = int(match)
+            # data_temp.loc[idx,'Lat'] = nei_reference_grid.loc[match, 'Latitude']
+            # data_temp.loc[idx,'Lon'] = nei_reference_grid.loc[match, 'Longitude']
+            data_temp.loc[idx,'geometry'] = nei_reference_grid.loc[match, 'geometry']
+            # Add in state_code
+            ifips = data_temp.loc[idx,'FIPS'][0:2]
+            data_temp.loc[idx,'state_code'] = fips_codes_df.loc[np.where(ifips == fips_codes_df.loc[:, 'fips_code'])[0][0],'state_code']
+        data_temp = data_temp[['state_code', 'Abs', 'geometry']]
+        data_temp = data_temp.rename(columns={'Abs':'activity_data'})
+
+    else:
+        # NEI shapefile data (data_year > 2017: 2018, 2019, 2020, 2021, 2022)
+        state_geometries = state_gdf[["state_code","geometry"]]
+        nei_file_name = f"CONUS_SA_FILES_{data_year}"
+        nei_file_path = os.path.join(nei_path, nei_file_name)
+        data_temp = gpd.read_file(nei_file_path, layer=file_name)
+        data_temp = data_temp.to_crs(4326)
+        data_temp = gpd.tools.sjoin(data_temp, state_gdf, how="left")
+
+        # water production data (IL, IN, KS, OK, PA)
+        if file_name == 'PRODUCED_WATER_GAS' or file_name == '_6832' or file_name == 'ProducedWaterGasWells':
+            states_to_query = ['IL', 'IN', 'KS', 'OK', 'PA']
+        # non-water production proxies (IL, IN)
+        else:
+            states_to_query = ['IL', 'IN']
+
+        # query relevant states
+        data_temp = data_temp.query('state_code.isin(@states_to_query)')
+
+        # grab activity data depending on column name (changes by year)
+        if data_year == 2018 or data_year == 2019 or data_year == 2020:
+            data_temp = data_temp[['state_code', 'ACTIVITY', 'geometry']]
+            data_temp = data_temp.rename(columns={'ACTIVITY':'activity_data'})
+        if data_year == 2021:
+            data_temp = data_temp[['state_code', 'GRID_AC', 'geometry']]
+            data_temp = data_temp.rename(columns={'GRID_AC':'activity_data'})
+        if data_year == 2022:
+            data_temp = data_temp[['state_code', 'GRID_ACTIV', 'geometry']]
+            data_temp = data_temp.rename(columns={'GRID_ACTIV':'activity_data'})
+
+    # convert activity data to relative emissions (idata / sum(state data))
+    data_temp['rel_emi'] = data_temp.groupby(["state_code"])['activity_data'].transform(lambda x: x / x.sum() if x.sum() > 0 else 0)
+    monthly_data_temp = data_temp.copy()
+    monthly_data_temp['rel_emi'] = monthly_data_temp['rel_emi'] * 1/12
+    monthly_data_temp = monthly_data_temp.drop(columns='activity_data')
+
+    # convert proxy data to monthly (assume 1/12 of annual proxy is assigned to each month)
+    nei_proxy_data = pd.DataFrame()
+    for imonth in range(1, 13):
+        imonth_str = f"{imonth:02}"  # convert to 2-digit months
+        data_temp_imonth = monthly_data_temp.copy()
+        data_temp_imonth = data_temp_imonth.assign(year_month=str(ghgi_year)+'-'+imonth_str)
+        nei_proxy_data = pd.concat([nei_proxy_data, data_temp_imonth])
+    nei_proxy_data = nei_proxy_data.assign(year=ghgi_year)
+    nei_proxy_data = (nei_proxy_data[['year', 'year_month', 'state_code', 'rel_emi', 'geometry']]
+                      .reset_index(drop=True)
+                      )
+    return nei_proxy_data
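A sketch of how a downstream proxy task might call the NEI helpers above (illustrative only: the import path simply mirrors this file's location and is an assumption, as are the concrete years and the presence of the NEI files on disk):

    from gch4i.proxy_processing.ng_oil_production_utils import (
        nei_data_years, ng_well_count_file_names,
        get_nei_file_name, get_raw_NEI_data,
    )

    # GHGI year 2016 is mapped to the 2017 NEI dataset by nei_data_years
    data_year = nei_data_years.loc[nei_data_years["year"] == 2016, "nei_data"].values[0]
    file_name = get_nei_file_name(data_year, ng_well_count_file_names)  # 'USA_698_NOFILL.txt'
    nei_well_counts = get_raw_NEI_data(ghgi_year=2016, data_year=data_year, file_name=file_name)
    # -> columns: year, year_month, state_code, rel_emi, geometry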
enverus_well_counts_path: Path = sector_data_dir_path / "enverus/production/temp_data_v2/Enverus DrillingInfo Processing - Well Counts_2021-03-17.xlsx",
+    ):
+    """
+    Data come from Enverus, from both Drilling Info (DI) and Prism. Two datasets
+    are used because Prism does not include all states; the remaining states, or
+    those with better DI coverage, are taken from DI.
+
+    DI: KS, MD, MI, MO, OK, TN
+
+    Prism: AK, AL, AR, AZ, CA, CAO (California Offshore), CO, FL, KY, LA, MS, MT, ND,
+    NE, NGOM (federal offshore waters in the Gulf of Mexico), NM, NV, NY, OH, OR, PA,
+    SD, TX, UT, VA, WV, WY
+
+    States with no Enverus Data: CT, DE, DC, GA, HI, ID, IL*, IN*, IA, ME, MA, MN, NH,
+    NJ, NC, RI, SC, VT, WA, WI, US territories. These states are assumed to have no oil
+    and gas production, with an exception for IL and IN.
+
+    *IL and IN do not report to Enverus but do have oil and gas production. Production
+    data for these two states is taken from the Energy Information Administration (EIA).
+
+    """
+
+    # Load in State ANSI data
+
+    state_gdf = (
+        gpd.read_file(state_path)
+        .loc[:, ["NAME", "STATEFP", "STUSPS", "geometry"]]
+        .rename(columns=str.lower)
+        .rename(columns={"stusps": "state_code", "name": "state_name"})
+        .astype({"statefp": int})
+        # get only lower 48 + DC
+        .query("(statefp < 60) & (statefp != 2) & (statefp != 15)")
+        .reset_index(drop=True)
+        .to_crs(4326)
+    )
+
+    # Well and Production Data (from Enverus)
+    # Read in and combine each year of Prism & DI monthly data (from Enverus)
+
+    # Read in and format the Prism and DI data:
+    # 1. Read data
+    # 2. Drop unused columns, rename columns to match between DI and Prism
+    # 3. Combine DI and Prism into one data array
+    # 4. Calculate annual cumulative production totals
+    # 5.
Save the data as a year-specific variable + + # Based on ERGs logic, active wells are determined based on their production levels and not producing status + Enverus_data_dict = {} + DI_data_dict = {} + Prism_data_dict = {} + for iyear in years: + #DI data + DI_file_name = f"didsk_monthly_{iyear}.csv" + DI_file_path = os.path.join(enverus_production_path, DI_file_name) + DI_data = (pd.read_csv( + DI_file_path, + usecols=['WELL_COUNT_ID','STATE','COUNTY','BASIN','AAPG_CODE_ERG', + 'LATITUDE','LONGITUDE','STATUS','COMPDATE', + 'SPUDDATE','FIRSTPRODDATE','HF','OFFSHORE','GOR', + 'GOR_QUAL','PROD_FLAG','PRODYEAR', + 'LIQ_01','GAS_01','WTR_01','LIQ_02','GAS_02','WTR_02', + 'LIQ_03','GAS_03','WTR_03','LIQ_04','GAS_04','WTR_04', + 'LIQ_05','GAS_05','WTR_05','LIQ_06','GAS_06','WTR_06', + 'LIQ_07','GAS_07','WTR_07','LIQ_08','GAS_08','WTR_08', + 'LIQ_09','GAS_09','WTR_09','LIQ_10','GAS_10','WTR_10', + 'LIQ_11','GAS_11','WTR_11','LIQ_12','GAS_12','WTR_12',], + dtype={7:'str'}) + .rename(columns={'WELL_COUNT_ID':'WELL_COUNT','STATE':'STATE_CODE', + 'STATUS':'PRODUCING_STATUS', + 'LIQ_01':'OILPROD_01','GAS_01':'GASPROD_01','WTR_01':'WATERPROD_01', + 'LIQ_02':'OILPROD_02','GAS_02':'GASPROD_02','WTR_02':'WATERPROD_02', + 'LIQ_03':'OILPROD_03','GAS_03':'GASPROD_03','WTR_03':'WATERPROD_03', + 'LIQ_04':'OILPROD_04','GAS_04':'GASPROD_04','WTR_04':'WATERPROD_04', + 'LIQ_05':'OILPROD_05','GAS_05':'GASPROD_05','WTR_05':'WATERPROD_05', + 'LIQ_06':'OILPROD_06','GAS_06':'GASPROD_06','WTR_06':'WATERPROD_06', + 'LIQ_07':'OILPROD_07','GAS_07':'GASPROD_07','WTR_07':'WATERPROD_07', + 'LIQ_08':'OILPROD_08','GAS_08':'GASPROD_08','WTR_08':'WATERPROD_08', + 'LIQ_09':'OILPROD_09','GAS_09':'GASPROD_09','WTR_09':'WATERPROD_09', + 'LIQ_10':'OILPROD_10','GAS_10':'GASPROD_10','WTR_10':'WATERPROD_10', + 'LIQ_11':'OILPROD_11','GAS_11':'GASPROD_11','WTR_11':'WATERPROD_11', + 'LIQ_12':'OILPROD_12','GAS_12':'GASPROD_12','WTR_12':'WATERPROD_12',}) + .assign(WELL_COUNT=1) # TODO: Check to see if this should actually be set to 1 + ) + # Format completion date (YYYY-MM) + for iwell in range(0,len(DI_data)): + comp_date = str(DI_data.loc[iwell, 'COMPDATE']) + if comp_date == 'NaN': + comp_year_month = 'NaN' + elif comp_date == 'nan': + comp_year_month = 'NaN' + else: # date format M/DD/YYYY + comp_month = f"{int(comp_date.split('/')[0]):02}" + comp_year = f"{int(comp_date.split('/')[2])}" + comp_year_month = str(comp_year)+'-'+str(comp_month) + DI_data.loc[iwell, 'comp_year_month'] = comp_year_month + # Format spud date (YYYY) + for iwell in range(0,len(DI_data)): + spud_date = str(DI_data.loc[iwell, 'SPUDDATE']) + if spud_date == 'NaN': + spud_year = 'NaN' + elif spud_date == 'nan': + spud_year = 'NaN' + else: # date format M/DD/YYYY + spud_year = f"{int(spud_date.split('/')[2])}" + spud_year = str(spud_year) + DI_data.loc[iwell, 'spud_year'] = spud_year + # Format first production date (YYYY) + for iwell in range(0,len(DI_data)): + first_prod_date = str(DI_data.loc[iwell, 'FIRSTPRODDATE']) + if first_prod_date == 'NaN': + first_prod_year = 'NaN' + elif first_prod_date == 'nan': + first_prod_year = 'NaN' + else: # date format M/DD/YYYY + first_prod_year = f"{int(first_prod_date.split('/')[2])}" + first_prod_year = str(first_prod_year) + DI_data.loc[iwell, 'first_prod_year'] = first_prod_year + DI_data_dict[f'{iyear}'] = DI_data + + # Prism Data + Prism_file_name = f"prism_monthly_{iyear}.csv" + Prism_file_path = os.path.join(enverus_production_path, Prism_file_name) + Prism_data = (pd.read_csv( + Prism_file_path, + 
usecols=['STATE','COUNTY','ENVBASIN','AAPG_CODE_ERG', + 'LATITUDE','LONGITUDE','ENVWELLSTATUS','COMPLETIONDATE', + 'SPUDDATE','FIRSTPRODDATE','HF','OFFSHORE','GOR', + 'GOR_QUAL','PROD_FLAG','PRODYEAR', + 'LIQUIDSPROD_BBL_01','GASPROD_MCF_01','WATERPROD_BBL_01', + 'LIQUIDSPROD_BBL_02','GASPROD_MCF_02','WATERPROD_BBL_02', + 'LIQUIDSPROD_BBL_03','GASPROD_MCF_03','WATERPROD_BBL_03', + 'LIQUIDSPROD_BBL_04','GASPROD_MCF_04','WATERPROD_BBL_04', + 'LIQUIDSPROD_BBL_05','GASPROD_MCF_05','WATERPROD_BBL_05', + 'LIQUIDSPROD_BBL_06','GASPROD_MCF_06','WATERPROD_BBL_06', + 'LIQUIDSPROD_BBL_07','GASPROD_MCF_07','WATERPROD_BBL_07', + 'LIQUIDSPROD_BBL_08','GASPROD_MCF_08','WATERPROD_BBL_08', + 'LIQUIDSPROD_BBL_09','GASPROD_MCF_09','WATERPROD_BBL_09', + 'LIQUIDSPROD_BBL_10','GASPROD_MCF_10','WATERPROD_BBL_10', + 'LIQUIDSPROD_BBL_11','GASPROD_MCF_11','WATERPROD_BBL_11', + 'LIQUIDSPROD_BBL_12','GASPROD_MCF_12','WATERPROD_BBL_12',], + dtype={7:'str'}) + .rename(columns={'STATE':'STATE_CODE', 'ENVBASIN':'BASIN', + 'ENVWELLSTATUS':'PRODUCING_STATUS', + 'COMPLETIONDATE':'COMPDATE', + 'LIQUIDSPROD_BBL_01':'OILPROD_01','GASPROD_MCF_01':'GASPROD_01','WATERPROD_BBL_01':'WATERPROD_01', + 'LIQUIDSPROD_BBL_02':'OILPROD_02','GASPROD_MCF_02':'GASPROD_02','WATERPROD_BBL_02':'WATERPROD_02', + 'LIQUIDSPROD_BBL_03':'OILPROD_03','GASPROD_MCF_03':'GASPROD_03','WATERPROD_BBL_03':'WATERPROD_03', + 'LIQUIDSPROD_BBL_04':'OILPROD_04','GASPROD_MCF_04':'GASPROD_04','WATERPROD_BBL_04':'WATERPROD_04', + 'LIQUIDSPROD_BBL_05':'OILPROD_05','GASPROD_MCF_05':'GASPROD_05','WATERPROD_BBL_05':'WATERPROD_05', + 'LIQUIDSPROD_BBL_06':'OILPROD_06','GASPROD_MCF_06':'GASPROD_06','WATERPROD_BBL_06':'WATERPROD_06', + 'LIQUIDSPROD_BBL_07':'OILPROD_07','GASPROD_MCF_07':'GASPROD_07','WATERPROD_BBL_07':'WATERPROD_07', + 'LIQUIDSPROD_BBL_08':'OILPROD_08','GASPROD_MCF_08':'GASPROD_08','WATERPROD_BBL_08':'WATERPROD_08', + 'LIQUIDSPROD_BBL_09':'OILPROD_09','GASPROD_MCF_09':'GASPROD_09','WATERPROD_BBL_09':'WATERPROD_09', + 'LIQUIDSPROD_BBL_10':'OILPROD_10','GASPROD_MCF_10':'GASPROD_10','WATERPROD_BBL_10':'WATERPROD_10', + 'LIQUIDSPROD_BBL_11':'OILPROD_11','GASPROD_MCF_11':'GASPROD_11','WATERPROD_BBL_11':'WATERPROD_11', + 'LIQUIDSPROD_BBL_12':'OILPROD_12','GASPROD_MCF_12':'GASPROD_12','WATERPROD_BBL_12':'WATERPROD_12',}) + .assign(WELL_COUNT=1) + ) + # Format completion date (YYYY-MM) + for iwell in range(0,len(Prism_data)): + comp_date = str(Prism_data.loc[iwell, 'COMPDATE']) + if comp_date == 'NaN': + comp_year_month = 'NaN' + elif comp_date == 'nan': + comp_year_month = 'NaN' + else: # date format YYYY-MM-DD + comp_month = f"{int(comp_date.split('-')[1]):02}" + comp_year = f"{int(comp_date.split('-')[0])}" + comp_year_month = str(comp_year)+'-'+str(comp_month) + Prism_data.loc[iwell, 'comp_year_month'] = comp_year_month + # Format spud date (YYYY) + for iwell in range(0,len(Prism_data)): + spud_date = str(Prism_data.loc[iwell, 'SPUDDATE']) + if spud_date == 'NaN': + spud_year = 'NaN' + elif spud_date == 'nan': + spud_year = 'NaN' + else: # date format YYYY-MM-DD + spud_year = f"{int(spud_date.split('-')[0])}" + spud_year = str(spud_year) + Prism_data.loc[iwell, 'spud_year'] = spud_year + # Format first production date (YYYY) + for iwell in range(0,len(Prism_data)): + first_prod_date = str(Prism_data.loc[iwell, 'FIRSTPRODDATE']) + if first_prod_date == 'NaN': + first_prod_year = 'NaN' + elif first_prod_date == 'nan': + first_prod_year = 'NaN' + else: # date format YYYY-MM-DD + first_prod_year = f"{int(first_prod_date.split('-')[0])}" + first_prod_year = 
str(first_prod_year)
+                Prism_data.loc[iwell, 'first_prod_year'] = first_prod_year
+        Prism_data_dict[f'{iyear}'] = Prism_data
+
+        # Combine into one array with common column names, replace NaNs with zeros,
+        # and sum annual production
+        Enverus_data = pd.concat([DI_data, Prism_data], ignore_index=True)
+        Enverus_data.loc[:, Enverus_data.columns.str.contains('GASPROD_')] = Enverus_data.loc[:, Enverus_data.columns.str.contains('GASPROD_')].fillna(0)
+        Enverus_data.loc[:, Enverus_data.columns.str.contains('OILPROD_')] = Enverus_data.loc[:, Enverus_data.columns.str.contains('OILPROD_')].fillna(0)
+        Enverus_data.loc[:, Enverus_data.columns.str.contains('WATERPROD_')] = Enverus_data.loc[:, Enverus_data.columns.str.contains('WATERPROD_')].fillna(0)
+
+        # Calculate cumulative annual production totals for gas, oil, and water
+        Enverus_data['CUM_GAS'] = Enverus_data.loc[:, Enverus_data.columns.str.contains('GASPROD_')].sum(axis=1)
+        Enverus_data['CUM_OIL'] = Enverus_data.loc[:, Enverus_data.columns.str.contains('OILPROD_')].sum(axis=1)
+        Enverus_data['CUM_WATER'] = Enverus_data.loc[:, Enverus_data.columns.str.contains('WATERPROD_')].sum(axis=1)
+
+        # Save out the data for that year
+        Enverus_data_dict[f'{iyear}'] = Enverus_data
+
+        del Prism_data
+        del DI_data  # save memory space
+
+    # Correct Enverus Data for Select States
+
+    # 1) Read in the coverage table from the ERG state well counts file
+    # (specifies the first year with bad data and which years need to be corrected;
+    # all years including and after the first bad year of data need to be corrected)
+
+    ERG_StateWellCounts_LastGoodDataYear = (pd.read_excel(
+        enverus_well_counts_path,
+        sheet_name="2021 - Coverage",
+        usecols={"State", "Last Good Year"},
+        skiprows=2,
+        nrows=40)
+    )
+
+    # 2) Loop through each state and year in Enverus to determine whether the data
+    # for that particular year needs to be corrected. At the moment, the only
+    # correction ERG makes to the data is to use the prior year of data if there is
+    # no new Enverus data reported for that state. If a particular state is not
+    # included for any years in the Enverus dataset, then a row of zeros is added to
+    # the Enverus table for that year.
+
+    for istate in np.arange(0, len(state_gdf)):
+        correctdata = 0
+        istate_code = state_gdf['state_code'][istate]
+        lastgoodyear = ERG_StateWellCounts_LastGoodDataYear['Last Good Year'][ERG_StateWellCounts_LastGoodDataYear['State'] == istate_code].values
+        if lastgoodyear == max_year:
+            lastgoodyear = max_year + 5  # if state isn't included in correction list, don't correct any data
+
+        for iyear in years:
+            enverus_data_temp = Enverus_data_dict[f'{iyear}'].copy()
+            state_list = np.unique(enverus_data_temp['STATE_CODE'])
+            if istate_code in state_list:
+                inlist = 1
+            else:
+                inlist = 0
+            if inlist == 1 or correctdata == 1:
+                # The state is included in the Enverus data, or had data for at
+                # least one good year. In the first year, correctdata will be zero,
+                # but inlist will also be zero if there is no Enverus data.
+                # Check whether corrections are necessary for the given year/state:
+                if iyear == lastgoodyear:
+                    print(istate_code, iyear, 'last good year')
+                    # This is the last year of good data.
Do not correct the data but save + # but so that this data can be used for all following years for that state + temp_data = enverus_data_temp[enverus_data_temp['STATE_CODE'] == istate_code] + correctdata=1 + elif iyear > lastgoodyear: + print(istate_code,iyear) + # correct data for all years equal to and after the first bad year (remove old data first if necessary) + if inlist == 1: + enverus_data_temp = enverus_data_temp[enverus_data_temp['STATE_CODE'] != istate_code] + enverus_data_temp = pd.concat([enverus_data_temp,temp_data],ignore_index=True) + print(istate_code +' data for ' +str(iyear) +' were corrected with '+str(lastgoodyear)+' data') + else: + no_corrections =1 + + if inlist==0 and correctdata==0: + # if there is no Enverus data for a given state, and there was no good data, add a row with default values + print(istate_code +' has no Enverus data in the year ' +str(iyear)) + + # save that year of Enverus data + enverus_data_temp.reset_index(drop=True,inplace=True) + Enverus_data_dict[f'{iyear}'] = enverus_data_temp.copy() + tempoutput_filename = f'formatted_raw_enverus_tempoutput_{iyear}.csv' + tempoutput_filepath = os.path.join(intermediate_outputs_path, tempoutput_filename) + enverus_data_temp.to_csv(tempoutput_filepath, index=False) + + return None diff --git a/gch4i/proxy_processing/task_ng_all_well_count_proxy.py b/gch4i/proxy_processing/task_ng_all_well_count_proxy.py new file mode 100644 index 0000000..c44f9fd --- /dev/null +++ b/gch4i/proxy_processing/task_ng_all_well_count_proxy.py @@ -0,0 +1,175 @@ +# %% +from pathlib import Path +import os +from typing import Annotated +from zipfile import ZipFile +import calendar +import datetime + +from pyarrow import parquet +import pandas as pd +import osgeo +import geopandas as gpd +import numpy as np +import seaborn as sns +import shapefile as shp +from pytask import Product, task, mark + +from gch4i.config import ( + V3_DATA_PATH, + proxy_data_dir_path, + global_data_dir_path, + sector_data_dir_path, + max_year, + min_year, + years, +) + +from gch4i.utils import us_state_to_abbrev +from gch4i.proxy_processing.ng_oil_production_utils import ( + calc_enverus_rel_emi, + enverus_df_to_gdf, + nei_data_years, + get_nei_file_name, + ng_well_count_file_names, + get_raw_NEI_data, +) + +# %% +@mark.persist +@task(id="ng_all_well_count_proxy") +def task_get_ng_all_well_count_proxy_data( + state_path: Path = global_data_dir_path / "tl_2020_us_state.zip", + enverus_production_path: Path = sector_data_dir_path / "enverus/production", + intermediate_outputs_path: Path = enverus_production_path / "intermediate_outputs", + nei_path: Path = sector_data_dir_path / "nei_og", + all_well_count_output_path: Annotated[Path, Product] = proxy_data_dir_path / "ng_all_well_count_proxy.parquet", + ): + """ + Data come from Enverus, both Drilling Info and Prism + The reason 2 datasets are used is because Prism does not include all states + So remaining states, or those with more DI coverage are taken from DI + + DI: KS, MD, MI, MO, OK, TN + + Prism: AK, AL, AR, AZ, CA, CAO (California Offshore), CO, FL, KY, LA, MS, MT, ND, + NE, NGOM (federal offshore waters in the Gulf of Mexico), NM, NV, NY, OH, OR, PA, + SD, TX, UT, VA, WV, WY + + States with no Enverus Data: CT, DE, DC, GA, HI, ID, IL*, IN*, IA, ME, MA, MN, NH, + NJ, NC, RI, SC, VT, WA, WI, US territories. These states are assumed to have no oil + and gas production with an exception for IL and IN. + + *IL and IN do not report to Enverus, but do have oil and gas production. 
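(Editorial sketch: the DI/Prism/EIA routing described in this docstring can be summarized as a small lookup. The state sets below are copied from the docstring; the helper function and its name are hypothetical, not part of this patch.)

    # Hypothetical helper illustrating which upstream dataset covers each state.
    DI_STATES = {"KS", "MD", "MI", "MO", "OK", "TN"}
    PRISM_STATES = {
        "AK", "AL", "AR", "AZ", "CA", "CAO", "CO", "FL", "KY", "LA", "MS", "MT",
        "ND", "NE", "NGOM", "NM", "NV", "NY", "OH", "OR", "PA", "SD", "TX", "UT",
        "VA", "WV", "WY",
    }
    EIA_STATES = {"IL", "IN"}  # no Enverus coverage; production taken from EIA

    def enverus_source_for(state_code: str) -> str:
        """Return which upstream dataset supplies wells for a state code (sketch)."""
        if state_code in DI_STATES:
            return "DI"
        if state_code in PRISM_STATES:
            return "Prism"
        if state_code in EIA_STATES:
            return "EIA"
        return "none"  # assumed to have no oil and gas production

    assert enverus_source_for("OK") == "DI"
    assert enverus_source_for("TX") == "Prism"
    assert enverus_source_for("IL") == "EIA"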
Production
+    data for these two states is taken from the Energy Information Administration (EIA).
+
+    """
+
+    # Load in State ANSI data
+    state_gdf = (
+        gpd.read_file(state_path)
+        .loc[:, ["NAME", "STATEFP", "STUSPS", "geometry"]]
+        .rename(columns=str.lower)
+        .rename(columns={"stusps": "state_code", "name": "state_name"})
+        .astype({"statefp": int})
+        # get only lower 48 + DC
+        .query("(statefp < 60) & (statefp != 2) & (statefp != 15)")
+        .reset_index(drop=True)
+        .to_crs(4326)
+    )
+
+    # Make annual gridded arrays (maps) of well data (a well is counted every month
+    # if there is any production that year). Includes NA gas wells and production
+    # onshore in the CONUS region. Source emissions are related to the presence of
+    # a well and its production status (no emission if no production).
+    # Details: ERG does not include a well in the national count if there is no
+    # (cumulative) oil or gas production from that well. Wells are not considered
+    # active for a given year if there is no production data that year. This may
+    # cause wells that are completed but not yet producing to be dropped from the
+    # national count. ERG has developed their own logic to determine whether a well
+    # is an HF well or not, and that result is included in the HF variable in this
+    # dataset; this method does not rely on the Enverus well 'Producing Status'.
+    # Well type (e.g., non-associated gas well) is determined based on the annual
+    # production GOR at that well (CUM_GAS / CUM_OIL), but the presence of a well is
+    # only included in maps in months where monthly gas prod > 0.
+
+    # Proxy Data Dataframes:
+    all_well_count_df = pd.DataFrame()  # Active gas well (conventional + HF) counts in a given month
+
+    ## Enverus DI and Prism Data:
+    # Read in and query formatted and corrected Enverus data to create dictionaries
+    # of proxy data (Enverus data is from task_enverus_di_prism_data_processing.py)
+    for iyear in years:
+        enverus_file_name_iyear = f"formatted_raw_enverus_tempoutput_{iyear}.csv"
+        enverus_file_path_iyear = os.path.join(intermediate_outputs_path, enverus_file_name_iyear)
+        ng_data_temp = (pd.read_csv(enverus_file_path_iyear, dtype={3: 'str', 'spud_year': str, 'first_prod_year': str})
+                        .query("STATE_CODE.isin(@state_gdf['state_code'])")
+                        .query("OFFSHORE == 'N'")
+                        .query("CUM_GAS > 0")
+                        .assign(gas_to_oil_ratio=lambda df: df['CUM_GAS'] / df['CUM_OIL'])
+                        .assign(year=str(iyear))
+                        .replace(np.inf, 0)
+                        .astype({"spud_year": str, "first_prod_year": str})
+                        .query("gas_to_oil_ratio > 100 | GOR_QUAL == 'Gas only'")
+                        )
+
+        # Include wells in the map only for months where there is gas production
+        # (emissions occur only when production is occurring)
+        for imonth in range(1, 13):
+            imonth_str = f"{imonth:02}"  # convert to 2-digit months
+            year_month_str = str(iyear) + '-' + imonth_str
+            gas_prod_str = 'GASPROD_' + imonth_str
+            # Onshore data for imonth
+            ng_data_imonth_temp = (ng_data_temp
+                                   .query(f"{gas_prod_str} > 0")
+                                   .assign(year_month=year_month_str)
+                                   )
+            ng_data_imonth_temp = (ng_data_imonth_temp[[
+                'year', 'year_month', 'STATE_CODE', 'AAPG_CODE_ERG', 'LATITUDE', 'LONGITUDE',
+                'HF', 'WELL_COUNT', gas_prod_str,
+                'comp_year_month', 'spud_year', 'first_prod_year']]
+                )
+            # All Gas Well Count
+            all_well_count_imonth = (ng_data_imonth_temp[['year', 'year_month', 'STATE_CODE', 'LATITUDE', 'LONGITUDE', 'WELL_COUNT']]
+                                     .rename(columns=lambda x: str(x).lower())
+                                     .rename(columns={"well_count": "proxy_data"})
+                                     .reset_index(drop=True)
+                                     )
+            all_well_count_df = pd.concat([all_well_count_df, all_well_count_imonth])
+
+        # Delete unused temp data
+        del
ng_data_temp + del ng_data_imonth_temp + del all_well_count_imonth + + # Calculate relative emissions and convert to a geodataframe + all_well_count_df = calc_enverus_rel_emi(all_well_count_df) + all_well_count_df = enverus_df_to_gdf(all_well_count_df) + + # NEI Data: + nei_df = pd.DataFrame() + + for iyear in years: + nei_data_year = nei_data_years[nei_data_years['year'] == iyear]['nei_data'].values[0] + # Well Count + ifile_name = get_nei_file_name(nei_data_year, ng_well_count_file_names) + nei_iyear = get_raw_NEI_data(iyear, nei_data_year, ifile_name) + nei_df = pd.concat([nei_df, nei_iyear]) + + # Convert NEI Data to GDF and polygon to centroid point + nei_df = gpd.GeoDataFrame(nei_df, crs=4326) + nei_df = nei_df.to_crs(3857) # projected CRS for centroid calculation + nei_df.loc[:, 'geometry'] = nei_df.loc[:, 'geometry'].centroid + nei_df = nei_df.to_crs(4326) + + # Add NEI Data to Enverus Data + all_well_count_df = pd.concat([all_well_count_df, nei_df]).reset_index(drop=True) + + # Delete unused temp data + del nei_iyear + del nei_df + + # Check that relative emissions sum to 1.0 each state/year combination + sums = all_well_count_df.groupby(["state_code", "year"])["rel_emi"].sum() # get sums to check normalization + assert np.isclose(sums, 1.0, atol=1e-8).all(), f"Relative emissions do not sum to 1 for each year and state; {sums}" # assert that the sums are close to 1 + + # Output Proxy Parquet Files + all_well_count_df = all_well_count_df.astype({'year':str}) + all_well_count_df.to_parquet(all_well_count_output_path) + + return None diff --git a/gch4i/proxy_processing/task_ng_all_well_prod_proxy.py b/gch4i/proxy_processing/task_ng_all_well_prod_proxy.py new file mode 100644 index 0000000..58273a9 --- /dev/null +++ b/gch4i/proxy_processing/task_ng_all_well_prod_proxy.py @@ -0,0 +1,176 @@ +# %% +from pathlib import Path +import os +from typing import Annotated +from zipfile import ZipFile +import calendar +import datetime + +from pyarrow import parquet +import pandas as pd +import osgeo +import geopandas as gpd +import numpy as np +import seaborn as sns +import shapefile as shp +from pytask import Product, task, mark + +from gch4i.config import ( + V3_DATA_PATH, + proxy_data_dir_path, + global_data_dir_path, + sector_data_dir_path, + max_year, + min_year, + years, +) + +from gch4i.utils import us_state_to_abbrev +from gch4i.proxy_processing.ng_oil_production_utils import ( + calc_enverus_rel_emi, + enverus_df_to_gdf, + nei_data_years, + get_nei_file_name, + ng_gas_prod_file_names, + get_raw_NEI_data, +) + +# %% +@mark.persist +@task(id="ng_all_well_prod_proxy") +def task_get_ng_all_well_prod_proxy_data( + state_path: Path = global_data_dir_path / "tl_2020_us_state.zip", + enverus_production_path: Path = sector_data_dir_path / "enverus/production", + intermediate_outputs_path: Path = enverus_production_path / "intermediate_outputs", + nei_path: Path = sector_data_dir_path / "nei_og", + all_well_prod_output_path: Annotated[Path, Product] = proxy_data_dir_path / "ng_all_well_prod_proxy.parquet", + ): + """ + Data come from Enverus, both Drilling Info and Prism + The reason 2 datasets are used is because Prism does not include all states + So remaining states, or those with more DI coverage are taken from DI + + DI: KS, MD, MI, MO, OK, TN + + Prism: AK, AL, AR, AZ, CA, CAO (California Offshore), CO, FL, KY, LA, MS, MT, ND, + NE, NGOM (federal offshore waters in the Gulf of Mexico), NM, NV, NY, OH, OR, PA, + SD, TX, UT, VA, WV, WY + + States with no Enverus Data: CT, DE, DC, GA, HI, ID, 
IL*, IN*, IA, ME, MA, MN, NH, + NJ, NC, RI, SC, VT, WA, WI, US territories. These states are assumed to have no oil + and gas production with an exception for IL and IN. + + *IL and IN do not report to Enverus, but do have oil and gas production. Production + data is taken from the Energy Information Administration (EIA). + + """ + + # Load in State ANSI data + state_gdf = ( + gpd.read_file(state_path) + .loc[:, ["NAME", "STATEFP", "STUSPS", "geometry"]] + .rename(columns=str.lower) + .rename(columns={"stusps": "state_code", "name": "state_name"}) + .astype({"statefp": int}) + # get only lower 48 + DC + .query("(statefp < 60) & (statefp != 2) & (statefp != 15)") + .reset_index(drop=True) + .to_crs(4326) + ) + + # Make Annual gridded arrays (maps) of well data (a well will be counted every month if there is any production that year) + # Includes NA Gas Wells and Production onshore in the CONUS region + # source emissions are related to the presence of a well and its production status (no emission if no production) + # Details: ERG does not include a well in the national count if there is no (cummulative) oil or gas production from that well. + # Wells are not considered active for a given year if there is no production data that year + # This may cause wells that are coadmpleted but not yet producing to be dropped from the national count. + # ERG has developed their own logic to determine if a well is an HF well or not and that result is included in the + # HF variable in this dataset. This method does not rely on the Enverus well 'Producing Status' + # Well Type (e.g., non-associated gas well) is determined based on annual production GOR at that well (CUM OIL/ CUM GAS), + # but the presence of a well will only be included in maps in months where monthly gas prod > 0 + + # Proxy Data Dataframes: + all_well_prod_df = pd.DataFrame() # Active gas well (conventional + HF) gas production in a given month + + ## Enverus DI and Prism Data: + # Read in and query formatted and corrrected Enverus data to create dictionaries of + # proxy data (Enverus data is from task_enverus_di_prism_data_processing.py) + for iyear in years: + enverus_file_name_iyear = f"formatted_raw_enverus_tempoutput_{iyear}.csv" + enverus_file_path_iyear = os.path.join(intermediate_outputs_path, enverus_file_name_iyear) + ng_data_temp = (pd.read_csv(enverus_file_path_iyear, dtype={3:'str', 'spud_year': str, 'first_prod_year': str}) + .query("STATE_CODE.isin(@state_gdf['state_code'])") + .query("OFFSHORE == 'N'") + .query("CUM_GAS > 0") + .assign(gas_to_oil_ratio=lambda df: df['CUM_GAS']/df['CUM_OIL']) + .assign(year=str(iyear)) + .replace(np.inf, 0) + .astype({"spud_year": str, "first_prod_year": str}) + .query("gas_to_oil_ratio > 100 | GOR_QUAL == 'Gas only'") + ) + + # Include wells in map only for months where there is gas production (emissions ~ when production is occuring) + for imonth in range(1,13): + imonth_str = f"{imonth:02}" # convert to 2-digit months + year_month_str = str(iyear)+'-'+imonth_str + gas_prod_str = 'GASPROD_'+imonth_str + # Onshore data for imonth + ng_data_imonth_temp = (ng_data_temp + .query(f"{gas_prod_str} > 0") + .assign(year_month=str(iyear)+'-'+imonth_str) + ) + ng_data_imonth_temp = (ng_data_imonth_temp[[ + 'year', 'year_month','STATE_CODE','AAPG_CODE_ERG','LATITUDE','LONGITUDE', + 'HF','WELL_COUNT',gas_prod_str, + 'comp_year_month','spud_year','first_prod_year']] + ) + # All Gas Well Gas Production + all_well_prod_imonth = 
(ng_data_imonth_temp[['year','year_month','STATE_CODE','LATITUDE','LONGITUDE',gas_prod_str]] + .assign(proxy_data=lambda df: df[gas_prod_str]) + .drop(columns=[gas_prod_str]) + .rename(columns=lambda x: str(x).lower()) + .reset_index(drop=True) + ) + all_well_prod_df = pd.concat([all_well_prod_df,all_well_prod_imonth]) + + # Delete unused temp data + del ng_data_temp + del ng_data_imonth_temp + del all_well_prod_imonth + + # Calculate relative emissions and convert to a geodataframe + all_well_prod_df = calc_enverus_rel_emi(all_well_prod_df) + all_well_prod_df = enverus_df_to_gdf(all_well_prod_df) + + # NEI Data: + nei_df = pd.DataFrame() + + for iyear in years: + nei_data_year = nei_data_years[nei_data_years['year'] == iyear]['nei_data'].values[0] + # Gas Production + ifile_name = get_nei_file_name(nei_data_year, ng_gas_prod_file_names) + nei_iyear = get_raw_NEI_data(iyear, nei_data_year, ifile_name) + nei_df = pd.concat([nei_df, nei_iyear]) + + # Convert NEI Data to GDF and polygon to centroid point + nei_df = gpd.GeoDataFrame(nei_df, crs=4326) + nei_df = nei_df.to_crs(3857) # projected CRS for centroid calculation + nei_df.loc[:, 'geometry'] = nei_df.loc[:, 'geometry'].centroid + nei_df = nei_df.to_crs(4326) + + # Add NEI Data to Enverus Data + all_well_prod_df = pd.concat([all_well_prod_df, nei_df]).reset_index(drop=True) + + # Delete unused temp data + del nei_iyear + del nei_df + + # Check that relative emissions sum to 1.0 each state/year combination + sums = all_well_prod_df.groupby(["state_code", "year"])["rel_emi"].sum() # get sums to check normalization + assert np.isclose(sums, 1.0, atol=1e-8).all(), f"Relative emissions do not sum to 1 for each year and state; {sums}" # assert that the sums are close to 1 + + # Output Proxy Parquet Files + all_well_prod_df = all_well_prod_df.astype({'year':str}) + all_well_prod_df.to_parquet(all_well_prod_output_path) + + return None diff --git a/gch4i/proxy_processing/task_ng_basin_220_prod_proxy.py b/gch4i/proxy_processing/task_ng_basin_220_prod_proxy.py new file mode 100644 index 0000000..fc4020c --- /dev/null +++ b/gch4i/proxy_processing/task_ng_basin_220_prod_proxy.py @@ -0,0 +1,153 @@ +# %% +from pathlib import Path +import os +from typing import Annotated +from zipfile import ZipFile +import calendar +import datetime + +from pyarrow import parquet +import pandas as pd +import osgeo +import geopandas as gpd +import numpy as np +import seaborn as sns +import shapefile as shp +from pytask import Product, task, mark + +from gch4i.config import ( + V3_DATA_PATH, + proxy_data_dir_path, + global_data_dir_path, + sector_data_dir_path, + max_year, + min_year, + years, +) + +from gch4i.utils import us_state_to_abbrev +from gch4i.proxy_processing.ng_oil_production_utils import ( + calc_enverus_rel_emi, + enverus_df_to_gdf, +) + +# %% +@mark.persist +@task(id="ng_basin_220_prod_proxy") +def task_get_ng_basin_220_prod_proxy_data( + state_path: Path = global_data_dir_path / "tl_2020_us_state.zip", + enverus_production_path: Path = sector_data_dir_path / "enverus/production", + intermediate_outputs_path: Path = enverus_production_path / "intermediate_outputs", + basin_220_prod_output_path: Annotated[Path, Product] = proxy_data_dir_path / "ng_basin_220_prod_proxy.parquet", + ): + """ + Data come from Enverus, both Drilling Info and Prism + The reason 2 datasets are used is because Prism does not include all states + So remaining states, or those with more DI coverage are taken from DI + + DI: KS, MD, MI, MO, OK, TN + + Prism: AK, AL, AR, AZ, CA, CAO 
(California Offshore), CO, FL, KY, LA, MS, MT, ND, + NE, NGOM (federal offshore waters in the Gulf of Mexico), NM, NV, NY, OH, OR, PA, + SD, TX, UT, VA, WV, WY + + States with no Enverus Data: CT, DE, DC, GA, HI, ID, IL*, IN*, IA, ME, MA, MN, NH, + NJ, NC, RI, SC, VT, WA, WI, US territories. These states are assumed to have no oil + and gas production with an exception for IL and IN. + + *IL and IN do not report to Enverus, but do have oil and gas production. Production + data is taken from the Energy Information Administration (EIA). + + """ + + # Load in State ANSI data + state_gdf = ( + gpd.read_file(state_path) + .loc[:, ["NAME", "STATEFP", "STUSPS", "geometry"]] + .rename(columns=str.lower) + .rename(columns={"stusps": "state_code", "name": "state_name"}) + .astype({"statefp": int}) + # get only lower 48 + DC + .query("(statefp < 60) & (statefp != 2) & (statefp != 15)") + .reset_index(drop=True) + .to_crs(4326) + ) + + # Make Annual gridded arrays (maps) of well data (a well will be counted every month if there is any production that year) + # Includes NA Gas Wells and Production onshore in the CONUS region + # source emissions are related to the presence of a well and its production status (no emission if no production) + # Details: ERG does not include a well in the national count if there is no (cummulative) oil or gas production from that well. + # Wells are not considered active for a given year if there is no production data that year + # This may cause wells that are coadmpleted but not yet producing to be dropped from the national count. + # ERG has developed their own logic to determine if a well is an HF well or not and that result is included in the + # HF variable in this dataset. This method does not rely on the Enverus well 'Producing Status' + # Well Type (e.g., non-associated gas well) is determined based on annual production GOR at that well (CUM OIL/ CUM GAS), + # but the presence of a well will only be included in maps in months where monthly gas prod > 0 + + # Proxy Data Dataframes: + basin_220_prod_df = pd.DataFrame() # Gas well gas production in Basin 220 in a given month + + ## Enverus DI and Prism Data: + # Read in and query formatted and corrrected Enverus data to create dictionaries of + # proxy data (Enverus data is from task_enverus_di_prism_data_processing.py) + for iyear in years: + enverus_file_name_iyear = f"formatted_raw_enverus_tempoutput_{iyear}.csv" + enverus_file_path_iyear = os.path.join(intermediate_outputs_path, enverus_file_name_iyear) + ng_data_temp = (pd.read_csv(enverus_file_path_iyear, dtype={3:'str', 'spud_year': str, 'first_prod_year': str}) + .query("STATE_CODE.isin(@state_gdf['state_code'])") + .query("OFFSHORE == 'N'") + .query("CUM_GAS > 0") + .assign(gas_to_oil_ratio=lambda df: df['CUM_GAS']/df['CUM_OIL']) + .assign(year=str(iyear)) + .replace(np.inf, 0) + .astype({"spud_year": str, "first_prod_year": str}) + .query("gas_to_oil_ratio > 100 | GOR_QUAL == 'Gas only'") + ) + + # Include wells in map only for months where there is gas production (emissions ~ when production is occuring) + for imonth in range(1,13): + imonth_str = f"{imonth:02}" # convert to 2-digit months + year_month_str = str(iyear)+'-'+imonth_str + gas_prod_str = 'GASPROD_'+imonth_str + # Onshore data for imonth + ng_data_imonth_temp = (ng_data_temp + .query(f"{gas_prod_str} > 0") + .assign(year_month=str(iyear)+'-'+imonth_str) + ) + ng_data_imonth_temp = (ng_data_imonth_temp[[ + 'year', 'year_month','STATE_CODE','AAPG_CODE_ERG','LATITUDE','LONGITUDE', + 
'HF','WELL_COUNT',gas_prod_str, + 'comp_year_month','spud_year','first_prod_year']] + ) + # Basin 220 Gas Well Gas Production + basin_220_prod_imonth = (ng_data_imonth_temp[['year','year_month','STATE_CODE','LATITUDE','LONGITUDE','AAPG_CODE_ERG',gas_prod_str]] + .query("AAPG_CODE_ERG == '220'") + .assign(proxy_data=lambda df: df[gas_prod_str]) + .drop(columns=[gas_prod_str, 'AAPG_CODE_ERG']) + .rename(columns=lambda x: str(x).lower()) + .reset_index(drop=True) + ) + basin_220_prod_df = pd.concat([basin_220_prod_df,basin_220_prod_imonth]) + + # Delete unused temp data + del ng_data_temp + del ng_data_imonth_temp + del basin_220_prod_imonth + + # Calculate relative emissions and convert to a geodataframe + basin_220_prod_df = calc_enverus_rel_emi(basin_220_prod_df) + basin_220_prod_df = enverus_df_to_gdf(basin_220_prod_df) + + # NEI Data: + # No addition of NEI data because IL and IN are not in this basin. We are adding + # them to the "other" basin. + + # Check that relative emissions sum to 1.0 each state/year combination + sums = basin_220_prod_df.groupby(["state_code", "year"])["rel_emi"].sum() # get sums to check normalization + assert np.isclose(sums, 1.0, atol=1e-8).all(), f"Relative emissions do not sum to 1 for each year and state; {sums}" # assert that the sums are close to 1 + + # Output Proxy Parquet Files + basin_220_prod_df = basin_220_prod_df.astype({'year':str}) + basin_220_prod_df.to_parquet(basin_220_prod_output_path) + + return None diff --git a/gch4i/proxy_processing/task_ng_basin_395_prod_proxy.py b/gch4i/proxy_processing/task_ng_basin_395_prod_proxy.py new file mode 100644 index 0000000..67a9c39 --- /dev/null +++ b/gch4i/proxy_processing/task_ng_basin_395_prod_proxy.py @@ -0,0 +1,153 @@ +# %% +from pathlib import Path +import os +from typing import Annotated +from zipfile import ZipFile +import calendar +import datetime + +from pyarrow import parquet +import pandas as pd +import osgeo +import geopandas as gpd +import numpy as np +import seaborn as sns +import shapefile as shp +from pytask import Product, task, mark + +from gch4i.config import ( + V3_DATA_PATH, + proxy_data_dir_path, + global_data_dir_path, + sector_data_dir_path, + max_year, + min_year, + years, +) + +from gch4i.utils import us_state_to_abbrev +from gch4i.proxy_processing.ng_oil_production_utils import ( + calc_enverus_rel_emi, + enverus_df_to_gdf, +) + +# %% +@mark.persist +@task(id="ng_basin_395_prod_proxy") +def task_get_ng_basin_395_prod_proxy_data( + state_path: Path = global_data_dir_path / "tl_2020_us_state.zip", + enverus_production_path: Path = sector_data_dir_path / "enverus/production", + intermediate_outputs_path: Path = enverus_production_path / "intermediate_outputs", + basin_395_prod_output_path: Annotated[Path, Product] = proxy_data_dir_path / "ng_basin_395_prod_proxy.parquet", + ): + """ + Data come from Enverus, both Drilling Info and Prism + The reason 2 datasets are used is because Prism does not include all states + So remaining states, or those with more DI coverage are taken from DI + + DI: KS, MD, MI, MO, OK, TN + + Prism: AK, AL, AR, AZ, CA, CAO (California Offshore), CO, FL, KY, LA, MS, MT, ND, + NE, NGOM (federal offshore waters in the Gulf of Mexico), NM, NV, NY, OH, OR, PA, + SD, TX, UT, VA, WV, WY + + States with no Enverus Data: CT, DE, DC, GA, HI, ID, IL*, IN*, IA, ME, MA, MN, NH, + NJ, NC, RI, SC, VT, WA, WI, US territories. These states are assumed to have no oil + and gas production with an exception for IL and IN. 
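(Editorial sketch: the basin proxies in this patch split monthly well rows on `AAPG_CODE_ERG`, with codes 220, 395, and 430 each getting their own file and everything else falling to the "other" proxy. The toy rows below are invented for illustration; the code is compared as a string, matching the queries in these files.)

    import pandas as pd

    # Toy monthly well rows; AAPG_CODE_ERG is read from Enverus as a string.
    wells = pd.DataFrame({
        "AAPG_CODE_ERG": ["220", "395", "430", "160A", "220"],
        "GASPROD_01": [10.0, 5.0, 2.0, 8.0, 30.0],
    })

    basin_220 = wells.query("AAPG_CODE_ERG == '220'")
    basin_other = wells.query(
        "AAPG_CODE_ERG != '220' & AAPG_CODE_ERG != '395' & AAPG_CODE_ERG != '430'"
    )

    assert len(basin_220) == 2 and len(basin_other) == 1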
+ + *IL and IN do not report to Enverus, but do have oil and gas production. Production + data is taken from the Energy Information Administration (EIA). + + """ + + # Load in State ANSI data + state_gdf = ( + gpd.read_file(state_path) + .loc[:, ["NAME", "STATEFP", "STUSPS", "geometry"]] + .rename(columns=str.lower) + .rename(columns={"stusps": "state_code", "name": "state_name"}) + .astype({"statefp": int}) + # get only lower 48 + DC + .query("(statefp < 60) & (statefp != 2) & (statefp != 15)") + .reset_index(drop=True) + .to_crs(4326) + ) + + # Make Annual gridded arrays (maps) of well data (a well will be counted every month if there is any production that year) + # Includes NA Gas Wells and Production onshore in the CONUS region + # source emissions are related to the presence of a well and its production status (no emission if no production) + # Details: ERG does not include a well in the national count if there is no (cummulative) oil or gas production from that well. + # Wells are not considered active for a given year if there is no production data that year + # This may cause wells that are coadmpleted but not yet producing to be dropped from the national count. + # ERG has developed their own logic to determine if a well is an HF well or not and that result is included in the + # HF variable in this dataset. This method does not rely on the Enverus well 'Producing Status' + # Well Type (e.g., non-associated gas well) is determined based on annual production GOR at that well (CUM OIL/ CUM GAS), + # but the presence of a well will only be included in maps in months where monthly gas prod > 0 + + # Proxy Data Dataframes: + basin_395_prod_df = pd.DataFrame() # Gas well gas production in Basin 395 in a given month + + ## Enverus DI and Prism Data: + # Read in and query formatted and corrrected Enverus data to create dictionaries of + # proxy data (Enverus data is from task_enverus_di_prism_data_processing.py) + for iyear in years: + enverus_file_name_iyear = f"formatted_raw_enverus_tempoutput_{iyear}.csv" + enverus_file_path_iyear = os.path.join(intermediate_outputs_path, enverus_file_name_iyear) + ng_data_temp = (pd.read_csv(enverus_file_path_iyear, dtype={3:'str', 'spud_year': str, 'first_prod_year': str}) + .query("STATE_CODE.isin(@state_gdf['state_code'])") + .query("OFFSHORE == 'N'") + .query("CUM_GAS > 0") + .assign(gas_to_oil_ratio=lambda df: df['CUM_GAS']/df['CUM_OIL']) + .assign(year=str(iyear)) + .replace(np.inf, 0) + .astype({"spud_year": str, "first_prod_year": str}) + .query("gas_to_oil_ratio > 100 | GOR_QUAL == 'Gas only'") + ) + + # Include wells in map only for months where there is gas production (emissions ~ when production is occuring) + for imonth in range(1,13): + imonth_str = f"{imonth:02}" # convert to 2-digit months + year_month_str = str(iyear)+'-'+imonth_str + gas_prod_str = 'GASPROD_'+imonth_str + # Onshore data for imonth + ng_data_imonth_temp = (ng_data_temp + .query(f"{gas_prod_str} > 0") + .assign(year_month=str(iyear)+'-'+imonth_str) + ) + ng_data_imonth_temp = (ng_data_imonth_temp[[ + 'year', 'year_month','STATE_CODE','AAPG_CODE_ERG','LATITUDE','LONGITUDE', + 'HF','WELL_COUNT',gas_prod_str, + 'comp_year_month','spud_year','first_prod_year']] + ) + # Basin 395 Gas Well Gas Production + basin_395_prod_imonth = (ng_data_imonth_temp[['year','year_month','STATE_CODE','LATITUDE','LONGITUDE','AAPG_CODE_ERG',gas_prod_str]] + .query("AAPG_CODE_ERG == '395'") + .assign(proxy_data=lambda df: df[gas_prod_str]) + .drop(columns=[gas_prod_str, 'AAPG_CODE_ERG']) + 
.rename(columns=lambda x: str(x).lower()) + .reset_index(drop=True) + ) + basin_395_prod_df = pd.concat([basin_395_prod_df,basin_395_prod_imonth]) + + # Delete unused temp data + del ng_data_temp + del ng_data_imonth_temp + del basin_395_prod_imonth + + # Calculate relative emissions and convert to a geodataframe + basin_395_prod_df = calc_enverus_rel_emi(basin_395_prod_df) + basin_395_prod_df = enverus_df_to_gdf(basin_395_prod_df) + + # NEI Data: + # No addition of NEI data because IL and IN are not in this basin. We are adding + # them to the "other" basin. + + # Check that relative emissions sum to 1.0 each state/year combination + sums = basin_395_prod_df.groupby(["state_code", "year"])["rel_emi"].sum() # get sums to check normalization + assert np.isclose(sums, 1.0, atol=1e-8).all(), f"Relative emissions do not sum to 1 for each year and state; {sums}" # assert that the sums are close to 1 + + # Output Proxy Parquet Files + basin_395_prod_df = basin_395_prod_df.astype({'year':str}) + basin_395_prod_df.to_parquet(basin_395_prod_output_path) + + return None diff --git a/gch4i/proxy_processing/task_ng_basin_430_prod_proxy.py b/gch4i/proxy_processing/task_ng_basin_430_prod_proxy.py new file mode 100644 index 0000000..f8d082a --- /dev/null +++ b/gch4i/proxy_processing/task_ng_basin_430_prod_proxy.py @@ -0,0 +1,153 @@ +# %% +from pathlib import Path +import os +from typing import Annotated +from zipfile import ZipFile +import calendar +import datetime + +from pyarrow import parquet +import pandas as pd +import osgeo +import geopandas as gpd +import numpy as np +import seaborn as sns +import shapefile as shp +from pytask import Product, task, mark + +from gch4i.config import ( + V3_DATA_PATH, + proxy_data_dir_path, + global_data_dir_path, + sector_data_dir_path, + max_year, + min_year, + years, +) + +from gch4i.utils import us_state_to_abbrev +from gch4i.proxy_processing.ng_oil_production_utils import ( + calc_enverus_rel_emi, + enverus_df_to_gdf, +) + +# %% +@mark.persist +@task(id="ng_basin_430_prod_proxy") +def task_get_ng_basin_430_prod_proxy_data( + state_path: Path = global_data_dir_path / "tl_2020_us_state.zip", + enverus_production_path: Path = sector_data_dir_path / "enverus/production", + intermediate_outputs_path: Path = enverus_production_path / "intermediate_outputs", + basin_430_prod_output_path: Annotated[Path, Product] = proxy_data_dir_path / "ng_basin_430_prod_proxy.parquet", + ): + """ + Data come from Enverus, both Drilling Info and Prism + The reason 2 datasets are used is because Prism does not include all states + So remaining states, or those with more DI coverage are taken from DI + + DI: KS, MD, MI, MO, OK, TN + + Prism: AK, AL, AR, AZ, CA, CAO (California Offshore), CO, FL, KY, LA, MS, MT, ND, + NE, NGOM (federal offshore waters in the Gulf of Mexico), NM, NV, NY, OH, OR, PA, + SD, TX, UT, VA, WV, WY + + States with no Enverus Data: CT, DE, DC, GA, HI, ID, IL*, IN*, IA, ME, MA, MN, NH, + NJ, NC, RI, SC, VT, WA, WI, US territories. These states are assumed to have no oil + and gas production with an exception for IL and IN. + + *IL and IN do not report to Enverus, but do have oil and gas production. Production + data is taken from the Energy Information Administration (EIA). 
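(Editorial sketch: `calc_enverus_rel_emi` is imported from `ng_oil_production_utils` and its body is not shown in this patch. The sketch below illustrates the state/year normalization it is assumed to perform, consistent with the assertion each task checks before writing its parquet file: `rel_emi` sums to 1.0 over all monthly rows within a state/year.)

    import numpy as np
    import pandas as pd

    def calc_rel_emi_sketch(df: pd.DataFrame) -> pd.DataFrame:
        """Normalize proxy_data to state/year relative emissions (sketch only)."""
        df = df.copy()
        df["rel_emi"] = df.groupby(["state_code", "year"])["proxy_data"].transform(
            lambda x: x / x.sum() if x.sum() > 0 else 0
        )
        return df.drop(columns="proxy_data")

    demo = pd.DataFrame({
        "state_code": ["TX", "TX", "OK"],
        "year": ["2020", "2020", "2020"],
        "proxy_data": [3.0, 1.0, 5.0],
    })
    out = calc_rel_emi_sketch(demo)
    sums = out.groupby(["state_code", "year"])["rel_emi"].sum()
    assert np.isclose(sums, 1.0, atol=1e-8).all()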
+ + """ + + # Load in State ANSI data + state_gdf = ( + gpd.read_file(state_path) + .loc[:, ["NAME", "STATEFP", "STUSPS", "geometry"]] + .rename(columns=str.lower) + .rename(columns={"stusps": "state_code", "name": "state_name"}) + .astype({"statefp": int}) + # get only lower 48 + DC + .query("(statefp < 60) & (statefp != 2) & (statefp != 15)") + .reset_index(drop=True) + .to_crs(4326) + ) + + # Make Annual gridded arrays (maps) of well data (a well will be counted every month if there is any production that year) + # Includes NA Gas Wells and Production onshore in the CONUS region + # source emissions are related to the presence of a well and its production status (no emission if no production) + # Details: ERG does not include a well in the national count if there is no (cummulative) oil or gas production from that well. + # Wells are not considered active for a given year if there is no production data that year + # This may cause wells that are coadmpleted but not yet producing to be dropped from the national count. + # ERG has developed their own logic to determine if a well is an HF well or not and that result is included in the + # HF variable in this dataset. This method does not rely on the Enverus well 'Producing Status' + # Well Type (e.g., non-associated gas well) is determined based on annual production GOR at that well (CUM OIL/ CUM GAS), + # but the presence of a well will only be included in maps in months where monthly gas prod > 0 + + # Proxy Data Dataframes: + basin_430_prod_df = pd.DataFrame() # Gas well gas production in Basin 430 in a given month + + ## Enverus DI and Prism Data: + # Read in and query formatted and corrrected Enverus data to create dictionaries of + # proxy data (Enverus data is from task_enverus_di_prism_data_processing.py) + for iyear in years: + enverus_file_name_iyear = f"formatted_raw_enverus_tempoutput_{iyear}.csv" + enverus_file_path_iyear = os.path.join(intermediate_outputs_path, enverus_file_name_iyear) + ng_data_temp = (pd.read_csv(enverus_file_path_iyear, dtype={3:'str', 'spud_year': str, 'first_prod_year': str}) + .query("STATE_CODE.isin(@state_gdf['state_code'])") + .query("OFFSHORE == 'N'") + .query("CUM_GAS > 0") + .assign(gas_to_oil_ratio=lambda df: df['CUM_GAS']/df['CUM_OIL']) + .assign(year=str(iyear)) + .replace(np.inf, 0) + .astype({"spud_year": str, "first_prod_year": str}) + .query("gas_to_oil_ratio > 100 | GOR_QUAL == 'Gas only'") + ) + + # Include wells in map only for months where there is gas production (emissions ~ when production is occuring) + for imonth in range(1,13): + imonth_str = f"{imonth:02}" # convert to 2-digit months + year_month_str = str(iyear)+'-'+imonth_str + gas_prod_str = 'GASPROD_'+imonth_str + # Onshore data for imonth + ng_data_imonth_temp = (ng_data_temp + .query(f"{gas_prod_str} > 0") + .assign(year_month=str(iyear)+'-'+imonth_str) + ) + ng_data_imonth_temp = (ng_data_imonth_temp[[ + 'year', 'year_month','STATE_CODE','AAPG_CODE_ERG','LATITUDE','LONGITUDE', + 'HF','WELL_COUNT',gas_prod_str, + 'comp_year_month','spud_year','first_prod_year']] + ) + # Basin 430 Gas Well Gas Production + basin_430_prod_imonth = (ng_data_imonth_temp[['year','year_month','STATE_CODE','LATITUDE','LONGITUDE','AAPG_CODE_ERG',gas_prod_str]] + .query("AAPG_CODE_ERG == '430'") + .assign(proxy_data=lambda df: df[gas_prod_str]) + .drop(columns=[gas_prod_str, 'AAPG_CODE_ERG']) + .rename(columns=lambda x: str(x).lower()) + .reset_index(drop=True) + ) + basin_430_prod_df = pd.concat([basin_430_prod_df,basin_430_prod_imonth]) + + # 
Delete unused temp data + del ng_data_temp + del ng_data_imonth_temp + del basin_430_prod_imonth + + # Calculate relative emissions and convert to a geodataframe + basin_430_prod_df = calc_enverus_rel_emi(basin_430_prod_df) + basin_430_prod_df = enverus_df_to_gdf(basin_430_prod_df) + + # NEI Data: + # No addition of NEI data because IL and IN are not in this basin. We are adding + # them to the "other" basin. + + # Check that relative emissions sum to 1.0 each state/year combination + sums = basin_430_prod_df.groupby(["state_code", "year"])["rel_emi"].sum() # get sums to check normalization + assert np.isclose(sums, 1.0, atol=1e-8).all(), f"Relative emissions do not sum to 1 for each year and state; {sums}" # assert that the sums are close to 1 + + # Output Proxy Parquet Files + basin_430_prod_df = basin_430_prod_df.astype({'year':str}) + basin_430_prod_df.to_parquet(basin_430_prod_output_path) + + return None diff --git a/gch4i/proxy_processing/task_ng_basin_other_prod_proxy.py b/gch4i/proxy_processing/task_ng_basin_other_prod_proxy.py new file mode 100644 index 0000000..2bb122d --- /dev/null +++ b/gch4i/proxy_processing/task_ng_basin_other_prod_proxy.py @@ -0,0 +1,177 @@ +# %% +from pathlib import Path +import os +from typing import Annotated +from zipfile import ZipFile +import calendar +import datetime + +from pyarrow import parquet +import pandas as pd +import osgeo +import geopandas as gpd +import numpy as np +import seaborn as sns +import shapefile as shp +from pytask import Product, task, mark + +from gch4i.config import ( + V3_DATA_PATH, + proxy_data_dir_path, + global_data_dir_path, + sector_data_dir_path, + max_year, + min_year, + years, +) + +from gch4i.utils import us_state_to_abbrev +from gch4i.proxy_processing.ng_oil_production_utils import ( + calc_enverus_rel_emi, + enverus_df_to_gdf, + nei_data_years, + get_nei_file_name, + ng_gas_prod_file_names, + get_raw_NEI_data, +) + +# %% +@mark.persist +@task(id="ng_basin_other_prod_proxy") +def task_get_ng_basin_other_prod_proxy_data( + state_path: Path = global_data_dir_path / "tl_2020_us_state.zip", + enverus_production_path: Path = sector_data_dir_path / "enverus/production", + intermediate_outputs_path: Path = enverus_production_path / "intermediate_outputs", + nei_path: Path = sector_data_dir_path / "nei_og", + basin_other_prod_output_path: Annotated[Path, Product] = proxy_data_dir_path / "ng_basin_other_prod_proxy.parquet", + ): + """ + Data come from Enverus, both Drilling Info and Prism + The reason 2 datasets are used is because Prism does not include all states + So remaining states, or those with more DI coverage are taken from DI + + DI: KS, MD, MI, MO, OK, TN + + Prism: AK, AL, AR, AZ, CA, CAO (California Offshore), CO, FL, KY, LA, MS, MT, ND, + NE, NGOM (federal offshore waters in the Gulf of Mexico), NM, NV, NY, OH, OR, PA, + SD, TX, UT, VA, WV, WY + + States with no Enverus Data: CT, DE, DC, GA, HI, ID, IL*, IN*, IA, ME, MA, MN, NH, + NJ, NC, RI, SC, VT, WA, WI, US territories. These states are assumed to have no oil + and gas production with an exception for IL and IN. + + *IL and IN do not report to Enverus, but do have oil and gas production. Production + data is taken from the Energy Information Administration (EIA). 
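(Editorial sketch: where NEI data are merged in, these tasks convert the NEI grid-cell polygons to centroid points by reprojecting to a projected CRS first, since centroids computed directly in lat/lon degrees are inaccurate. A self-contained sketch of that step, with one toy cell standing in for the real NEI reference grid geometry:)

    import geopandas as gpd
    from shapely.geometry import Polygon

    # One toy NEI grid cell; real cells come from the NEI reference grid shapefile.
    nei = gpd.GeoDataFrame(
        {"state_code": ["IL"], "rel_emi": [1.0]},
        geometry=[Polygon([(-89.0, 40.0), (-88.9, 40.0), (-88.9, 40.1), (-89.0, 40.1)])],
        crs=4326,
    )

    # Project before taking centroids, then return to lat/lon for output.
    nei_pts = nei.to_crs(3857)
    nei_pts["geometry"] = nei_pts.geometry.centroid
    nei_pts = nei_pts.to_crs(4326)

    assert (nei_pts.geometry.geom_type == "Point").all()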
+ + """ + + # Load in State ANSI data + state_gdf = ( + gpd.read_file(state_path) + .loc[:, ["NAME", "STATEFP", "STUSPS", "geometry"]] + .rename(columns=str.lower) + .rename(columns={"stusps": "state_code", "name": "state_name"}) + .astype({"statefp": int}) + # get only lower 48 + DC + .query("(statefp < 60) & (statefp != 2) & (statefp != 15)") + .reset_index(drop=True) + .to_crs(4326) + ) + + # Make Annual gridded arrays (maps) of well data (a well will be counted every month if there is any production that year) + # Includes NA Gas Wells and Production onshore in the CONUS region + # source emissions are related to the presence of a well and its production status (no emission if no production) + # Details: ERG does not include a well in the national count if there is no (cummulative) oil or gas production from that well. + # Wells are not considered active for a given year if there is no production data that year + # This may cause wells that are coadmpleted but not yet producing to be dropped from the national count. + # ERG has developed their own logic to determine if a well is an HF well or not and that result is included in the + # HF variable in this dataset. This method does not rely on the Enverus well 'Producing Status' + # Well Type (e.g., non-associated gas well) is determined based on annual production GOR at that well (CUM OIL/ CUM GAS), + # but the presence of a well will only be included in maps in months where monthly gas prod > 0 + + # Proxy Data Dataframes: + basin_other_prod_df = pd.DataFrame() # Gas well gas production in Other Basins in a given month + + ## Enverus DI and Prism Data: + # Read in and query formatted and corrrected Enverus data to create dictionaries of + # proxy data (Enverus data is from task_enverus_di_prism_data_processing.py) + for iyear in years: + enverus_file_name_iyear = f"formatted_raw_enverus_tempoutput_{iyear}.csv" + enverus_file_path_iyear = os.path.join(intermediate_outputs_path, enverus_file_name_iyear) + ng_data_temp = (pd.read_csv(enverus_file_path_iyear, dtype={3:'str', 'spud_year': str, 'first_prod_year': str}) + .query("STATE_CODE.isin(@state_gdf['state_code'])") + .query("OFFSHORE == 'N'") + .query("CUM_GAS > 0") + .assign(gas_to_oil_ratio=lambda df: df['CUM_GAS']/df['CUM_OIL']) + .assign(year=str(iyear)) + .replace(np.inf, 0) + .astype({"spud_year": str, "first_prod_year": str}) + .query("gas_to_oil_ratio > 100 | GOR_QUAL == 'Gas only'") + ) + + # Include wells in map only for months where there is gas production (emissions ~ when production is occuring) + for imonth in range(1,13): + imonth_str = f"{imonth:02}" # convert to 2-digit months + year_month_str = str(iyear)+'-'+imonth_str + gas_prod_str = 'GASPROD_'+imonth_str + # Onshore data for imonth + ng_data_imonth_temp = (ng_data_temp + .query(f"{gas_prod_str} > 0") + .assign(year_month=str(iyear)+'-'+imonth_str) + ) + ng_data_imonth_temp = (ng_data_imonth_temp[[ + 'year', 'year_month','STATE_CODE','AAPG_CODE_ERG','LATITUDE','LONGITUDE', + 'HF','WELL_COUNT',gas_prod_str, + 'comp_year_month','spud_year','first_prod_year']] + ) + # "Other" Basin Gas Production + basin_other_prod_imonth = (ng_data_imonth_temp[['year','year_month','STATE_CODE','LATITUDE','LONGITUDE','AAPG_CODE_ERG',gas_prod_str]] + .query("AAPG_CODE_ERG != '220' & AAPG_CODE_ERG != '395' & AAPG_CODE_ERG != '430'") + .assign(proxy_data=lambda df: df[gas_prod_str]) + .drop(columns=[gas_prod_str, 'AAPG_CODE_ERG']) + .rename(columns=lambda x: str(x).lower()) + .reset_index(drop=True) + ) + basin_other_prod_df = 
pd.concat([basin_other_prod_df,basin_other_prod_imonth]) + + # Delete unused temp data + del ng_data_temp + del ng_data_imonth_temp + del basin_other_prod_imonth + + # Calculate relative emissions and convert to a geodataframe + basin_other_prod_df = calc_enverus_rel_emi(basin_other_prod_df) + basin_other_prod_df = enverus_df_to_gdf(basin_other_prod_df) + + # NEI Data: + nei_df = pd.DataFrame() + + for iyear in years: + nei_data_year = nei_data_years[nei_data_years['year'] == iyear]['nei_data'].values[0] + # Gas Production + ifile_name = get_nei_file_name(nei_data_year, ng_gas_prod_file_names) + nei_iyear = get_raw_NEI_data(iyear, nei_data_year, ifile_name) + nei_df = pd.concat([nei_df, nei_iyear]) + + # Convert NEI Data to GDF and polygon to centroid point + nei_df = gpd.GeoDataFrame(nei_df, crs=4326) + nei_df = nei_df.to_crs(3857) # projected CRS for centroid calculation + nei_df.loc[:, 'geometry'] = nei_df.loc[:, 'geometry'].centroid + nei_df = nei_df.to_crs(4326) + + # Add NEI Data to Enverus Data + basin_other_prod_df = pd.concat([basin_other_prod_df, nei_df]).reset_index(drop=True) + + # Delete unused temp data + del nei_iyear + del nei_df + + # Check that relative emissions sum to 1.0 each state/year combination + sums = basin_other_prod_df.groupby(["state_code", "year"])["rel_emi"].sum() # get sums to check normalization + assert np.isclose(sums, 1.0, atol=1e-8).all(), f"Relative emissions do not sum to 1 for each year and state; {sums}" # assert that the sums are close to 1 + + # Output Proxy Parquet Files + basin_other_prod_df = basin_other_prod_df.astype({'year':str}) + basin_other_prod_df.to_parquet(basin_other_prod_output_path) + + return None diff --git a/gch4i/proxy_processing/task_ng_conv_well_comp_proxy.py b/gch4i/proxy_processing/task_ng_conv_well_comp_proxy.py new file mode 100644 index 0000000..78f41e5 --- /dev/null +++ b/gch4i/proxy_processing/task_ng_conv_well_comp_proxy.py @@ -0,0 +1,179 @@ +# %% +from pathlib import Path +import os +from typing import Annotated +from zipfile import ZipFile +import calendar +import datetime + +from pyarrow import parquet +import pandas as pd +import osgeo +import geopandas as gpd +import numpy as np +import seaborn as sns +import shapefile as shp +from pytask import Product, task, mark + +from gch4i.config import ( + V3_DATA_PATH, + proxy_data_dir_path, + global_data_dir_path, + sector_data_dir_path, + max_year, + min_year, + years, +) + +from gch4i.utils import us_state_to_abbrev +from gch4i.proxy_processing.ng_oil_production_utils import ( + calc_enverus_rel_emi, + enverus_df_to_gdf, + nei_data_years, + get_nei_file_name, + ng_comp_count_file_names, + get_raw_NEI_data, +) + +# %% +@mark.persist +@task(id="ng_conv_well_comp_proxy") +def task_get_ng_conv_well_comp_proxy_data( + state_path: Path = global_data_dir_path / "tl_2020_us_state.zip", + enverus_production_path: Path = sector_data_dir_path / "enverus/production", + intermediate_outputs_path: Path = enverus_production_path / "intermediate_outputs", + nei_path: Path = sector_data_dir_path / "nei_og", + conv_well_comp_output_path: Annotated[Path, Product] = proxy_data_dir_path / "ng_conv_well_comp_proxy.parquet", + ): + """ + Data come from Enverus, both Drilling Info and Prism + The reason 2 datasets are used is because Prism does not include all states + So remaining states, or those with more DI coverage are taken from DI + + DI: KS, MD, MI, MO, OK, TN + + Prism: AK, AL, AR, AZ, CA, CAO (California Offshore), CO, FL, KY, LA, MS, MT, ND, + NE, NGOM (federal offshore waters in 
the Gulf of Mexico), NM, NV, NY, OH, OR, PA, + SD, TX, UT, VA, WV, WY + + States with no Enverus Data: CT, DE, DC, GA, HI, ID, IL*, IN*, IA, ME, MA, MN, NH, + NJ, NC, RI, SC, VT, WA, WI, US territories. These states are assumed to have no oil + and gas production with an exception for IL and IN. + + *IL and IN do not report to Enverus, but do have oil and gas production. Production + data is taken from the Energy Information Administration (EIA). + + """ + + # Load in State ANSI data + state_gdf = ( + gpd.read_file(state_path) + .loc[:, ["NAME", "STATEFP", "STUSPS", "geometry"]] + .rename(columns=str.lower) + .rename(columns={"stusps": "state_code", "name": "state_name"}) + .astype({"statefp": int}) + # get only lower 48 + DC + .query("(statefp < 60) & (statefp != 2) & (statefp != 15)") + .reset_index(drop=True) + .to_crs(4326) + ) + + # Make Annual gridded arrays (maps) of well data (a well will be counted every month if there is any production that year) + # Includes NA Gas Wells and Production onshore in the CONUS region + # source emissions are related to the presence of a well and its production status (no emission if no production) + # Details: ERG does not include a well in the national count if there is no (cummulative) oil or gas production from that well. + # Wells are not considered active for a given year if there is no production data that year + # This may cause wells that are coadmpleted but not yet producing to be dropped from the national count. + # ERG has developed their own logic to determine if a well is an HF well or not and that result is included in the + # HF variable in this dataset. This method does not rely on the Enverus well 'Producing Status' + # Well Type (e.g., non-associated gas well) is determined based on annual production GOR at that well (CUM OIL/ CUM GAS), + # but the presence of a well will only be included in maps in months where monthly gas prod > 0 + + # Proxy Data Dataframes: + conv_well_comp_df = pd.DataFrame() # Conventional well completions + + ## Enverus DI and Prism Data: + # Read in and query formatted and corrrected Enverus data to create dictionaries of + # proxy data (Enverus data is from task_enverus_di_prism_data_processing.py) + for iyear in years: + enverus_file_name_iyear = f"formatted_raw_enverus_tempoutput_{iyear}.csv" + enverus_file_path_iyear = os.path.join(intermediate_outputs_path, enverus_file_name_iyear) + ng_data_temp = (pd.read_csv(enverus_file_path_iyear, dtype={3:'str', 'spud_year': str, 'first_prod_year': str, 'comp_year_month': str}) + .query("STATE_CODE.isin(@state_gdf['state_code'])") + .query("OFFSHORE == 'N'") + .query("CUM_GAS > 0") + .assign(gas_to_oil_ratio=lambda df: df['CUM_GAS']/df['CUM_OIL']) + .assign(year=str(iyear)) + .replace(np.inf, 0) + .astype({"spud_year": str, "first_prod_year": str, "comp_year_month": str}) + .query("gas_to_oil_ratio > 100 | GOR_QUAL == 'Gas only'") + ) + + # Include wells in map only for months where there is gas production (emissions ~ when production is occuring) + for imonth in range(1,13): + imonth_str = f"{imonth:02}" # convert to 2-digit months + year_month_str = str(iyear)+'-'+imonth_str + gas_prod_str = 'GASPROD_'+imonth_str + # Onshore data for imonth + ng_data_imonth_temp = (ng_data_temp + .query(f"{gas_prod_str} > 0") + .assign(year_month=str(iyear)+'-'+imonth_str) + ) + ng_data_imonth_temp = (ng_data_imonth_temp[[ + 'year', 'year_month','STATE_CODE','AAPG_CODE_ERG','LATITUDE','LONGITUDE', + 'HF','WELL_COUNT',gas_prod_str, + 
'comp_year_month','spud_year','first_prod_year']] + ) + # Conventional Well Completions + conv_well_comp_imonth = (ng_data_imonth_temp[['year','year_month','STATE_CODE','LATITUDE','LONGITUDE','WELL_COUNT','HF','comp_year_month']] + .query("HF != 'Y'") + .drop(columns=["HF"]) + .rename(columns=lambda x: str(x).lower()) + .rename(columns={"well_count":"proxy_data"}) + .query(f"comp_year_month == '{year_month_str}'") + .drop(columns=["comp_year_month"]) + .reset_index(drop=True) + ) + conv_well_comp_df = pd.concat([conv_well_comp_df,conv_well_comp_imonth]) + + # Delete unused temp data + del ng_data_temp + del ng_data_imonth_temp + del conv_well_comp_imonth + + # Calculate relative emissions and convert to a geodataframe + conv_well_comp_df = calc_enverus_rel_emi(conv_well_comp_df ) + conv_well_comp_df = enverus_df_to_gdf(conv_well_comp_df ) + + # NEI Data: + nei_df = pd.DataFrame() + + for iyear in years: + nei_data_year = nei_data_years[nei_data_years['year'] == iyear]['nei_data'].values[0] + # Well Count + ifile_name = get_nei_file_name(nei_data_year, ng_comp_count_file_names) + nei_iyear = get_raw_NEI_data(iyear, nei_data_year, ifile_name) + nei_df = pd.concat([nei_df, nei_iyear]) + + # Convert NEI Data to GDF and polygon to centroid point + nei_df = gpd.GeoDataFrame(nei_df, crs=4326) + nei_df = nei_df.to_crs(3857) # projected CRS for centroid calculation + nei_df.loc[:, 'geometry'] = nei_df.loc[:, 'geometry'].centroid + nei_df = nei_df.to_crs(4326) + + # Add NEI Data to Enverus Data + conv_well_comp_df = pd.concat([conv_well_comp_df, nei_df]).reset_index(drop=True) + + # Delete unused temp data + del nei_iyear + del nei_df + + # Check that relative emissions sum to 1.0 each state/year combination + sums = conv_well_comp_df.groupby(["state_code", "year"])["rel_emi"].sum() # get sums to check normalization + assert np.isclose(sums, 1.0, atol=1e-8).all(), f"Relative emissions do not sum to 1 for each year and state; {sums}" # assert that the sums are close to 1 + + # Output Proxy Parquet Files + conv_well_comp_df = conv_well_comp_df.astype({'year':str}) + conv_well_comp_df.to_parquet(conv_well_comp_output_path) + + return None diff --git a/gch4i/proxy_processing/task_ng_conv_well_count_proxy.py b/gch4i/proxy_processing/task_ng_conv_well_count_proxy.py new file mode 100644 index 0000000..bc8c4d4 --- /dev/null +++ b/gch4i/proxy_processing/task_ng_conv_well_count_proxy.py @@ -0,0 +1,176 @@ +# %% +from pathlib import Path +import os +from typing import Annotated +from zipfile import ZipFile +import calendar +import datetime + +from pyarrow import parquet +import pandas as pd +import osgeo +import geopandas as gpd +import numpy as np +import seaborn as sns +import shapefile as shp +from pytask import Product, task, mark + +from gch4i.config import ( + V3_DATA_PATH, + proxy_data_dir_path, + global_data_dir_path, + sector_data_dir_path, + max_year, + min_year, + years, +) + +from gch4i.utils import us_state_to_abbrev +from gch4i.proxy_processing.ng_oil_production_utils import ( + calc_enverus_rel_emi, + enverus_df_to_gdf, + nei_data_years, + get_nei_file_name, + ng_well_count_file_names, + get_raw_NEI_data, +) + +# %% +@mark.persist +@task(id="ng_conv_well_count_proxy") +def task_get_ng_conv_well_count_proxy_data( + state_path: Path = global_data_dir_path / "tl_2020_us_state.zip", + enverus_production_path: Path = sector_data_dir_path / "enverus/production", + intermediate_outputs_path: Path = enverus_production_path / "intermediate_outputs", + nei_path: Path = sector_data_dir_path / "nei_og", + 
conv_well_count_output_path: Annotated[Path, Product] = proxy_data_dir_path / "ng_conv_well_count_proxy.parquet", + ): + """ + Data come from Enverus, both Drilling Info and Prism + The reason 2 datasets are used is because Prism does not include all states + So remaining states, or those with more DI coverage are taken from DI + + DI: KS, MD, MI, MO, OK, TN + + Prism: AK, AL, AR, AZ, CA, CAO (California Offshore), CO, FL, KY, LA, MS, MT, ND, + NE, NGOM (federal offshore waters in the Gulf of Mexico), NM, NV, NY, OH, OR, PA, + SD, TX, UT, VA, WV, WY + + States with no Enverus Data: CT, DE, DC, GA, HI, ID, IL*, IN*, IA, ME, MA, MN, NH, + NJ, NC, RI, SC, VT, WA, WI, US territories. These states are assumed to have no oil + and gas production with an exception for IL and IN. + + *IL and IN do not report to Enverus, but do have oil and gas production. Production + data is taken from the Energy Information Administration (EIA). + + """ + + # Load in State ANSI data + state_gdf = ( + gpd.read_file(state_path) + .loc[:, ["NAME", "STATEFP", "STUSPS", "geometry"]] + .rename(columns=str.lower) + .rename(columns={"stusps": "state_code", "name": "state_name"}) + .astype({"statefp": int}) + # get only lower 48 + DC + .query("(statefp < 60) & (statefp != 2) & (statefp != 15)") + .reset_index(drop=True) + .to_crs(4326) + ) + + # Make Annual gridded arrays (maps) of well data (a well will be counted every month if there is any production that year) + # Includes NA Gas Wells and Production onshore in the CONUS region + # source emissions are related to the presence of a well and its production status (no emission if no production) + # Details: ERG does not include a well in the national count if there is no (cummulative) oil or gas production from that well. + # Wells are not considered active for a given year if there is no production data that year + # This may cause wells that are coadmpleted but not yet producing to be dropped from the national count. + # ERG has developed their own logic to determine if a well is an HF well or not and that result is included in the + # HF variable in this dataset. 
This method does not rely on the Enverus well 'Producing Status' + # Well Type (e.g., non-associated gas well) is determined based on annual production GOR at that well (CUM OIL/ CUM GAS), + # but the presence of a well will only be included in maps in months where monthly gas prod > 0 + + # Proxy Data Dataframes: + conv_well_count_df = pd.DataFrame() # Active conventional gas well counts in a given month + + # Read in and query formatted and corrrected Enverus data to create dictionaries of + # proxy data (Enverus data is from task_enverus_di_prism_data_processing.py) + for iyear in years: + enverus_file_name_iyear = f"formatted_raw_enverus_tempoutput_{iyear}.csv" + enverus_file_path_iyear = os.path.join(intermediate_outputs_path, enverus_file_name_iyear) + ng_data_temp = (pd.read_csv(enverus_file_path_iyear, dtype={3:'str', 'spud_year': str, 'first_prod_year': str}) + .query("STATE_CODE.isin(@state_gdf['state_code'])") + .query("OFFSHORE == 'N'") + .query("CUM_GAS > 0") + .assign(gas_to_oil_ratio=lambda df: df['CUM_GAS']/df['CUM_OIL']) + .assign(year=str(iyear)) + .replace(np.inf, 0) + .astype({"spud_year": str, "first_prod_year": str}) + .query("gas_to_oil_ratio > 100 | GOR_QUAL == 'Gas only'") + ) + + # Include wells in map only for months where there is gas production (emissions ~ when production is occuring) + for imonth in range(1,13): + imonth_str = f"{imonth:02}" # convert to 2-digit months + year_month_str = str(iyear)+'-'+imonth_str + gas_prod_str = 'GASPROD_'+imonth_str + # Onshore data for imonth + ng_data_imonth_temp = (ng_data_temp + .query(f"{gas_prod_str} > 0") + .assign(year_month=str(iyear)+'-'+imonth_str) + ) + ng_data_imonth_temp = (ng_data_imonth_temp[[ + 'year', 'year_month','STATE_CODE','AAPG_CODE_ERG','LATITUDE','LONGITUDE', + 'HF','WELL_COUNT',gas_prod_str, + 'comp_year_month','spud_year','first_prod_year']] + ) + # Conventional Gas Well Count + conv_well_count_imonth = (ng_data_imonth_temp[['year','year_month','STATE_CODE','LATITUDE','LONGITUDE','WELL_COUNT','HF']] + .query("HF != 'Y'") + .drop(columns=["HF"]) + .rename(columns=lambda x: str(x).lower()) + .rename(columns={"well_count":"proxy_data"}) + .reset_index(drop=True) + ) + conv_well_count_df = pd.concat([conv_well_count_df,conv_well_count_imonth]) + + # Delete unused temp data + del ng_data_temp + del ng_data_imonth_temp + del conv_well_count_imonth + + # Calculate relative emissions and convert to a geodataframe + conv_well_count_df = calc_enverus_rel_emi(conv_well_count_df) + conv_well_count_df = enverus_df_to_gdf(conv_well_count_df) + + # NEI Data: + nei_df = pd.DataFrame() + + for iyear in years: + nei_data_year = nei_data_years[nei_data_years['year'] == iyear]['nei_data'].values[0] + # Well Count + ifile_name = get_nei_file_name(nei_data_year, ng_well_count_file_names) + nei_iyear = get_raw_NEI_data(iyear, nei_data_year, ifile_name) + nei_df = pd.concat([nei_df, nei_iyear]) + + # Convert NEI Data to GDF and polygon to centroid point + nei_df = gpd.GeoDataFrame(nei_df, crs=4326) + nei_df = nei_df.to_crs(3857) # projected CRS for centroid calculation + nei_df.loc[:, 'geometry'] = nei_df.loc[:, 'geometry'].centroid + nei_df = nei_df.to_crs(4326) + + # Add NEI Data to Enverus Data + conv_well_count_df = pd.concat([conv_well_count_df, nei_df]).reset_index(drop=True) + + # Delete unused temp data + del nei_iyear + del nei_df + + # Check that relative emissions sum to 1.0 each state/year combination + sums = conv_well_count_df.groupby(["state_code", "year"])["rel_emi"].sum() # get sums to check 
normalization + assert np.isclose(sums, 1.0, atol=1e-8).all(), f"Relative emissions do not sum to 1 for each year and state; {sums}" # assert that the sums are close to 1 + + # Output Proxy Parquet Files + conv_well_count_df = conv_well_count_df.astype({'year':str}) + conv_well_count_df.to_parquet(conv_well_count_output_path) + + return None diff --git a/gch4i/proxy_processing/task_ng_drilled_well_proxy.py b/gch4i/proxy_processing/task_ng_drilled_well_proxy.py new file mode 100644 index 0000000..c85236c --- /dev/null +++ b/gch4i/proxy_processing/task_ng_drilled_well_proxy.py @@ -0,0 +1,180 @@ +# %% +from pathlib import Path +import os +from typing import Annotated +from zipfile import ZipFile +import calendar +import datetime + +from pyarrow import parquet +import pandas as pd +import osgeo +import geopandas as gpd +import numpy as np +import seaborn as sns +import shapefile as shp +from pytask import Product, task, mark + +from gch4i.config import ( + V3_DATA_PATH, + proxy_data_dir_path, + global_data_dir_path, + sector_data_dir_path, + max_year, + min_year, + years, +) + +from gch4i.utils import us_state_to_abbrev +from gch4i.proxy_processing.ng_oil_production_utils import ( + calc_enverus_rel_emi, + enverus_df_to_gdf, + nei_data_years, + get_nei_file_name, + ng_spud_count_file_names, + get_raw_NEI_data, +) + +# %% +@mark.persist +@task(id="ng_drilled_well_proxy") +def task_get_ng_drilled_well_proxy_data( + state_path: Path = global_data_dir_path / "tl_2020_us_state.zip", + enverus_production_path: Path = sector_data_dir_path / "enverus/production", + intermediate_outputs_path: Path = enverus_production_path / "intermediate_outputs", + nei_path: Path = sector_data_dir_path / "nei_og", + drilled_well_output_path: Annotated[Path, Product] = proxy_data_dir_path / "ng_drilled_well_proxy.parquet", + ): + """ + Data come from Enverus, both Drilling Info and Prism + The reason 2 datasets are used is because Prism does not include all states + So remaining states, or those with more DI coverage are taken from DI + + DI: KS, MD, MI, MO, OK, TN + + Prism: AK, AL, AR, AZ, CA, CAO (California Offshore), CO, FL, KY, LA, MS, MT, ND, + NE, NGOM (federal offshore waters in the Gulf of Mexico), NM, NV, NY, OH, OR, PA, + SD, TX, UT, VA, WV, WY + + States with no Enverus Data: CT, DE, DC, GA, HI, ID, IL*, IN*, IA, ME, MA, MN, NH, + NJ, NC, RI, SC, VT, WA, WI, US territories. These states are assumed to have no oil + and gas production with an exception for IL and IN. + + *IL and IN do not report to Enverus, but do have oil and gas production. Production + data is taken from the Energy Information Administration (EIA). + + """ + + # Load in State ANSI data + state_gdf = ( + gpd.read_file(state_path) + .loc[:, ["NAME", "STATEFP", "STUSPS", "geometry"]] + .rename(columns=str.lower) + .rename(columns={"stusps": "state_code", "name": "state_name"}) + .astype({"statefp": int}) + # get only lower 48 + DC + .query("(statefp < 60) & (statefp != 2) & (statefp != 15)") + .reset_index(drop=True) + .to_crs(4326) + ) + + # Make Annual gridded arrays (maps) of well data (a well will be counted every month if there is any production that year) + # Includes NA Gas Wells and Production onshore in the CONUS region + # source emissions are related to the presence of a well and its production status (no emission if no production) + # Details: ERG does not include a well in the national count if there is no (cummulative) oil or gas production from that well. 
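+    #   (Hypothetical illustration: a well that reports zero gas production for every
+    #   month of the year has CUM_GAS = 0, so the "CUM_GAS > 0" filter below drops it
+    #   and it contributes neither a well count nor production to the proxy maps.)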
+    # Wells are not considered active for a given year if there is no production data that year.
+    # This may cause wells that are completed but not yet producing to be dropped from the national count.
+    # ERG has developed their own logic to determine whether a well is an HF well, and that result is included in the
+    # HF variable in this dataset. This method does not rely on the Enverus well 'Producing Status'.
+    # Well type (e.g., non-associated gas well) is determined based on the annual production GOR at that well (CUM_GAS / CUM_OIL),
+    # but the presence of a well is only included in maps for months where monthly gas production > 0.
+
+    # Proxy Data Dataframes:
+    drilled_well_df = pd.DataFrame()  # Gas wells drilled
+
+    ## Enverus DI and Prism Data:
+    # Read in and query formatted and corrected Enverus data to create dataframes of
+    # proxy data (Enverus data is from task_enverus_di_prism_data_processing.py)
+    for iyear in years:
+        enverus_file_name_iyear = f"formatted_raw_enverus_tempoutput_{iyear}.csv"
+        enverus_file_path_iyear = os.path.join(intermediate_outputs_path, enverus_file_name_iyear)
+        ng_data_temp = (pd.read_csv(enverus_file_path_iyear, dtype={3: 'str', 'spud_year': str, 'first_prod_year': str})
+                        .query("STATE_CODE.isin(@state_gdf['state_code'])")
+                        .query("OFFSHORE == 'N'")
+                        .query("CUM_GAS > 0")
+                        .assign(gas_to_oil_ratio=lambda df: df['CUM_GAS'] / df['CUM_OIL'])
+                        .assign(year=str(iyear))
+                        .replace(np.inf, 0)
+                        .astype({"spud_year": str, "first_prod_year": str})
+                        .query("gas_to_oil_ratio > 100 | GOR_QUAL == 'Gas only'")
+                        )
+
+        # Include wells in the map only for months with gas production (emissions ~ when production is occurring)
+        for imonth in range(1, 13):
+            imonth_str = f"{imonth:02}"  # convert to 2-digit months
+            year_month_str = str(iyear) + '-' + imonth_str
+            gas_prod_str = 'GASPROD_' + imonth_str
+            # Onshore data for imonth
+            ng_data_imonth_temp = (ng_data_temp
+                                   .query(f"{gas_prod_str} > 0")
+                                   .assign(year_month=year_month_str)
+                                   )
+            ng_data_imonth_temp = (ng_data_imonth_temp[[
+                'year', 'year_month', 'STATE_CODE', 'AAPG_CODE_ERG', 'LATITUDE', 'LONGITUDE',
+                'HF', 'WELL_COUNT', gas_prod_str,
+                'comp_year_month', 'spud_year', 'first_prod_year']]
+                )
+            # Drilled gas wells
+            drilled_well_imonth = (ng_data_imonth_temp[['year', 'year_month', 'STATE_CODE', 'LATITUDE', 'LONGITUDE', 'WELL_COUNT', 'HF', 'spud_year', 'first_prod_year']]
+                                   .rename(columns=lambda x: str(x).lower())
+                                   .rename(columns={"well_count": "proxy_data"})
+                                   # wells with a spud date or first production date in the current year
+                                   .query(f"spud_year == '{iyear}' | first_prod_year == '{iyear}'")
+                                   # keep wells with spud_year == iyear or, if there is no spud date, first_prod_year == iyear
+                                   .query(f"spud_year == '{iyear}' | spud_year == 'NaN' | spud_year == 'nan'")
+                                   .drop(columns=['hf', 'spud_year', 'first_prod_year'])
+                                   .reset_index(drop=True)
+                                   )
+            drilled_well_df = pd.concat([drilled_well_df, drilled_well_imonth])
+
+    # Delete unused temp data
+    del ng_data_temp
+    del ng_data_imonth_temp
+    del drilled_well_imonth
+
+    # Calculate relative emissions and convert to a geodataframe
+    drilled_well_df = calc_enverus_rel_emi(drilled_well_df)
+    drilled_well_df = enverus_df_to_gdf(drilled_well_df)
+
+    # NEI Data:
+    nei_df = pd.DataFrame()
+
+    for iyear in years:
+        nei_data_year = nei_data_years[nei_data_years['year'] == iyear]['nei_data'].values[0]
+        # Spud (drilled well) count
+        ifile_name = get_nei_file_name(nei_data_year, ng_spud_count_file_names)
+        nei_iyear = get_raw_NEI_data(iyear, nei_data_year, ifile_name)
+        nei_df = pd.concat([nei_df, 
nei_iyear]) + + # Convert NEI Data to GDF and polygon to centroid point + nei_df = gpd.GeoDataFrame(nei_df, crs=4326) + nei_df = nei_df.to_crs(3857) # projected CRS for centroid calculation + nei_df.loc[:, 'geometry'] = nei_df.loc[:, 'geometry'].centroid + nei_df = nei_df.to_crs(4326) + + # Add NEI Data to Enverus Data + drilled_well_df = pd.concat([drilled_well_df, nei_df]).reset_index(drop=True) + + # Delete unused temp data + del nei_iyear + del nei_df + + # Check that relative emissions sum to 1.0 each state/year combination + sums = drilled_well_df.groupby(["state_code", "year"])["rel_emi"].sum() # get sums to check normalization + assert np.isclose(sums, 1.0, atol=1e-8).all(), f"Relative emissions do not sum to 1 for each year and state; {sums}" # assert that the sums are close to 1 + + # Output Proxy Parquet Files + drilled_well_df = drilled_well_df.astype({'year':str}) + drilled_well_df.to_parquet(drilled_well_output_path) + + return None diff --git a/gch4i/proxy_processing/task_ng_hf_well_comp_proxy.py b/gch4i/proxy_processing/task_ng_hf_well_comp_proxy.py new file mode 100644 index 0000000..7580554 --- /dev/null +++ b/gch4i/proxy_processing/task_ng_hf_well_comp_proxy.py @@ -0,0 +1,179 @@ +# %% +from pathlib import Path +import os +from typing import Annotated +from zipfile import ZipFile +import calendar +import datetime + +from pyarrow import parquet +import pandas as pd +import osgeo +import geopandas as gpd +import numpy as np +import seaborn as sns +import shapefile as shp +from pytask import Product, task, mark + +from gch4i.config import ( + V3_DATA_PATH, + proxy_data_dir_path, + global_data_dir_path, + sector_data_dir_path, + max_year, + min_year, + years, +) + +from gch4i.utils import us_state_to_abbrev +from gch4i.proxy_processing.ng_oil_production_utils import ( + calc_enverus_rel_emi, + enverus_df_to_gdf, + nei_data_years, + get_nei_file_name, + ng_comp_count_file_names, + get_raw_NEI_data, +) + +# %% +@mark.persist +@task(id="ng_hf_well_comp_proxy") +def task_get_ng_hf_well_comp_proxy_data( + state_path: Path = global_data_dir_path / "tl_2020_us_state.zip", + enverus_production_path: Path = sector_data_dir_path / "enverus/production", + intermediate_outputs_path: Path = enverus_production_path / "intermediate_outputs", + nei_path: Path = sector_data_dir_path / "nei_og", + hf_well_comp_output_path: Annotated[Path, Product] = proxy_data_dir_path / "ng_hf_well_comp_proxy.parquet", + ): + """ + Data come from Enverus, both Drilling Info and Prism + The reason 2 datasets are used is because Prism does not include all states + So remaining states, or those with more DI coverage are taken from DI + + DI: KS, MD, MI, MO, OK, TN + + Prism: AK, AL, AR, AZ, CA, CAO (California Offshore), CO, FL, KY, LA, MS, MT, ND, + NE, NGOM (federal offshore waters in the Gulf of Mexico), NM, NV, NY, OH, OR, PA, + SD, TX, UT, VA, WV, WY + + States with no Enverus Data: CT, DE, DC, GA, HI, ID, IL*, IN*, IA, ME, MA, MN, NH, + NJ, NC, RI, SC, VT, WA, WI, US territories. These states are assumed to have no oil + and gas production with an exception for IL and IN. + + *IL and IN do not report to Enverus, but do have oil and gas production. Production + data is taken from the Energy Information Administration (EIA). 
+ + """ + + # Load in State ANSI data + state_gdf = ( + gpd.read_file(state_path) + .loc[:, ["NAME", "STATEFP", "STUSPS", "geometry"]] + .rename(columns=str.lower) + .rename(columns={"stusps": "state_code", "name": "state_name"}) + .astype({"statefp": int}) + # get only lower 48 + DC + .query("(statefp < 60) & (statefp != 2) & (statefp != 15)") + .reset_index(drop=True) + .to_crs(4326) + ) + + # Make Annual gridded arrays (maps) of well data (a well will be counted every month if there is any production that year) + # Includes NA Gas Wells and Production onshore in the CONUS region + # source emissions are related to the presence of a well and its production status (no emission if no production) + # Details: ERG does not include a well in the national count if there is no (cummulative) oil or gas production from that well. + # Wells are not considered active for a given year if there is no production data that year + # This may cause wells that are coadmpleted but not yet producing to be dropped from the national count. + # ERG has developed their own logic to determine if a well is an HF well or not and that result is included in the + # HF variable in this dataset. This method does not rely on the Enverus well 'Producing Status' + # Well Type (e.g., non-associated gas well) is determined based on annual production GOR at that well (CUM OIL/ CUM GAS), + # but the presence of a well will only be included in maps in months where monthly gas prod > 0 + + # Proxy Data Dataframes: + hf_well_comp_df = pd.DataFrame() # HF well completions + + ## Enverus DI and Prism Data: + # Read in and query formatted and corrrected Enverus data to create dictionaries of + # proxy data (Enverus data is from task_enverus_di_prism_data_processing.py) + for iyear in years: + enverus_file_name_iyear = f"formatted_raw_enverus_tempoutput_{iyear}.csv" + enverus_file_path_iyear = os.path.join(intermediate_outputs_path, enverus_file_name_iyear) + ng_data_temp = (pd.read_csv(enverus_file_path_iyear, dtype={3:'str', 'spud_year': str, 'first_prod_year': str, 'comp_year_month': str}) + .query("STATE_CODE.isin(@state_gdf['state_code'])") + .query("OFFSHORE == 'N'") + .query("CUM_GAS > 0") + .assign(gas_to_oil_ratio=lambda df: df['CUM_GAS']/df['CUM_OIL']) + .assign(year=str(iyear)) + .replace(np.inf, 0) + .astype({"spud_year": str, "first_prod_year": str, "comp_year_month": str}) + .query("gas_to_oil_ratio > 100 | GOR_QUAL == 'Gas only'") + ) + + # Include wells in map only for months where there is gas production (emissions ~ when production is occuring) + for imonth in range(1,13): + imonth_str = f"{imonth:02}" # convert to 2-digit months + year_month_str = str(iyear)+'-'+imonth_str + gas_prod_str = 'GASPROD_'+imonth_str + # Onshore data for imonth + ng_data_imonth_temp = (ng_data_temp + .query(f"{gas_prod_str} > 0") + .assign(year_month=str(iyear)+'-'+imonth_str) + ) + ng_data_imonth_temp = (ng_data_imonth_temp[[ + 'year', 'year_month','STATE_CODE','AAPG_CODE_ERG','LATITUDE','LONGITUDE', + 'HF','WELL_COUNT',gas_prod_str, + 'comp_year_month','spud_year','first_prod_year']] + ) + # HF Well Completions + hf_well_comp_imonth = (ng_data_imonth_temp[['year','year_month','STATE_CODE','LATITUDE','LONGITUDE','WELL_COUNT','HF','comp_year_month']] + .query("HF == 'Y'") + .drop(columns=["HF"]) + .rename(columns=lambda x: str(x).lower()) + .rename(columns={"well_count":"proxy_data"}) + .query(f"comp_year_month == '{year_month_str}'") + .drop(columns=["comp_year_month"]) + .reset_index(drop=True) + ) + hf_well_comp_df = 
pd.concat([hf_well_comp_df,hf_well_comp_imonth]) + + # Delete unused temp data + del ng_data_temp + del ng_data_imonth_temp + del hf_well_comp_imonth + + # Calculate relative emissions and convert to a geodataframe + hf_well_comp_df = calc_enverus_rel_emi(hf_well_comp_df ) + hf_well_comp_df = enverus_df_to_gdf(hf_well_comp_df ) + + # NEI Data: + nei_df = pd.DataFrame() + + for iyear in years: + nei_data_year = nei_data_years[nei_data_years['year'] == iyear]['nei_data'].values[0] + # Well Count + ifile_name = get_nei_file_name(nei_data_year, ng_comp_count_file_names) + nei_iyear = get_raw_NEI_data(iyear, nei_data_year, ifile_name) + nei_df = pd.concat([nei_df, nei_iyear]) + + # Convert NEI Data to GDF and polygon to centroid point + nei_df = gpd.GeoDataFrame(nei_df, crs=4326) + nei_df = nei_df.to_crs(3857) # projected CRS for centroid calculation + nei_df.loc[:, 'geometry'] = nei_df.loc[:, 'geometry'].centroid + nei_df = nei_df.to_crs(4326) + + # Add NEI Data to Enverus Data + hf_well_comp_df = pd.concat([hf_well_comp_df, nei_df]).reset_index(drop=True) + + # Delete unused temp data + del nei_iyear + del nei_df + + # Check that relative emissions sum to 1.0 each state/year combination + sums = hf_well_comp_df.groupby(["state_code", "year"])["rel_emi"].sum() # get sums to check normalization + assert np.isclose(sums, 1.0, atol=1e-8).all(), f"Relative emissions do not sum to 1 for each year and state; {sums}" # assert that the sums are close to 1 + + # Output Proxy Parquet Files + hf_well_comp_df = hf_well_comp_df.astype({'year':str}) + hf_well_comp_df.to_parquet(hf_well_comp_output_path) + + return None diff --git a/gch4i/proxy_processing/task_ng_hf_well_count_proxy.py b/gch4i/proxy_processing/task_ng_hf_well_count_proxy.py new file mode 100644 index 0000000..a29d6b8 --- /dev/null +++ b/gch4i/proxy_processing/task_ng_hf_well_count_proxy.py @@ -0,0 +1,177 @@ +# %% +from pathlib import Path +import os +from typing import Annotated +from zipfile import ZipFile +import calendar +import datetime + +from pyarrow import parquet +import pandas as pd +import osgeo +import geopandas as gpd +import numpy as np +import seaborn as sns +import shapefile as shp +from pytask import Product, task, mark + +from gch4i.config import ( + V3_DATA_PATH, + proxy_data_dir_path, + global_data_dir_path, + sector_data_dir_path, + max_year, + min_year, + years, +) + +from gch4i.utils import us_state_to_abbrev +from gch4i.proxy_processing.ng_oil_production_utils import ( + calc_enverus_rel_emi, + enverus_df_to_gdf, + nei_data_years, + get_nei_file_name, + ng_well_count_file_names, + get_raw_NEI_data, +) + +# %% +@mark.persist +@task(id="ng_hf_well_count_proxy") +def task_get_ng_hf_well_count_proxy_data( + state_path: Path = global_data_dir_path / "tl_2020_us_state.zip", + enverus_production_path: Path = sector_data_dir_path / "enverus/production", + intermediate_outputs_path: Path = enverus_production_path / "intermediate_outputs", + nei_path: Path = sector_data_dir_path / "nei_og", + hf_well_count_output_path: Annotated[Path, Product] = proxy_data_dir_path / "ng_hf_well_count_proxy.parquet", + ): + """ + Data come from Enverus, both Drilling Info and Prism + The reason 2 datasets are used is because Prism does not include all states + So remaining states, or those with more DI coverage are taken from DI + + DI: KS, MD, MI, MO, OK, TN + + Prism: AK, AL, AR, AZ, CA, CAO (California Offshore), CO, FL, KY, LA, MS, MT, ND, + NE, NGOM (federal offshore waters in the Gulf of Mexico), NM, NV, NY, OH, OR, PA, + SD, TX, UT, VA, 
WV, WY + + States with no Enverus Data: CT, DE, DC, GA, HI, ID, IL*, IN*, IA, ME, MA, MN, NH, + NJ, NC, RI, SC, VT, WA, WI, US territories. These states are assumed to have no oil + and gas production with an exception for IL and IN. + + *IL and IN do not report to Enverus, but do have oil and gas production. Production + data is taken from the Energy Information Administration (EIA). + + """ + + # Load in State ANSI data + state_gdf = ( + gpd.read_file(state_path) + .loc[:, ["NAME", "STATEFP", "STUSPS", "geometry"]] + .rename(columns=str.lower) + .rename(columns={"stusps": "state_code", "name": "state_name"}) + .astype({"statefp": int}) + # get only lower 48 + DC + .query("(statefp < 60) & (statefp != 2) & (statefp != 15)") + .reset_index(drop=True) + .to_crs(4326) + ) + + # Make Annual gridded arrays (maps) of well data (a well will be counted every month if there is any production that year) + # Includes NA Gas Wells and Production onshore in the CONUS region + # source emissions are related to the presence of a well and its production status (no emission if no production) + # Details: ERG does not include a well in the national count if there is no (cummulative) oil or gas production from that well. + # Wells are not considered active for a given year if there is no production data that year + # This may cause wells that are coadmpleted but not yet producing to be dropped from the national count. + # ERG has developed their own logic to determine if a well is an HF well or not and that result is included in the + # HF variable in this dataset. This method does not rely on the Enverus well 'Producing Status' + # Well Type (e.g., non-associated gas well) is determined based on annual production GOR at that well (CUM OIL/ CUM GAS), + # but the presence of a well will only be included in maps in months where monthly gas prod > 0 + + # Proxy Data Dataframes: + hf_well_count_df = pd.DataFrame() # Active HF gas well counts in a given month + + # Read in and query formatted and corrrected Enverus data to create dictionaries of + # proxy data (Enverus data is from task_enverus_di_prism_data_processing.py) + for iyear in years: + enverus_file_name_iyear = f"formatted_raw_enverus_tempoutput_{iyear}.csv" + enverus_file_path_iyear = os.path.join(intermediate_outputs_path, enverus_file_name_iyear) + ng_data_temp = (pd.read_csv(enverus_file_path_iyear, dtype={3:'str', 'spud_year': str, 'first_prod_year': str}) + .query("STATE_CODE.isin(@state_gdf['state_code'])") + .query("OFFSHORE == 'N'") + .query("CUM_GAS > 0") + .assign(gas_to_oil_ratio=lambda df: df['CUM_GAS']/df['CUM_OIL']) + .assign(year=str(iyear)) + .replace(np.inf, 0) + .astype({"spud_year": str, "first_prod_year": str}) + .query("gas_to_oil_ratio > 100 | GOR_QUAL == 'Gas only'") + ) + + # Include wells in map only for months where there is gas production (emissions ~ when production is occuring) + for imonth in range(1,13): + imonth_str = f"{imonth:02}" # convert to 2-digit months + year_month_str = str(iyear)+'-'+imonth_str + gas_prod_str = 'GASPROD_'+imonth_str + # Onshore data for imonth + ng_data_imonth_temp = (ng_data_temp + .query(f"{gas_prod_str} > 0") + .assign(year_month=str(iyear)+'-'+imonth_str) + ) + ng_data_imonth_temp = (ng_data_imonth_temp[[ + 'year', 'year_month','STATE_CODE','AAPG_CODE_ERG','LATITUDE','LONGITUDE', + 'HF','WELL_COUNT',gas_prod_str, + 'comp_year_month','spud_year','first_prod_year']] + ) + # HF Gas Well Count + hf_well_count_imonth = 
(ng_data_imonth_temp[['year','year_month','STATE_CODE','LATITUDE','LONGITUDE','WELL_COUNT','HF']] + .query("HF == 'Y'") + .drop(columns=["HF"]) + .rename(columns=lambda x: str(x).lower()) + .rename(columns={"well_count":"proxy_data"}) + .reset_index(drop=True) + ) + hf_well_count_df = pd.concat([hf_well_count_df,hf_well_count_imonth]) + + # Delete unused temp data + del ng_data_temp + del ng_data_imonth_temp + del hf_well_count_imonth + + # Calculate relative emissions and convert to a geodataframe + hf_well_count_df = calc_enverus_rel_emi(hf_well_count_df) + hf_well_count_df = enverus_df_to_gdf(hf_well_count_df) + + # NEI Data: + nei_df = pd.DataFrame() + + for iyear in years: + nei_data_year = nei_data_years[nei_data_years['year'] == iyear]['nei_data'].values[0] + # Well Count + ifile_name = get_nei_file_name(nei_data_year, ng_well_count_file_names) + nei_iyear = get_raw_NEI_data(iyear, nei_data_year, ifile_name) + nei_df = pd.concat([nei_df, nei_iyear]) + + # Convert NEI Data to GDF and polygon to centroid point + nei_df = gpd.GeoDataFrame(nei_df, crs=4326) + nei_df = nei_df.to_crs(3857) # projected CRS for centroid calculation + nei_df.loc[:, 'geometry'] = nei_df.loc[:, 'geometry'].centroid + nei_df = nei_df.to_crs(4326) + + # Add NEI Data to Enverus Data + hf_well_count_df = pd.concat([hf_well_count_df, nei_df]).reset_index(drop=True) + + # Delete unused temp data + del nei_iyear + del nei_df + + # Check that relative emissions sum to 1.0 each state/year combination + sums = hf_well_count_df.groupby(["state_code", "year"])["rel_emi"].sum() # get sums to check normalization + assert np.isclose(sums, 1.0, atol=1e-8).all(), f"Relative emissions do not sum to 1 for each year and state; {sums}" # assert that the sums are close to 1 + + # Output Proxy Parquet Files + hf_well_count_df = hf_well_count_df.astype({'year':str}) + hf_well_count_df.to_parquet(hf_well_count_output_path) + + return None + diff --git a/gch4i/proxy_processing/federal_gom_offshore_proxy.py b/gch4i/proxy_processing/task_ng_oil_federal_gom_offshore_proxy.py similarity index 99% rename from gch4i/proxy_processing/federal_gom_offshore_proxy.py rename to gch4i/proxy_processing/task_ng_oil_federal_gom_offshore_proxy.py index 31468c8..0a34c88 100644 --- a/gch4i/proxy_processing/federal_gom_offshore_proxy.py +++ b/gch4i/proxy_processing/task_ng_oil_federal_gom_offshore_proxy.py @@ -29,8 +29,8 @@ # %% @mark.persist -@task(id="federal_gom_offshore_proxy") -def task_get_federal_gom_offshore_proxy_data( +@task(id="ng_oil_federal_gom_offshore_proxy") +def task_get_ng_oil_federal_gom_offshore_proxy_data( state_path: Path = global_data_dir_path / "tl_2020_us_state.zip", GOADS_11_path: Path = sector_data_dir_path / "boem" / "2011_Gulfwide_Platform_Inventory.accdb", GOADS_14_path: Path = sector_data_dir_path / "boem" / "2014_Gulfwide_Platform_Inventory.accdb", diff --git a/gch4i/proxy_processing/task_ng_oil_state_gom_offshore_proxy.py b/gch4i/proxy_processing/task_ng_oil_state_gom_offshore_proxy.py new file mode 100644 index 0000000..ddf0923 --- /dev/null +++ b/gch4i/proxy_processing/task_ng_oil_state_gom_offshore_proxy.py @@ -0,0 +1,278 @@ +# %% +from pathlib import Path +import os +from typing import Annotated + +from pyarrow import parquet +import pandas as pd +import geopandas as gpd +import numpy as np +from pytask import Product, task, mark + +from gch4i.config import ( + global_data_dir_path, + sector_data_dir_path, + proxy_data_dir_path, + min_year, + max_year, + years, +) +from gch4i.proxy_processing.ng_oil_production_utils 
import (
+    calc_enverus_rel_emi,
+    enverus_df_to_gdf,
+)
+
+# %%
+@mark.persist
+@task(id="ng_oil_state_gom_offshore_proxy")
+def task_get_ng_oil_state_gom_offshore_proxy_data(
+    state_path: Path = global_data_dir_path / "tl_2020_us_state.zip",
+    enverus_production_path: Path = sector_data_dir_path / "enverus/production",
+    enverus_well_counts_path: Path = sector_data_dir_path / "enverus/production/temp_data_v2/Enverus DrillingInfo Processing - Well Counts_2021-03-17.xlsx",
+    intermediate_outputs_path: Path = enverus_production_path / "intermediate_outputs",
+    oil_state_gom_offshore_output_path: Annotated[Path, Product] = proxy_data_dir_path / "oil_state_gom_offshore_well_count_proxy.parquet",
+    oil_pac_fed_state_output_path: Annotated[Path, Product] = proxy_data_dir_path / "oil_pac_fed_state_proxy.parquet",
+    ng_state_gom_offshore_output_path: Annotated[Path, Product] = proxy_data_dir_path / "ng_state_gom_offshore_well_count_proxy.parquet",
+    ):
+    """
+    Data come from Enverus Prism only. Drilling Info data are not used for the
+    offshore well data because DI was only used for KS, MD, MI, MO, OK, and TN,
+    none of which are in the offshore region of the U.S.
+
+    States to produce offshore data: AL, CA, CAO (California Offshore), FL, LA, MS, TX.
+    Note that there is no Enverus Prism data for FL and MS.
+
+    """
+
+    # Well and Production Data (from Enverus)
+    # Read in and format Prism data
+    # 1. Read data
+    # 2. Drop unused columns, rename columns
+    # 3. Calculate cumulative annual production totals
+    # 4. Save the data as a year-specific variable
+
+    # Based on ERG's logic, active wells are determined by their production levels, not their producing status
+    Enverus_data_dict = {}
+
+    for iyear in years:
+        # Prism Data
+        Prism_file_name = f"prism_monthly_wells_offshore_{iyear}.xlsx"
+        Prism_file_path = os.path.join(enverus_production_path, Prism_file_name)
+        Prism_data = (pd.read_excel(
+            Prism_file_path,
+            usecols={'STATE', 'LATITUDE', 'LONGITUDE', 'OFFSHORE', 'GOR_QUAL',
+                     'LIQUIDSPROD_BBL_01', 'GASPROD_MCF_01', 'WATERPROD_BBL_01',
+                     'LIQUIDSPROD_BBL_02', 'GASPROD_MCF_02', 'WATERPROD_BBL_02',
+                     'LIQUIDSPROD_BBL_03', 'GASPROD_MCF_03', 'WATERPROD_BBL_03',
+                     'LIQUIDSPROD_BBL_04', 'GASPROD_MCF_04', 'WATERPROD_BBL_04',
+                     'LIQUIDSPROD_BBL_05', 'GASPROD_MCF_05', 'WATERPROD_BBL_05',
+                     'LIQUIDSPROD_BBL_06', 'GASPROD_MCF_06', 'WATERPROD_BBL_06',
+                     'LIQUIDSPROD_BBL_07', 'GASPROD_MCF_07', 'WATERPROD_BBL_07',
+                     'LIQUIDSPROD_BBL_08', 'GASPROD_MCF_08', 'WATERPROD_BBL_08',
+                     'LIQUIDSPROD_BBL_09', 'GASPROD_MCF_09', 'WATERPROD_BBL_09',
+                     'LIQUIDSPROD_BBL_10', 'GASPROD_MCF_10', 'WATERPROD_BBL_10',
+                     'LIQUIDSPROD_BBL_11', 'GASPROD_MCF_11', 'WATERPROD_BBL_11',
+                     'LIQUIDSPROD_BBL_12', 'GASPROD_MCF_12', 'WATERPROD_BBL_12',
+                     })
+            .rename(columns={'STATE': 'STATE_CODE',
+                             'LIQUIDSPROD_BBL_01': 'OILPROD_01', 'GASPROD_MCF_01': 'GASPROD_01', 'WATERPROD_BBL_01': 'WATERPROD_01',
+                             'LIQUIDSPROD_BBL_02': 'OILPROD_02', 'GASPROD_MCF_02': 'GASPROD_02', 'WATERPROD_BBL_02': 'WATERPROD_02',
+                             'LIQUIDSPROD_BBL_03': 'OILPROD_03', 'GASPROD_MCF_03': 'GASPROD_03', 'WATERPROD_BBL_03': 'WATERPROD_03',
+                             'LIQUIDSPROD_BBL_04': 'OILPROD_04', 'GASPROD_MCF_04': 'GASPROD_04', 'WATERPROD_BBL_04': 'WATERPROD_04',
+                             'LIQUIDSPROD_BBL_05': 'OILPROD_05', 'GASPROD_MCF_05': 'GASPROD_05', 'WATERPROD_BBL_05': 'WATERPROD_05',
+                             'LIQUIDSPROD_BBL_06': 'OILPROD_06', 'GASPROD_MCF_06': 'GASPROD_06', 'WATERPROD_BBL_06': 'WATERPROD_06',
+                             'LIQUIDSPROD_BBL_07': 'OILPROD_07', 'GASPROD_MCF_07': 'GASPROD_07', 'WATERPROD_BBL_07': 'WATERPROD_07',
+                             'LIQUIDSPROD_BBL_08': 'OILPROD_08', 'GASPROD_MCF_08': 'GASPROD_08', 'WATERPROD_BBL_08': 'WATERPROD_08',
+                             'LIQUIDSPROD_BBL_09': 'OILPROD_09', 'GASPROD_MCF_09': 'GASPROD_09', 'WATERPROD_BBL_09': 'WATERPROD_09',
+                             'LIQUIDSPROD_BBL_10': 'OILPROD_10', 'GASPROD_MCF_10': 'GASPROD_10', 'WATERPROD_BBL_10': 'WATERPROD_10',
+                             'LIQUIDSPROD_BBL_11': 'OILPROD_11', 'GASPROD_MCF_11': 'GASPROD_11', 'WATERPROD_BBL_11': 'WATERPROD_11',
+                             'LIQUIDSPROD_BBL_12': 'OILPROD_12', 'GASPROD_MCF_12': 'GASPROD_12', 'WATERPROD_BBL_12': 'WATERPROD_12',
+                             })
+            .assign(WELL_COUNT=1)
+            .query("OFFSHORE == 'Y'")
+            )
+
+        # Replace NaNs with zeros before summing annual production
+        Prism_data.loc[:, Prism_data.columns.str.contains('GASPROD_')] = Prism_data.loc[:, Prism_data.columns.str.contains('GASPROD_')].fillna(0)
+        Prism_data.loc[:, Prism_data.columns.str.contains('OILPROD_')] = Prism_data.loc[:, Prism_data.columns.str.contains('OILPROD_')].fillna(0)
+        Prism_data.loc[:, Prism_data.columns.str.contains('WATERPROD_')] = Prism_data.loc[:, Prism_data.columns.str.contains('WATERPROD_')].fillna(0)
+
+        # Calculate cumulative annual production totals for gas, oil, and water
+        Prism_data['CUM_GAS'] = Prism_data.loc[:, Prism_data.columns.str.contains('GASPROD_')].sum(axis=1)
+        Prism_data['CUM_OIL'] = Prism_data.loc[:, Prism_data.columns.str.contains('OILPROD_')].sum(axis=1)
+        Prism_data['CUM_WATER'] = Prism_data.loc[:, Prism_data.columns.str.contains('WATERPROD_')].sum(axis=1)
+
+        # Save out the data for that year
+        Enverus_data_dict[f'{iyear}'] = Prism_data
+
+        del Prism_data
+
+    # Correct Enverus Data for Select States
+
+    # 1) Read in the coverage table from the ERG state well counts file
+    # (specifies the first year with bad data and which years need to be corrected;
+    # all years including and after the first bad year of data need to be corrected)
+
+    ERG_StateWellCounts_LastGoodDataYear = (pd.read_excel(
+        enverus_well_counts_path,
+        sheet_name="2021 - Coverage",
+        usecols={"State", "Last Good Year"},
+        skiprows=2,
+        nrows=40)
+        )
+
+    # 2) Loop through each state and year in the Enverus data to determine whether the data
+    # for that particular year need to be corrected. At the moment, the only correction ERG
+    # makes is to reuse the prior year of data when no new Enverus data were reported for
+    # that state. If a particular state is not included in the Enverus dataset for any year,
+    # then a row of zeros is added to the Enverus table for that year.
+
+    offshore_states = ['AL', 'CAO', 'FL', 'LA', 'MS', 'TX']
+
+    for istate_code in offshore_states:
+        correctdata = 0
+        lastgoodyear = ERG_StateWellCounts_LastGoodDataYear['Last Good Year'][ERG_StateWellCounts_LastGoodDataYear['State'] == istate_code].values
+        if lastgoodyear == max_year:
+            lastgoodyear = max_year + 5  # if the state isn't in the correction list, don't correct any data
+
+        for iyear in years:
+            enverus_data_temp = Enverus_data_dict[f'{iyear}'].copy()
+            state_list = np.unique(enverus_data_temp['STATE_CODE'])
+            inlist = 1 if istate_code in state_list else 0
+            if inlist == 1 or correctdata == 1:  # the state is in the Enverus data, or had at least one good year of data
+                # (in the first year, correctdata is zero, and inlist is also zero if there is no Enverus data)
+                # check whether corrections are necessary for the given year/state
+                if iyear == lastgoodyear:
+                    print(istate_code, iyear, 'last good year')
+                    # This is the last year of good data. Do not correct the data, but save it
+                    # so that it can be used for all following years for that state
+                    temp_data = enverus_data_temp[enverus_data_temp['STATE_CODE'] == istate_code]
+                    correctdata = 1
+                elif iyear > lastgoodyear:
+                    print(istate_code, iyear)
+                    # correct data for all years after the last good year (remove old data first if necessary)
+                    if inlist == 1:
+                        enverus_data_temp = enverus_data_temp[enverus_data_temp['STATE_CODE'] != istate_code]
+                    enverus_data_temp = pd.concat([enverus_data_temp, temp_data], ignore_index=True)
+                    print(istate_code + ' data for ' + str(iyear) + ' were corrected with ' + str(lastgoodyear) + ' data')
+                else:
+                    pass  # iyear is before the last good year; no correction needed
+
+            if inlist == 0 and correctdata == 0:
+                # there is no Enverus data for this state and no good prior year to carry forward; flag it
+                print(istate_code + ' has no Enverus data in the year ' + str(iyear))
+
+            # save that year of Enverus data
+            enverus_data_temp.reset_index(drop=True, inplace=True)
+            Enverus_data_dict[f'{iyear}'] = enverus_data_temp.copy()
+            tempoutput_filename = f'formatted_raw_enverus_offshore_tempoutput_{iyear}.csv'
+            tempoutput_filepath = os.path.join(intermediate_outputs_path, tempoutput_filename)
+            enverus_data_temp.to_csv(tempoutput_filepath, index=False)
+
+    # create proxy dataframes
+    ng_state_gom_offshore_df = pd.DataFrame()
+    oil_state_gom_offshore_df = pd.DataFrame()
+    oil_pac_fed_state_df = pd.DataFrame()
+
+    # ng proxy
+    for iyear in years:
+        ng_data_temp = (Enverus_data_dict[f'{iyear}']
+                        .query("STATE_CODE.isin(@offshore_states)")
+                        .query("CUM_GAS > 0")
+                        .assign(gas_to_oil_ratio=lambda df: df['CUM_GAS'] / df['CUM_OIL'])
+                        .assign(year=str(iyear))
+                        .replace(np.inf, 0)
+                        .query("gas_to_oil_ratio > 100 | GOR_QUAL == 'Gas only'")
+                        )
+        # Include wells in the map only for months with gas production (emissions ~ when production is occurring)
+        for imonth in range(1, 13):
+            imonth_str = f"{imonth:02}"  # convert to 2-digit months
+            year_month_str = str(iyear) + '-' + imonth_str
+            gas_prod_str = 'GASPROD_' + imonth_str
+            # Offshore data for imonth
+            ng_data_imonth_temp = (ng_data_temp
+                                   .query(f"{gas_prod_str} > 0")
+                                   .assign(year_month=year_month_str)
+                                   )
+            ng_data_imonth_temp = (ng_data_imonth_temp[[
+                'year', 'year_month', 'STATE_CODE', 'LATITUDE', 'LONGITUDE',
+                'WELL_COUNT', gas_prod_str]]
+                )
+            # State GOM offshore gas well count
+            ng_state_gom_offshore_imonth = (ng_data_imonth_temp[['year', 'year_month', 'STATE_CODE', 'LATITUDE', 'LONGITUDE', 'WELL_COUNT']]
+                                            .rename(columns=lambda x: str(x).lower())
+                                            .rename(columns={"well_count": "proxy_data"})
+                                            .reset_index(drop=True)
+                                            )
+            ng_state_gom_offshore_df = pd.concat([ng_state_gom_offshore_df, ng_state_gom_offshore_imonth])
+
+    # oil proxies
+    for iyear in years:
+        oil_data_temp = (Enverus_data_dict[f'{iyear}']
+                         .query("STATE_CODE.isin(@offshore_states)")
+                         .query("CUM_OIL > 0")
+                         .assign(gas_to_oil_ratio=lambda df: df['CUM_GAS'] / df['CUM_OIL'])
+                         .assign(year=str(iyear))
+                         .replace(np.inf, 0)
+                         .query("gas_to_oil_ratio <= 100")
+                         .query("GOR_QUAL == 'Liq only' | GOR_QUAL == 'Liq+Gas'")
+                         )
+        # Include wells in the map only for months with oil production (emissions ~ when production is occurring)
+        for imonth in range(1, 13):
+            imonth_str = f"{imonth:02}"  # convert to 2-digit months
+            year_month_str = str(iyear) + '-' + imonth_str
+            oil_prod_str = 'OILPROD_' + imonth_str
+            # Offshore data for imonth
+            oil_data_imonth_temp = (oil_data_temp
+                                    .query(f"{oil_prod_str} > 0")
+                                    .assign(year_month=year_month_str)
+                                    )
oil_data_imonth_temp = (oil_data_imonth_temp[[ + 'year', 'year_month', 'STATE_CODE', 'LATITUDE', 'LONGITUDE', + 'WELL_COUNT', oil_prod_str,]] + ) + # State GOM Offshore Oil Well Count + oil_state_gom_offshore_imonth = (oil_data_imonth_temp[['year', 'year_month', 'STATE_CODE', 'LATITUDE', 'LONGITUDE', 'WELL_COUNT']] + .rename(columns=lambda x: str(x).lower()) + .rename(columns={"well_count": "proxy_data"}) + .query("state_code != 'CAO'") + .reset_index(drop=True) + ) + oil_state_gom_offshore_df = pd.concat([oil_state_gom_offshore_df, oil_state_gom_offshore_imonth]) + # Pacific Federal State Offshore Oil Well Count + oil_pac_fed_state_imonth = (oil_data_imonth_temp[['year', 'year_month', 'STATE_CODE', 'LATITUDE', 'LONGITUDE', 'WELL_COUNT']] + .rename(columns=lambda x: str(x).lower()) + .rename(columns={"well_count": "proxy_data"}) + .query("state_code == 'CAO'") + .reset_index(drop=True) + ) + oil_pac_fed_state_df = pd.concat([oil_pac_fed_state_df, oil_pac_fed_state_imonth]) + + # Calculate relative emissions and convert to a geodataframe + ng_state_gom_offshore_df = calc_enverus_rel_emi(ng_state_gom_offshore_df) + ng_state_gom_offshore_df = enverus_df_to_gdf(ng_state_gom_offshore_df) + oil_state_gom_offshore_df = calc_enverus_rel_emi(oil_state_gom_offshore_df) + oil_state_gom_offshore_df = enverus_df_to_gdf(oil_state_gom_offshore_df) + oil_pac_fed_state_df = calc_enverus_rel_emi(oil_pac_fed_state_df) + oil_pac_fed_state_df = enverus_df_to_gdf(oil_pac_fed_state_df) + + # Check that relative emissions sum to 1.0 each state/year combination + sums = ng_state_gom_offshore_df.groupby(["state_code", "year"])["rel_emi"].sum() # get sums to check normalization + assert np.isclose(sums, 1.0, atol=1e-8).all(), f"Relative emissions do not sum to 1 for each year and state; {sums}" # assert that the sums are close to 1 + sums = oil_state_gom_offshore_df.groupby(["state_code", "year"])["rel_emi"].sum() # get sums to check normalization + assert np.isclose(sums, 1.0, atol=1e-8).all(), f"Relative emissions do not sum to 1 for each year and state; {sums}" # assert that the sums are close to 1 + sums = oil_pac_fed_state_df.groupby(["state_code", "year"])["rel_emi"].sum() # get sums to check normalization + assert np.isclose(sums, 1.0, atol=1e-8).all(), f"Relative emissions do not sum to 1 for each year and state; {sums}" # assert that the sums are close to 1 + + # Output Proxy Parquet Files + ng_state_gom_offshore_df = ng_state_gom_offshore_df.astype({'year':str}) + ng_state_gom_offshore_df.to_parquet(ng_state_gom_offshore_output_path) + oil_state_gom_offshore_df = oil_state_gom_offshore_df.astype({'year':str}) + oil_state_gom_offshore_df.to_parquet(oil_state_gom_offshore_output_path) + oil_pac_fed_state_df = oil_pac_fed_state_df.astype({'year':str}) + oil_pac_fed_state_df.to_parquet(oil_pac_fed_state_output_path) + + return None diff --git a/gch4i/proxy_processing/task_ng_production_proxy.py b/gch4i/proxy_processing/task_ng_production_proxy.py deleted file mode 100644 index 09fff7d..0000000 --- a/gch4i/proxy_processing/task_ng_production_proxy.py +++ /dev/null @@ -1,1007 +0,0 @@ -# %% -from pathlib import Path -import os -from typing import Annotated -from zipfile import ZipFile -import calendar -import datetime - -from pyarrow import parquet -import pandas as pd -import osgeo -import geopandas as gpd -import numpy as np -import seaborn as sns -import shapefile as shp -from pytask import Product, task, mark - -from gch4i.config import ( - V3_DATA_PATH, - proxy_data_dir_path, - global_data_dir_path, - 
sector_data_dir_path, - max_year, - min_year, - years, -) - -from gch4i.utils import us_state_to_abbrev - -# %% -@mark.persist -@task(id="ng_production_proxy") -def task_get_ng_production_proxy_data( - state_path: Path = global_data_dir_path / "tl_2020_us_state.zip", - nems_region_dict_path: Path = sector_data_dir_path / "enverus/NEMS_Region_Dictionary.xlsx", - enverus_production_path: Path = sector_data_dir_path / "enverus/production", - enverus_well_counts_path: Path = sector_data_dir_path / "enverus/production/temp_data_v2/Enverus DrillingInfo Processing - Well Counts_2021-03-17.xlsx", - nei_path: Path = sector_data_dir_path / "nei_og", - all_well_count_output_path: Annotated[Path, Product] = proxy_data_dir_path / "ng_all_well_count_proxy.parquet", - conv_well_count_output_path: Annotated[Path, Product] = proxy_data_dir_path / "ng_conv_well_count_proxy.parquet", - hf_well_count_output_path: Annotated[Path, Product] = proxy_data_dir_path / "ng_hf_well_count_proxy.parquet", - all_well_prod_output_path: Annotated[Path, Product] = proxy_data_dir_path / "ng_all_well_prod_proxy.parquet", - basin_220_prod_output_path: Annotated[Path, Product] = proxy_data_dir_path / "ng_basin_220_prod_proxy.parquet", - basin_395_prod_output_path: Annotated[Path, Product] = proxy_data_dir_path / "ng_basin_395_prod_proxy.parquet", - basin_430_prod_output_path: Annotated[Path, Product] = proxy_data_dir_path / "ng_basin_430_prod_proxy.parquet", - basin_other_prod_output_path: Annotated[Path, Product] = proxy_data_dir_path / "ng_basin_other_prod_proxy.parquet", - water_prod_output_path: Annotated[Path, Product] = proxy_data_dir_path / "ng_water_prod_proxy.parquet", - conv_well_comp_output_path: Annotated[Path, Product] = proxy_data_dir_path / "ng_conv_well_comp_proxy.parquet", - hf_well_comp_output_path: Annotated[Path, Product] = proxy_data_dir_path / "ng_hf_well_comp_proxy.parquet", - drilled_well_output_path: Annotated[Path, Product] = proxy_data_dir_path / "ng_drilled_well_proxy.parquet", - state_gom_offshore_well_count_output_path: Annotated[Path, Product] = proxy_data_dir_path / "ng_state_gom_offshore_well_count_proxy.parquet", - state_gom_offshore_well_prod_output_path: Annotated[Path, Product] = proxy_data_dir_path / "ng_state_gom_offshore_well_prod_proxy.parquet", - ): - """ - Data come from Enverus, both Drilling Info and Prism - The reason 2 datasets are used is because Prism does not include all states - So remaining states, or those with more DI coverage are taken from DI - - DI: KS, MD, MI, MO, OK, TN - - Prism: AK, AL, AR, AZ, CA, CAO (California Offshore), CO, FL, KY, LA, MS, MT, ND, - NE, NGOM (federal offshore waters in the Gulf of Mexico), NM, NV, NY, OH, OR, PA, - SD, TX, UT, VA, WV, WY - - States with no Enverus Data: CT, DE, DC, GA, HI, ID, IL*, IN*, IA, ME, MA, MN, NH, - NJ, NC, RI, SC, VT, WA, WI, US territories. These states are assumed to have no oil - and gas production with an exception for IL and IN. - - *IL and IN do not report to Enverus, but do have oil and gas production. Production - data is taken from the Energy Information Administration (EIA). 
- - TODO: Update enverus_well_counts_path with v3 data (currently using v2 data) - """ - - # Functions: - # Define safe devide to set result to zero if denominator is zero - def safe_div(x,y): - if y == 0: - return 0 - return x / y - - # STEP 1: Load in State ANSI data and NEMS definitions - - state_gdf = ( - gpd.read_file(state_path) - .loc[:, ["NAME", "STATEFP", "STUSPS", "geometry"]] - .rename(columns=str.lower) - .rename(columns={"stusps": "state_code", "name": "state_name"}) - .astype({"statefp": int}) - # get only lower 48 + DC - .query("(statefp < 60) & (statefp != 2) & (statefp != 15)") - .reset_index(drop=True) - .to_crs(4326) - ) - - # Make NEMS State classifications - # Treat NM and TX separately since these states cover multiple NEMS regions - - # 0 = NE, 1 = MC, 2 = RM, 3 = SW, 4 = WC, 5 = GC, 6 = offshore - NEMS_State = pd.read_excel(nems_region_dict_path) - NEMS_State = NEMS_State.fillna(0) - NM_idx = NEMS_State.index[NEMS_State['State_Name'].str.contains('New Mexico')].tolist() - TX_idx = NEMS_State.index[NEMS_State['State_Name'].str.contains('Texas')].tolist() - idx = NM_idx+TX_idx - NEMS_State= NEMS_State.drop(NEMS_State.index[idx]) - NEMS_State.reset_index(drop=True,inplace=True) - - NEMS_dict = {'North East':0, 'Midcontinent':1,'Rocky Mountain':2,'South West':3,'West Coast':4,'Gulf Coast':5} - - # STEP 2: Read-in and Format Proxy Data - - # STEP 2.1: State Condensate Data - - # TODO: state condensate data code - - # STEP 2.2: GOADS Emissions Data - - # TODO: GOADS emissions data code - - # STEP 2.3: Well and Production Data (from Enverus) - - # STEP 2.3.1: Read In & Combine Each Year of Prism & DI Monthly Data (from Enverus) - - # Data come from Enverus, both Drilling Info and Prism - # The reason 2 datasets are used is because Prism does not include all states - # So remaining states, or those with more DI coverage are taken from DI - - # Read In and Format the Prism and DI data - # 1. Read Data - # 2. Drop unsed columns, rename columns to match between DI and Prism - # 3. Combine DI and Prism into one data array - # 4. Calculate annual cummulate production totals - # 5. 
Save the data as a year-specific variable - - # Based on ERGs logic, active wells are determined based on their production levels and not producing status - Enverus_data_dict = {} - DI_data_dict = {} - Prism_data_dict = {} - for iyear in years: - #DI data - DI_file_name = f"didsk_monthly_{iyear}.csv" - DI_file_path = os.path.join(enverus_production_path, DI_file_name) - DI_data = (pd.read_csv( - DI_file_path, - usecols=['WELL_COUNT_ID','STATE','COUNTY','BASIN','AAPG_CODE_ERG', - 'NEMS_REGION_ERG','LATITUDE','LONGITUDE','STATUS','COMPDATE', - 'SPUDDATE','FIRSTPRODDATE','HF','OFFSHORE','GOR', - 'GOR_QUAL','PROD_FLAG','PRODYEAR', - 'LIQ_01','GAS_01','WTR_01','LIQ_02','GAS_02','WTR_02', - 'LIQ_03','GAS_03','WTR_03','LIQ_04','GAS_04','WTR_04', - 'LIQ_05','GAS_05','WTR_05','LIQ_06','GAS_06','WTR_06', - 'LIQ_07','GAS_07','WTR_07','LIQ_08','GAS_08','WTR_08', - 'LIQ_09','GAS_09','WTR_09','LIQ_10','GAS_10','WTR_10', - 'LIQ_11','GAS_11','WTR_11','LIQ_12','GAS_12','WTR_12',], - dtype={7:'str'}) - .rename(columns={'WELL_COUNT_ID':'WELL_COUNT','STATE':'STATE_CODE', - 'NEMS_REGION_ERG':'NEMS_REGION', 'STATUS':'PRODUCING_STATUS', - 'LIQ_01':'OILPROD_01','GAS_01':'GASPROD_01','WTR_01':'WATERPROD_01', - 'LIQ_02':'OILPROD_02','GAS_02':'GASPROD_02','WTR_02':'WATERPROD_02', - 'LIQ_03':'OILPROD_03','GAS_03':'GASPROD_03','WTR_03':'WATERPROD_03', - 'LIQ_04':'OILPROD_04','GAS_04':'GASPROD_04','WTR_04':'WATERPROD_04', - 'LIQ_05':'OILPROD_05','GAS_05':'GASPROD_05','WTR_05':'WATERPROD_05', - 'LIQ_06':'OILPROD_06','GAS_06':'GASPROD_06','WTR_06':'WATERPROD_06', - 'LIQ_07':'OILPROD_07','GAS_07':'GASPROD_07','WTR_07':'WATERPROD_07', - 'LIQ_08':'OILPROD_08','GAS_08':'GASPROD_08','WTR_08':'WATERPROD_08', - 'LIQ_09':'OILPROD_09','GAS_09':'GASPROD_09','WTR_09':'WATERPROD_09', - 'LIQ_10':'OILPROD_10','GAS_10':'GASPROD_10','WTR_10':'WATERPROD_10', - 'LIQ_11':'OILPROD_11','GAS_11':'GASPROD_11','WTR_11':'WATERPROD_11', - 'LIQ_12':'OILPROD_12','GAS_12':'GASPROD_12','WTR_12':'WATERPROD_12',}) - .assign(WELL_COUNT=1) # TODO: Check to see if this should actually be set to 1 - ) - # Format completion date (YYYY-MM) - for iwell in range(0,len(DI_data)): - comp_date = str(DI_data.loc[iwell, 'COMPDATE']) - if comp_date == 'NaN': - comp_year_month = 'NaN' - elif comp_date == 'nan': - comp_year_month = 'NaN' - else: # date format M/DD/YYYY - comp_month = f"{int(comp_date.split('/')[0]):02}" - comp_year = f"{int(comp_date.split('/')[2])}" - comp_year_month = str(comp_year)+'-'+str(comp_month) - DI_data.loc[iwell, 'comp_year_month'] = comp_year_month - # Format spud date (YYYY) - for iwell in range(0,len(DI_data)): - spud_date = str(DI_data.loc[iwell, 'SPUDDATE']) - if spud_date == 'NaN': - spud_year = 'NaN' - elif spud_date == 'nan': - spud_year = 'NaN' - else: # date format M/DD/YYYY - spud_year = f"{int(spud_date.split('/')[2])}" - spud_year = str(spud_year) - DI_data.loc[iwell, 'spud_year'] = spud_year - # Format first production date (YYYY) - for iwell in range(0,len(DI_data)): - first_prod_date = str(DI_data.loc[iwell, 'FIRSTPRODDATE']) - if first_prod_date == 'NaN': - first_prod_year = 'NaN' - elif first_prod_date == 'nan': - first_prod_year = 'NaN' - else: # date format M/DD/YYYY - first_prod_year = f"{int(first_prod_date.split('/')[2])}" - first_prod_year = str(first_prod_year) - DI_data.loc[iwell, 'first_prod_year'] = first_prod_year - DI_data_dict[f'{iyear}'] = DI_data - - # Prism Data - Prism_file_name = f"prism_monthly_{iyear}.csv" - Prism_file_path = os.path.join(enverus_production_path, Prism_file_name) - Prism_data = 
(pd.read_csv( - Prism_file_path, - usecols=['STATE','COUNTY','ENVBASIN','AAPG_CODE_ERG', - 'NEMS_REGION_ERG','LATITUDE','LONGITUDE','ENVWELLSTATUS','COMPLETIONDATE', - 'SPUDDATE','FIRSTPRODDATE','HF','OFFSHORE','GOR', - 'GOR_QUAL','PROD_FLAG','PRODYEAR', - 'LIQUIDSPROD_BBL_01','GASPROD_MCF_01','WATERPROD_BBL_01', - 'LIQUIDSPROD_BBL_02','GASPROD_MCF_02','WATERPROD_BBL_02', - 'LIQUIDSPROD_BBL_03','GASPROD_MCF_03','WATERPROD_BBL_03', - 'LIQUIDSPROD_BBL_04','GASPROD_MCF_04','WATERPROD_BBL_04', - 'LIQUIDSPROD_BBL_05','GASPROD_MCF_05','WATERPROD_BBL_05', - 'LIQUIDSPROD_BBL_06','GASPROD_MCF_06','WATERPROD_BBL_06', - 'LIQUIDSPROD_BBL_07','GASPROD_MCF_07','WATERPROD_BBL_07', - 'LIQUIDSPROD_BBL_08','GASPROD_MCF_08','WATERPROD_BBL_08', - 'LIQUIDSPROD_BBL_09','GASPROD_MCF_09','WATERPROD_BBL_09', - 'LIQUIDSPROD_BBL_10','GASPROD_MCF_10','WATERPROD_BBL_10', - 'LIQUIDSPROD_BBL_11','GASPROD_MCF_11','WATERPROD_BBL_11', - 'LIQUIDSPROD_BBL_12','GASPROD_MCF_12','WATERPROD_BBL_12',], - dtype={7:'str'}) - .rename(columns={'STATE':'STATE_CODE', 'ENVBASIN':'BASIN', - 'NEMS_REGION_ERG':'NEMS_REGION', 'ENVWELLSTATUS':'PRODUCING_STATUS', - 'COMPLETIONDATE':'COMPDATE', - 'LIQUIDSPROD_BBL_01':'OILPROD_01','GASPROD_MCF_01':'GASPROD_01','WATERPROD_BBL_01':'WATERPROD_01', - 'LIQUIDSPROD_BBL_02':'OILPROD_02','GASPROD_MCF_02':'GASPROD_02','WATERPROD_BBL_02':'WATERPROD_02', - 'LIQUIDSPROD_BBL_03':'OILPROD_03','GASPROD_MCF_03':'GASPROD_03','WATERPROD_BBL_03':'WATERPROD_03', - 'LIQUIDSPROD_BBL_04':'OILPROD_04','GASPROD_MCF_04':'GASPROD_04','WATERPROD_BBL_04':'WATERPROD_04', - 'LIQUIDSPROD_BBL_05':'OILPROD_05','GASPROD_MCF_05':'GASPROD_05','WATERPROD_BBL_05':'WATERPROD_05', - 'LIQUIDSPROD_BBL_06':'OILPROD_06','GASPROD_MCF_06':'GASPROD_06','WATERPROD_BBL_06':'WATERPROD_06', - 'LIQUIDSPROD_BBL_07':'OILPROD_07','GASPROD_MCF_07':'GASPROD_07','WATERPROD_BBL_07':'WATERPROD_07', - 'LIQUIDSPROD_BBL_08':'OILPROD_08','GASPROD_MCF_08':'GASPROD_08','WATERPROD_BBL_08':'WATERPROD_08', - 'LIQUIDSPROD_BBL_09':'OILPROD_09','GASPROD_MCF_09':'GASPROD_09','WATERPROD_BBL_09':'WATERPROD_09', - 'LIQUIDSPROD_BBL_10':'OILPROD_10','GASPROD_MCF_10':'GASPROD_10','WATERPROD_BBL_10':'WATERPROD_10', - 'LIQUIDSPROD_BBL_11':'OILPROD_11','GASPROD_MCF_11':'GASPROD_11','WATERPROD_BBL_11':'WATERPROD_11', - 'LIQUIDSPROD_BBL_12':'OILPROD_12','GASPROD_MCF_12':'GASPROD_12','WATERPROD_BBL_12':'WATERPROD_12',}) - .assign(WELL_COUNT=1) - ) - # Format completion date (YYYY-MM) - for iwell in range(0,len(Prism_data)): - comp_date = str(Prism_data.loc[iwell, 'COMPDATE']) - if comp_date == 'NaN': - comp_year_month = 'NaN' - elif comp_date == 'nan': - comp_year_month = 'NaN' - else: # date format YYYY-MM-DD - comp_month = f"{int(comp_date.split('-')[1]):02}" - comp_year = f"{int(comp_date.split('-')[0])}" - comp_year_month = str(comp_year)+'-'+str(comp_month) - Prism_data.loc[iwell, 'comp_year_month'] = comp_year_month - # Format spud date (YYYY) - for iwell in range(0,len(Prism_data)): - spud_date = str(Prism_data.loc[iwell, 'SPUDDATE']) - if spud_date == 'NaN': - spud_year = 'NaN' - elif spud_date == 'nan': - spud_year = 'NaN' - else: # date format YYYY-MM-DD - spud_year = f"{int(spud_date.split('-')[0])}" - spud_year = str(spud_year) - Prism_data.loc[iwell, 'spud_year'] = spud_year - # Format first production date (YYYY) - for iwell in range(0,len(Prism_data)): - first_prod_date = str(Prism_data.loc[iwell, 'FIRSTPRODDATE']) - if first_prod_date == 'NaN': - first_prod_year = 'NaN' - elif first_prod_date == 'nan': - first_prod_year = 'NaN' - else: # date format YYYY-MM-DD - 
first_prod_year = f"{int(first_prod_date.split('-')[0])}" - Prism_data.loc[iwell, 'first_prod_year'] = first_prod_year - Prism_data_dict[f'{iyear}'] = Prism_data - - # Combine into one array with common column names, replace NaNs with zeros, and sum annual production - Enverus_data = pd.concat([DI_data, Prism_data], ignore_index=True) - Enverus_data.loc[:, Enverus_data.columns.str.contains('GASPROD_')] = Enverus_data.loc[:, Enverus_data.columns.str.contains('GASPROD_')].fillna(0) - Enverus_data.loc[:, Enverus_data.columns.str.contains('OILPROD_')] = Enverus_data.loc[:, Enverus_data.columns.str.contains('OILPROD_')].fillna(0) - Enverus_data.loc[:, Enverus_data.columns.str.contains('WATERPROD_')] = Enverus_data.loc[:, Enverus_data.columns.str.contains('WATERPROD_')].fillna(0) - - # Calculate cumulative annual production totals for Gas, Oil, Water - Enverus_data['CUM_GAS'] = Enverus_data.loc[:, Enverus_data.columns.str.contains('GASPROD_')].sum(1) - Enverus_data['CUM_OIL'] = Enverus_data.loc[:, Enverus_data.columns.str.contains('OILPROD_')].sum(1) - Enverus_data['CUM_WATER'] = Enverus_data.loc[:, Enverus_data.columns.str.contains('WATERPROD_')].sum(1) - - Enverus_data['NEMS_CODE'] = Enverus_data['NEMS_REGION'].map(NEMS_dict) - - # Save out the data for that year - Enverus_data_dict[f'{iyear}'] = Enverus_data - - del Prism_data - del DI_data # save memory space - - # Define default values for a new row in this table (to be used later during data corrections) - default = {'WELL_COUNT': 0, 'STATE_CODE':'', 'COUNTY':'', 'NEMS_REGION':'UNK', - 'AAPG_CODE_ERG':'UNK', 'LATITUDE':0, 'LONGITUDE':0, - 'PRODUCING_STATUS':'', 'BASIN':'', 'SPUDDATE':'', 'COMPDATE':'', - 'FIRSTPRODDATE':'', 'HF':'', 'OFFSHORE':'', 'GOR':-99, - 'GOR_QUAL':'', 'PROD_FLAG':'', 'PRODYEAR':'', - 'OILPROD_01':0, 'GASPROD_01':0, 'WATERPROD_01':0, 'OILPROD_02':0, 'GASPROD_02':0, 'WATERPROD_02':0, - 'OILPROD_03':0, 'GASPROD_03':0, 'WATERPROD_03':0, 'OILPROD_04':0, 'GASPROD_04':0, 'WATERPROD_04':0, - 'OILPROD_05':0, 'GASPROD_05':0, 'WATERPROD_05':0, 'OILPROD_06':0, 'GASPROD_06':0, 'WATERPROD_06':0, - 'OILPROD_07':0, 'GASPROD_07':0, 'WATERPROD_07':0, 'OILPROD_08':0, 'GASPROD_08':0, 'WATERPROD_08':0, - 'OILPROD_09':0, 'GASPROD_09':0, 'WATERPROD_09':0, 'OILPROD_10':0, 'GASPROD_10':0, 'WATERPROD_10':0, - 'OILPROD_11':0, 'GASPROD_11':0, 'WATERPROD_11':0, 'OILPROD_12':0, 'GASPROD_12':0, 'WATERPROD_12':0, - 'CUM_GAS':0, 'CUM_OIL':0, 'CUM_WATER':0, 'NEMS_CODE':99} - - # Correct the NEMS code for missing NEMS_REGIONs - # Note: OFFSHORE regions will have NaN as NEMS_CODE - for iyear in years: - enverus_data_temp = Enverus_data_dict[f'{iyear}'] - list_well = enverus_data_temp.index[pd.isna(enverus_data_temp.loc[:,'NEMS_REGION'])].tolist() - if np.size(list_well) > 0: - for irow in list_well: - match_state = np.where(NEMS_State['State_Code']==enverus_data_temp['STATE_CODE'][irow])[0][0] - enverus_data_temp.loc[irow,'NEMS_CODE'] = NEMS_State['NEMS'][match_state].astype(int) - Enverus_data_dict[f'{iyear}'] = enverus_data_temp.copy() - - # STEP 2.3.2: Correct Enverus Data for Select States - - # 1) Read in the coverage table from the State Well Counts file from ERG - # (specifies the first year with bad data and which years need to be corrected; - # all years including and after the first bad year of data need to be corrected) - - ERG_StateWellCounts_LastGoodDataYear = (pd.read_excel( - enverus_well_counts_path, - sheet_name="2021 - Coverage", - usecols={"State", "Last Good Year"}, - skiprows=2, - nrows=40) - )
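A compact illustration of the correction rule described in step 2 below, using made-up years (the real loop operates on the Enverus data dictionaries): once a state's data goes bad, every later year reuses the last good year's rows.

# If a state's last good data year is 2019, years after 2019 fall back to 2019.
last_good_year = 2019
for yr in [2018, 2019, 2020, 2021]:
    source_year = yr if yr <= last_good_year else last_good_year
    # 2018 -> 2018, 2019 -> 2019, 2020 -> 2019, 2021 -> 2019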
through each state and year in Enverus to determine whether the data for that particular year needs to - # be corrected. At the moment, the only correction ERG makes is to reuse the prior year of data when there - # is no new Enverus data reported for that state. If a particular state is not included for any year in the Enverus - # dataset, then a row of zeros is added to the Enverus table for that year. - - for istate in np.arange(0, len(state_gdf)): - correctdata = 0 - istate_code = state_gdf['state_code'][istate] - lastgoodyear = ERG_StateWellCounts_LastGoodDataYear['Last Good Year'][ERG_StateWellCounts_LastGoodDataYear['State'] == istate_code].values - if lastgoodyear == max_year: - lastgoodyear = max_year + 5 # if the state isn't included in the correction list, don't correct any data - - for iyear in years: - enverus_data_temp = Enverus_data_dict[f'{iyear}'].copy() - state_list = np.unique(enverus_data_temp['STATE_CODE']) - if istate_code in state_list: - inlist = 1 - else: - inlist = 0 - if inlist == 1 or correctdata == 1: # if the state is included in Enverus data, or had data for at least one good year - # in the first year, correctdata will be zero, but inlist will also be zero if there is no Enverus data - # check whether corrections are necessary for the given year/state - if iyear == lastgoodyear: - print(istate_code, iyear, 'last good year') - # This is the last year of good data. Do not correct the data, but save it - # so that it can be used for all following years for that state - temp_data = enverus_data_temp[enverus_data_temp['STATE_CODE'] == istate_code] - correctdata = 1 - elif iyear > lastgoodyear: - print(istate_code, iyear) - # correct data for all years equal to and after the first bad year (remove old data first if necessary) - if inlist == 1: - enverus_data_temp = enverus_data_temp[enverus_data_temp['STATE_CODE'] != istate_code] - enverus_data_temp = pd.concat([enverus_data_temp, temp_data], ignore_index=True) - print(istate_code + ' data for ' + str(iyear) + ' were corrected with ' + str(lastgoodyear) + ' data') - else: - # year_range[iyear] < firstbadyear: - # no data corrections if the current year is before the first bad year - # print('no corrections') - # print(state_str, year_range[iyear]) - no_corrections = 1 - - if inlist == 0 and correctdata == 0: - # if there is no Enverus data for a given state, and there was no good data, add a row with default values - # temp_row = {'STATE': istate_code} - # enverus_data_temp = enverus_data_temp.append({**default, **temp_row}, ignore_index=True) - print(istate_code + ' has no Enverus data in the year ' + str(iyear)) - - # resave that year of Enverus data - enverus_data_temp.reset_index(drop=True, inplace=True) - Enverus_data_dict[f'{iyear}'] = enverus_data_temp.copy() - - # STEP 2.4: Calculate Fractional Monthly Condensate Arrays - # (EIA condensate production (bbl) relative to producing Enverus gas wells by month - # in each state and region) - - # TODO: fractional monthly condensate array code - - # STEP 2.5: Convert Enverus Well Production Arrays and Condensate Array into Gridded - # Location Arrays - - # clear variables - # del ERG_StateWellCounts_FirstBadDataYear - # del Prism_data - # del colnames - # del names - # del state_condensates - # del temp_data - - # Make annual gridded arrays (maps) of well data (a well will be counted every month if there is any production that year) - # Includes NA gas wells and production onshore in the CONUS region - # Source emissions are related to the presence of a well and its production status (no emission if no production) - # Details: ERG does not include a well in the national count if there is no (cumulative) oil or gas production from that well. - # Wells are not considered active for a given year if there is no production data that year. - # This may cause wells that are completed but not yet producing to be dropped from the national count. - # ERG has developed their own logic to determine if a well is an HF well or not, and that result is included in the - # HF variable in this dataset. This method does not rely on the Enverus well 'Producing Status'. - # Well type (e.g., non-associated gas well) is determined based on the annual production GOR at that well (CUM_GAS / CUM_OIL), - # but the presence of a well will only be included in maps in months where monthly gas prod > 0
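As a concrete illustration of the GOR-based well typing noted above, a toy sketch (the 100 threshold and the 'Gas only' GOR_QUAL flag follow the queries below; the data values are made up):

import numpy as np
import pandas as pd

wells = pd.DataFrame({
    "CUM_GAS": [5000.0, 50.0, 800.0],   # mcf
    "CUM_OIL": [10.0, 10.0, 0.0],       # bbl
    "GOR_QUAL": ["", "Gas only", ""],
})
# GOR = CUM_GAS / CUM_OIL; a zero-oil well divides to inf, which is zeroed out
wells["gas_to_oil_ratio"] = (wells["CUM_GAS"] / wells["CUM_OIL"]).replace(np.inf, 0)
# Gas wells: GOR above 100, or flagged by Enverus as gas-only
gas_wells = wells.query("gas_to_oil_ratio > 100 | GOR_QUAL == 'Gas only'")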
- - # Proxy Data Dataframes: - # Well Counts - all_well_count_df = pd.DataFrame() # Active gas well (conventional + HF) counts in a given month - conv_well_count_df = pd.DataFrame() # Active conventional gas well counts in a given month - hf_well_count_df = pd.DataFrame() # Active HF gas well counts in a given month - # Well-Level Production Volumes - all_well_prod_df = pd.DataFrame() # Active gas well (conventional + HF) gas production in a given month - basin_220_prod_df = pd.DataFrame() # Gas well gas production in Basin 220 in a given month - basin_395_prod_df = pd.DataFrame() # Gas well gas production in Basin 395 in a given month - basin_430_prod_df = pd.DataFrame() # Gas well gas production in Basin 430 in a given month - basin_other_prod_df = pd.DataFrame() # Gas well gas production in Other Basins in a given month - # Water Production Volumes - water_prod_df = pd.DataFrame() - # Well Completions - conv_well_comp_df = pd.DataFrame() # Conventional gas well completions - hf_well_comp_df = pd.DataFrame() # HF gas well completions - # Drilled Gas Wells - drilled_well_df = pd.DataFrame() # Gas wells drilled - # Offshore Well Counts and Production Volumes in State Waters in the Gulf of Mexico - state_gom_offshore_well_count_df = pd.DataFrame() # Offshore state GOM gas well counts - state_gom_offshore_well_prod_df = pd.DataFrame() # Offshore state GOM gas production - - - # Query Enverus data to create dictionaries of proxy data - for iyear in years: - enverus_data_temp = Enverus_data_dict[f'{iyear}'].copy() - - # Onshore Natural Gas Wells - ng_data_temp = (enverus_data_temp - .query("STATE_CODE.isin(@state_gdf['state_code'])") - .query("OFFSHORE == 'N'") - .query("CUM_GAS > 0") - .assign(gas_to_oil_ratio=lambda df: df['CUM_GAS']/df['CUM_OIL']) - .assign(year=str(iyear)) - .replace(np.inf, 0) - .query("gas_to_oil_ratio > 100 | GOR_QUAL == 'Gas only'") - ) - # Offshore Natural Gas Wells - ng_offshore_data_temp = (enverus_data_temp - .query("STATE_CODE.isin(@state_gdf['state_code'])") - .query("OFFSHORE == 'Y'") - .query("CUM_GAS > 0") - .assign(gas_to_oil_ratio=lambda df: df['CUM_GAS']/df['CUM_OIL']) - .assign(year=str(iyear)) - .replace(np.inf, 0) - .query("gas_to_oil_ratio > 100 | GOR_QUAL == 'Gas only'") - )
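The monthly loop below keeps a well on the map only in months with positive gas production. A toy demonstration of that inclusion rule (made-up values):

import pandas as pd

toy = pd.DataFrame({"GASPROD_01": [10.0, 0.0], "GASPROD_02": [0.0, 3.0]})
jan_wells = toy.query("GASPROD_01 > 0")  # only the first well appears in January
feb_wells = toy.query("GASPROD_02 > 0")  # only the second well appears in February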
- # Include wells in the map only for months where there is gas production (emissions ~ when production is occurring) - for imonth in range(1, 13): - imonth_str = f"{imonth:02}" # convert to 2-digit months - year_month_str = str(iyear)+'-'+imonth_str - gas_prod_str = 'GASPROD_'+imonth_str - water_prod_str = 'WATERPROD_'+imonth_str - # onshore data for imonth - ng_data_imonth_temp = (ng_data_temp - .query(f"{gas_prod_str} > 0") - .assign(year_month=str(iyear)+'-'+imonth_str) - ) - ng_data_imonth_temp = (ng_data_imonth_temp[[ - 'year', 'year_month','STATE_CODE','AAPG_CODE_ERG','LATITUDE','LONGITUDE', - 'HF','WELL_COUNT',gas_prod_str,water_prod_str, - 'comp_year_month','spud_year','first_prod_year']] - ) - # offshore data for imonth - ng_offshore_data_imonth_temp = (ng_offshore_data_temp - .query(f"{gas_prod_str} > 0") - .assign(year_month=str(iyear)+'-'+imonth_str) - ) - # keep the offshore subset in its own variable; do not overwrite the onshore ng_data_imonth_temp - ng_offshore_data_imonth_temp = (ng_offshore_data_imonth_temp[[ - 'year','year_month','STATE_CODE','AAPG_CODE_ERG','LATITUDE','LONGITUDE', - 'HF','WELL_COUNT',gas_prod_str,water_prod_str, - 'comp_year_month','spud_year','first_prod_year']] - ) - # Well Counts - # All Gas Well Count - all_well_count_imonth = (ng_data_imonth_temp[['year','year_month','STATE_CODE','LATITUDE','LONGITUDE','WELL_COUNT']] - .rename(columns=lambda x: str(x).lower()) - .rename(columns={"well_count":"proxy_data"}) - .reset_index(drop=True) - ) - all_well_count_df = pd.concat([all_well_count_df, all_well_count_imonth]) - # Conventional Gas Well Count - conv_well_count_imonth = (ng_data_imonth_temp[['year','year_month','STATE_CODE','LATITUDE','LONGITUDE','WELL_COUNT','HF']] - .query("HF != 'Y'") - .drop(columns=["HF"]) - .rename(columns=lambda x: str(x).lower()) - .rename(columns={"well_count":"proxy_data"}) - .reset_index(drop=True) - ) - conv_well_count_df = pd.concat([conv_well_count_df, conv_well_count_imonth]) - # HF Gas Well Count - hf_well_count_imonth = (ng_data_imonth_temp[['year','year_month','STATE_CODE','LATITUDE','LONGITUDE','WELL_COUNT','HF']] - .query("HF == 'Y'") - .drop(columns=["HF"]) - .rename(columns=lambda x: str(x).lower()) - .rename(columns={"well_count":"proxy_data"}) - .reset_index(drop=True) - ) - hf_well_count_df = pd.concat([hf_well_count_df, hf_well_count_imonth]) - - # Gas Production - # All Gas Well Gas Production - all_well_prod_imonth = (ng_data_imonth_temp[['year','year_month','STATE_CODE','LATITUDE','LONGITUDE',gas_prod_str]] - .assign(proxy_data=lambda df: df[gas_prod_str]) - .drop(columns=[gas_prod_str]) - .rename(columns=lambda x: str(x).lower()) - .reset_index(drop=True) - ) - all_well_prod_df = pd.concat([all_well_prod_df, all_well_prod_imonth]) - # Basin 220 Gas Well Gas Production - basin_220_prod_imonth = (ng_data_imonth_temp[['year','year_month','STATE_CODE','LATITUDE','LONGITUDE','AAPG_CODE_ERG',gas_prod_str]] - .query("AAPG_CODE_ERG == '220'") - .assign(proxy_data=lambda df: df[gas_prod_str]) - .drop(columns=[gas_prod_str, 'AAPG_CODE_ERG']) - .rename(columns=lambda x: str(x).lower()) - .reset_index(drop=True) - ) - basin_220_prod_df = pd.concat([basin_220_prod_df, basin_220_prod_imonth]) - # Basin 395 Gas Well Gas Production - basin_395_prod_imonth = (ng_data_imonth_temp[['year','year_month','STATE_CODE','LATITUDE','LONGITUDE','AAPG_CODE_ERG',gas_prod_str]] - .query("AAPG_CODE_ERG == '395'") - .assign(proxy_data=lambda df: df[gas_prod_str]) - .drop(columns=[gas_prod_str, 'AAPG_CODE_ERG']) - .rename(columns=lambda x: str(x).lower()) - .reset_index(drop=True) - ) - basin_395_prod_df = pd.concat([basin_395_prod_df, basin_395_prod_imonth]) - # Basin 430 Gas Well Gas Production - basin_430_prod_imonth = (ng_data_imonth_temp[['year','year_month','STATE_CODE','LATITUDE','LONGITUDE','AAPG_CODE_ERG',gas_prod_str]] - .query("AAPG_CODE_ERG == '430'") - .assign(proxy_data=lambda df: df[gas_prod_str]) - .drop(columns=[gas_prod_str, 'AAPG_CODE_ERG']) - .rename(columns=lambda x: str(x).lower()) - .reset_index(drop=True) - ) - basin_430_prod_df = pd.concat([basin_430_prod_df, basin_430_prod_imonth])
- # Other Basins Gas Well Gas Production - basin_other_prod_imonth = (ng_data_imonth_temp[['year','year_month','STATE_CODE','LATITUDE','LONGITUDE','AAPG_CODE_ERG',gas_prod_str]] - .query("AAPG_CODE_ERG != '220' & AAPG_CODE_ERG != '395' & AAPG_CODE_ERG != '430'") - .assign(proxy_data=lambda df: df[gas_prod_str]) - .drop(columns=[gas_prod_str, 'AAPG_CODE_ERG']) - .rename(columns=lambda x: str(x).lower()) - .reset_index(drop=True) - ) - basin_other_prod_df = pd.concat([basin_other_prod_df, basin_other_prod_imonth]) - - # Water Production - # Data source by state is defined in the Enverus DrillingInfo Processing - Produced - # Water_2023-11-14_forGridding.xlsx file. - if iyear < 2016: # WV uses NEI data - water_prod_enverus_states = ['AK','AL','AR','AZ','CA','CO','FL','LA', - 'MI','MO','MS','MT','ND','NE','NM','NV', - 'NY','OH','SD','TX','UT','VA','WY' - ] - # States using NEI for reference: ['IL','IN','KS','OK','PA','WV'] - else: # 2016 and beyond; WV uses Enverus data - water_prod_enverus_states = ['AK','AL','AR','AZ','CA','CO','FL','LA', - 'MI','MO','MS','MT','ND','NE','NM','NV', - 'NY','OH','SD','TX','UT','VA','WY','WV' - ] # WV uses Enverus - # States using NEI for reference: ['IL','IN','KS','OK','PA'] - # Enverus water production for applicable states (NEI water production will - # be added in the NEI section of the code below) - water_prod_imonth = (ng_data_imonth_temp[['year','year_month','STATE_CODE','LATITUDE','LONGITUDE',water_prod_str]] - .query("STATE_CODE.isin(@water_prod_enverus_states)") - .assign(proxy_data=lambda df: df[water_prod_str]) - .drop(columns=[water_prod_str]) - .rename(columns=lambda x: str(x).lower()) - .reset_index(drop=True) - ) - water_prod_df = pd.concat([water_prod_df, water_prod_imonth]) - - # Well Completions - # Conventional Gas Well Completions - conv_well_comp_imonth = (ng_data_imonth_temp[['year','year_month','STATE_CODE','LATITUDE','LONGITUDE','WELL_COUNT','HF','comp_year_month']] - .query("HF != 'Y'") - .drop(columns=["HF"]) - .rename(columns=lambda x: str(x).lower()) - .rename(columns={"well_count":"proxy_data"}) - .query(f"comp_year_month == '{year_month_str}'") # note the quotes: comp_year_month is a 'YYYY-MM' string - .drop(columns=["comp_year_month"]) - .reset_index(drop=True) - ) - conv_well_comp_df = pd.concat([conv_well_comp_df, conv_well_comp_imonth]) - - # HF Gas Well Completions - hf_well_comp_imonth = (ng_data_imonth_temp[['year','year_month','STATE_CODE','LATITUDE','LONGITUDE','WELL_COUNT','HF','comp_year_month']] - .query("HF == 'Y'") - .drop(columns=["HF"]) - .rename(columns=lambda x: str(x).lower()) - .rename(columns={"well_count":"proxy_data"}) - .query(f"comp_year_month == '{year_month_str}'") - .drop(columns=["comp_year_month"]) - .reset_index(drop=True) - ) - hf_well_comp_df = pd.concat([hf_well_comp_df, hf_well_comp_imonth]) - - # Drilled Gas Wells - drilled_well_imonth = (ng_data_imonth_temp[['year','year_month','STATE_CODE','LATITUDE','LONGITUDE','WELL_COUNT','HF','spud_year','first_prod_year']] - .rename(columns=lambda x: str(x).lower()) - .rename(columns={"well_count":"proxy_data"}) - # wells with a spud date or first production date in the current year - .query(f"spud_year == '{iyear}' | first_prod_year == '{iyear}'") - # wells with a spud_year == iyear or, if there is no spud date, first_prod_year == iyear - .query(f"spud_year == '{iyear}' | spud_year == 'NaN'") - .drop(columns=['hf', 'spud_year', 'first_prod_year']) - .reset_index(drop=True) - ) - drilled_well_df = pd.concat([drilled_well_df, drilled_well_imonth]) - - # Offshore Well Counts and Production Volumes in State Waters in the Gulf of Mexico - state_gom_offshore_states = ['AL','FL','LA','MS','TX']
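# One subtlety in the completion filters above: the year-month key must be
# quoted inside query(), otherwise pandas evaluates a bare 2012-01 as integer
# subtraction (2011) and the string comparison never matches. A minimal,
# self-contained demonstration on toy data:
#
#   import pandas as pd
#   toy = pd.DataFrame({"comp_year_month": ["2012-01", "2012-02"]})
#   toy.query("comp_year_month == '2012-01'")   # correct: matches one row
#   # toy.query("comp_year_month == 2012-01")   # wrong: compares against 2011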
State GOM Gas Well Counts - state_gom_offshore_well_count_imonth = (ng_offshore_data_imonth_temp[['year','year_month','STATE_CODE','LATITUDE','LONGITUDE','WELL_COUNT']] - .query("STATE_CODE.isin(@state_gom_offshore_states)") # offshore wells in GOM state waters only - .rename(columns=lambda x: str(x).lower()) - .rename(columns={"well_count":"proxy_data"}) - .reset_index(drop=True) - ) - state_gom_offshore_well_count_df = pd.concat([state_gom_offshore_well_count_df, state_gom_offshore_well_count_imonth]) - # Offshore State GOM Gas Well Gas Production - state_gom_offshore_well_prod_imonth = (ng_offshore_data_imonth_temp[['year','year_month','STATE_CODE','LATITUDE','LONGITUDE',gas_prod_str]] - .query("STATE_CODE.isin(@state_gom_offshore_states)") - .assign(proxy_data=lambda df: df[gas_prod_str]) - .drop(columns=[gas_prod_str]) - .rename(columns=lambda x: str(x).lower()) - .reset_index(drop=True) - ) - state_gom_offshore_well_prod_df = pd.concat([state_gom_offshore_well_prod_df, state_gom_offshore_well_prod_imonth]) - - # Calculate Relative Emissions - def calc_enverus_rel_emi(df): - df['rel_emi'] = df.groupby(["state_code", "year"])['proxy_data'].transform(lambda x: x / x.sum() if x.sum() > 0 else 0) - df = df.drop(columns='proxy_data') - return df - - # Well Counts - all_well_count_df = calc_enverus_rel_emi(all_well_count_df) - conv_well_count_df = calc_enverus_rel_emi(conv_well_count_df) - hf_well_count_df = calc_enverus_rel_emi(hf_well_count_df) - # Well-Level Production Volumes - all_well_prod_df = calc_enverus_rel_emi(all_well_prod_df) - basin_220_prod_df = calc_enverus_rel_emi(basin_220_prod_df) - basin_395_prod_df = calc_enverus_rel_emi(basin_395_prod_df) - basin_430_prod_df = calc_enverus_rel_emi(basin_430_prod_df) - basin_other_prod_df = calc_enverus_rel_emi(basin_other_prod_df) - # Water Production Volumes - water_prod_df = calc_enverus_rel_emi(water_prod_df) - # Well Completions - conv_well_comp_df = calc_enverus_rel_emi(conv_well_comp_df) - hf_well_comp_df = calc_enverus_rel_emi(hf_well_comp_df) - # Drilled Gas Wells - drilled_well_df = calc_enverus_rel_emi(drilled_well_df) - # Offshore Well Counts and Production Volumes in State Waters in the Gulf of Mexico - state_gom_offshore_well_count_df = calc_enverus_rel_emi(state_gom_offshore_well_count_df) - state_gom_offshore_well_prod_df = calc_enverus_rel_emi(state_gom_offshore_well_prod_df) - - # Format Proxy Data into Geodataframes - def enverus_df_to_gdf(df): - gdf = ( - gpd.GeoDataFrame( - df, - geometry=gpd.points_from_xy( - df["longitude"], - df["latitude"], - crs=4326 - ) - ) - .drop(columns=["latitude", "longitude"]) - .loc[:, ["year", "year_month", "state_code", "rel_emi", "geometry"]] - ) - return gdf - - # Well Counts - all_well_count_gdf = enverus_df_to_gdf(all_well_count_df) - conv_well_count_gdf = enverus_df_to_gdf(conv_well_count_df) - hf_well_count_gdf = enverus_df_to_gdf(hf_well_count_df) - # Well-Level Production Volumes - all_well_prod_gdf = enverus_df_to_gdf(all_well_prod_df) - basin_220_prod_gdf = enverus_df_to_gdf(basin_220_prod_df) - basin_395_prod_gdf = enverus_df_to_gdf(basin_395_prod_df) - basin_430_prod_gdf = enverus_df_to_gdf(basin_430_prod_df) - basin_other_prod_gdf = enverus_df_to_gdf(basin_other_prod_df) - # Water Production Volumes - water_prod_gdf = enverus_df_to_gdf(water_prod_df) - # Well Completions - conv_well_comp_gdf = enverus_df_to_gdf(conv_well_comp_df) - hf_well_comp_gdf = enverus_df_to_gdf(hf_well_comp_df) - # Drilled Gas Wells - drilled_well_gdf = enverus_df_to_gdf(drilled_well_df) - # Offshore Well Counts and Production Volumes in State Waters in the Gulf of Mexico
state_gom_offshore_well_count_gdf = enverus_df_to_gdf(state_gom_offshore_well_count_df) - state_gom_offshore_well_prod_gdf = enverus_df_to_gdf(state_gom_offshore_well_prod_df) - - # STEP 2.4: Well and Production Data (from NEI) - - # NEI data is used for well counts, gas well completion counts, - # gas well drilled counts, and gas production volumes for IL and IN. - - # NEI data is used for water production volumes for IL, IN, KS, OK, and PA - # as well as WV for years less than 2016. - - # FIPS codes for relevant states (each code starts with 2 distinct characters): - # IL: 17; IN: 18; KS: 20; OK: 40; PA: 42; WV: 54 - - fips_codes_df = pd.DataFrame({'state_code': ['IL', 'IN', 'KS', 'OK', 'PA', 'WV'], - 'fips_code': ['17', '18', '20', '40', '42', '54']}) - - # Function to get NEI textfile and shapefile data - def get_NEI_data(ghgi_year, data_year, file_name): - if data_year <= 2017: - # NEI textfile data (data_year <= 2017) (2011, 2014, 2016, 2017) - nei_textfile_name = f"CONUS_SA_FILES_{data_year}/{file_name}" - nei_textfile_path = os.path.join(nei_path, nei_textfile_name) - data_temp = pd.read_csv(nei_textfile_path, sep='\t', skiprows = 25) - data_temp = data_temp.drop(["!"], axis=1) - data_temp.columns = ['Code','FIPS','COL','ROW','Frac','Abs','FIPS_Total','FIPS_Running_Sum'] - data_temp = data_temp.astype({"FIPS": str}) - # if water production data (gas: 6832, oil: 6833) - if file_name == 'USA_6832_NOFILL.txt' or file_name == 'USA_6833_NOFILL.txt': - if data_year < 2016: - data_temp = (data_temp - # query states: IL, IN, KS, OK, PA, WV - .query("FIPS.str.startswith('17') | FIPS.str.startswith('18') | FIPS.str.startswith('20') | FIPS.str.startswith('40') | FIPS.str.startswith('42') | FIPS.str.startswith('54')") - .reset_index(drop=True) - ) - colmax = data_temp['COL'].max() - colmin = data_temp['COL'].min() - rowmax = data_temp['ROW'].max() - rowmin = data_temp['ROW'].min() - else: - data_temp = (data_temp - # query states: IL, IN, KS, OK, PA - .query("FIPS.str.startswith('17') | FIPS.str.startswith('18') | FIPS.str.startswith('20') | FIPS.str.startswith('40') | FIPS.str.startswith('42')") - .reset_index(drop=True) - ) - colmax = data_temp['COL'].max() - colmin = data_temp['COL'].min() - rowmax = data_temp['ROW'].max() - rowmin = data_temp['ROW'].min() - # non-water production proxies (IL, IN) - else: - data_temp = (data_temp - # query states: IL, IN - .query("FIPS.str.startswith('17') | FIPS.str.startswith('18')") - .reset_index(drop=True) - ) - colmax = data_temp['COL'].max() - colmin = data_temp['COL'].min() - rowmax = data_temp['ROW'].max() - rowmin = data_temp['ROW'].min() - # NEI reference grid shapefile with lat/lon locations - nei_reference_grid_path = os.path.join(nei_path, "NEI_Reference_Grid_LCC_to_WGS84_latlon.shp") - nei_reference_grid = (gpd.read_file(nei_reference_grid_path) - .to_crs(4326)) - nei_reference_grid = (nei_reference_grid - .assign(cellid_column = nei_reference_grid.cellid.astype(str).str[0:4].astype(int)) - .assign(cellid_row = nei_reference_grid.cellid.astype(str).str[5:].astype(int)) - .query(f"cellid_column <= {colmax} & cellid_column >= {colmin}") - .query(f"cellid_row <= {rowmax} & cellid_row >= {rowmin}") - .reset_index(drop=True) - ) - # Match lat/lon locations from reference grid to nei data - for idx in np.arange(0,len(data_temp)): - # Add in lat/lon - icol = data_temp['COL'][idx] - irow = data_temp['ROW'][idx] - match = np.where((icol == nei_reference_grid.loc[:,'cellid_column']) & (irow == nei_reference_grid.loc[:,'cellid_row']))[0][0] - match = 
int(match) - # data_temp.loc[idx,'Lat'] = nei_reference_grid.loc[match, 'Latitude'] - # data_temp.loc[idx,'Lon'] = nei_reference_grid.loc[match, 'Longitude'] - data_temp.loc[idx,'geometry'] = nei_reference_grid.loc[match, 'geometry'] - # Add in state_code - ifips = data_temp.loc[idx,'FIPS'][0:2] - data_temp.loc[idx,'state_code'] = fips_codes_df.loc[np.where(ifips == fips_codes_df.loc[:, 'fips_code'])[0][0],'state_code'] - data_temp = data_temp[['state_code', 'Abs', 'geometry']] - data_temp = data_temp.rename(columns={'Abs':'activity_data'}) - - else: - # NEI shapefile data (data_year > 2017) (2018, 2019, 2021, 2022) - state_geometries = state_gdf[["state_code","geometry"]] - nei_file_name = f"CONUS_SA_FILES_{data_year}" - nei_file_path = os.path.join(nei_path, nei_file_name) - data_temp = gpd.read_file(nei_file_path, layer=file_name) - data_temp = data_temp.to_crs(4326) - data_temp = gpd.tools.sjoin(data_temp, state_gdf, how="left") - - # water production data (IL, IN, KS, OK, PA) - if file_name == 'PRODUCED_WATER_GAS' or file_name == '_6832' or file_name == 'ProducedWaterGasWells': - states_to_query = ['IL', 'IN', 'KS', 'OK', 'PA'] - # non-water production proxies (IL, IN) - else: - states_to_query = ['IL', 'IN'] - - # query relevant states - data_temp = data_temp.query('state_code.isin(@states_to_query)') - - # grab activity data depending on column name (changes by year) - if data_year == 2018 or data_year == 2019 or data_year == 2020: - data_temp = data_temp[['state_code', 'ACTIVITY', 'geometry']] - data_temp = data_temp.rename(columns={'ACTIVITY':'activity_data'}) - if data_year == 2021: - data_temp = data_temp[['state_code', 'GRID_AC', 'geometry']] - data_temp = data_temp.rename(columns={'GRID_AC':'activity_data'}) - if data_year == 2022: - data_temp = data_temp[['state_code', 'GRID_ACTIV', 'geometry']] - data_temp = data_temp.rename(columns={'GRID_ACTIV':'activity_data'}) - - # convert activity data to relative emissions (idata / sum(state data)) - data_temp['rel_emi'] = data_temp.groupby(["state_code"])['activity_data'].transform(lambda x: x / x.sum() if x.sum() > 0 else 0) - monthly_data_temp = data_temp.copy() - monthly_data_temp['rel_emi'] = monthly_data_temp['rel_emi'] * 1/12 - monthly_data_temp = monthly_data_temp.drop(columns='activity_data') - - # convert proxy data to monthly (assume 1/12 of annual proxy is assigned to each month) - nei_proxy_data = pd.DataFrame() - for imonth in range(1,13): - imonth_str = f"{imonth:02}" # convert to 2-digit months - data_temp_imonth = monthly_data_temp.copy() - data_temp_imonth = data_temp_imonth.assign(year_month=str(ghgi_year)+'-'+imonth_str) - nei_proxy_data = pd.concat([nei_proxy_data,data_temp_imonth]) - nei_proxy_data = nei_proxy_data.assign(year=ghgi_year) - nei_proxy_data = (nei_proxy_data[['year', 'year_month', 'state_code', 'rel_emi', 'geometry']] - .reset_index(drop=True) - ) - return nei_proxy_data - - # NEI data year assignments - # All years use the data affiliated with their year except the following exceptions: - # 2012: use 2011 data - # 2013: use 2014 data - # 2015: use 2014 data - # 2016: use 2017 data - nei_data_years = pd.DataFrame({'year': [2012, - 2013, - 2014, - 2015, - 2016, - 2017, - 2018, - 2019, - 2020, - 2021, - 2022], - 'nei_data': [2011, - 2014, - 2014, - 2014, - 2017, - 2017, - 2018, - 2019, - 2020, - 2021, - 2022]}) - - # NEI Data Dataframes: - # Well Counts - nei_all_well_count_df = pd.DataFrame() # Active gas well (conventional + HF) counts in a given month - nei_conv_well_count_df = pd.DataFrame() # 
Active conventional gas well counts in a given month - nei_hf_well_count_df = pd.DataFrame() # Active HF gas well counts in a given month - # Well-Level Production Volumes - nei_all_well_prod_df = pd.DataFrame() # Active gas well (conventional + HF) gas production in a given month - nei_basin_other_prod_df = pd.DataFrame() # Gas well gas production in Other Basins in a given month - # Water Production Volumes - nei_water_prod_df = pd.DataFrame() - # Well Completions - nei_conv_well_comp_df = pd.DataFrame() # Conventional gas well completions - nei_hf_well_comp_df = pd.DataFrame() # HF gas well completions - # Drilled Gas Wells - nei_drilled_well_df = pd.DataFrame() # Gas wells drilled - - # NEI text file and shapefile names: - # Well Counts - well_count_file_names = pd.DataFrame({ - 'data_year': [2011, 2014, 2017, - 2018, 2019, 2020, 2021, 2022], - 'file_name': ['USA_698_NOFILL.txt', 'USA_698_NOFILL.txt', 'USA_698_NOFILL.txt', - 'GAS_WELLS', 'GAS_WELLS', 'GAS_WELL', '_698', 'GasWells'], - }) - # Well-Level Production Volumes - gas_prod_file_names = pd.DataFrame({ - 'data_year': [2011, 2014, 2017, - 2018, 2019, 2020, 2021, 2022], - 'file_name': ['USA_696_NOFILL.txt', 'USA_696_NOFILL.txt', 'USA_696_NOFILL.txt', - 'GAS_PRODUCTION', 'GAS_PRODUCTION', 'GAS_PRODUCTION', '_696', 'GasProduction'], - }) - # Water Production Volumes - water_prod_file_names = pd.DataFrame({ - 'data_year': [2011, 2014, 2017, - 2018, 2019, 2020, 2021, 2022], - 'file_name': ['USA_6832_NOFILL.txt', 'USA_6832_NOFILL.txt', 'USA_6832_NOFILL.txt', - 'PRODUCED_WATER_GAS', 'PRODUCED_WATER_GAS', 'PRODUCED_WATER_GAS', '_6832', 'ProducedWaterGasWells'], - }) - # Well Completions - comp_count_file_names = pd.DataFrame({ - 'data_year': [2011, 2014, 2017, - 2018, 2019, 2020, 2021, 2022], - 'file_name': ['USA_678_NOFILL.txt', 'USA_678_NOFILL.txt', 'USA_678_NOFILL.txt', - 'COMPLETIONS_GAS', 'COMPLETIONS_GAS', 'COMPLETIONS_GAS', '_678', 'GasWellCompletions'], - }) - # Drilled Gas Wells - spud_count_file_names = pd.DataFrame({ - 'data_year': [2011, 2014, 2017, - 2018, 2019, 2020, 2021, 2022], - 'file_name': ['USA_671_NOFILL.txt', 'USA_671_NOFILL.txt', 'USA_671_NOFILL.txt', - 'SPUD_GAS', 'SPUD_GAS', 'SPUD_GAS', '_671', 'SpudCountGasWells'], - }) - - - def get_nei_file_name(nei_data_year, nei_file_names): - nei_file_name = nei_file_names[nei_file_names['data_year'] == nei_data_year]['file_name'].values[0] - return nei_file_name - - - for iyear in years: - nei_data_year = nei_data_years[nei_data_years['year'] == iyear]['nei_data'].values[0] - # Well Count - ifile_name = get_nei_file_name(nei_data_year, well_count_file_names) - nei_all_well_count_iyear = get_NEI_data(iyear, nei_data_year, ifile_name) - nei_all_well_count_df = pd.concat([nei_all_well_count_df, nei_all_well_count_iyear]) - # Gas Production - ifile_name = get_nei_file_name(nei_data_year, gas_prod_file_names) - nei_all_well_prod_iyear = get_NEI_data(iyear, nei_data_year, ifile_name) - nei_all_well_prod_df = pd.concat([nei_all_well_prod_df, nei_all_well_prod_iyear]) - # Water Production - ifile_name = get_nei_file_name(nei_data_year, water_prod_file_names) - nei_water_prod_iyear = get_NEI_data(iyear, nei_data_year, ifile_name) - nei_water_prod_df = pd.concat([nei_water_prod_df, nei_water_prod_iyear]) - # Completions Count - ifile_name = get_nei_file_name(nei_data_year, comp_count_file_names) - nei_conv_well_comp_iyear = get_NEI_data(iyear, nei_data_year, ifile_name) - nei_conv_well_comp_df = pd.concat([nei_conv_well_comp_df, nei_conv_well_comp_iyear]) - # Spud Count
ifile_name = get_nei_file_name(nei_data_year, spud_count_file_names) - nei_drilled_well_iyear = get_NEI_data(iyear, nei_data_year, ifile_name) - nei_drilled_well_df = pd.concat([nei_drilled_well_df, nei_drilled_well_iyear]) - - # Copy Data to Other Dataframes - nei_conv_well_count_df = nei_all_well_count_df.copy() - nei_hf_well_count_df = nei_all_well_count_df.copy() - nei_basin_other_prod_df = nei_all_well_prod_df.copy() - nei_hf_well_comp_df = nei_conv_well_comp_df.copy() - - # Add NEI Data to Enverus Data - # Well Counts - all_well_count_gdf = pd.concat([all_well_count_gdf, nei_all_well_count_df]).reset_index(drop=True) - conv_well_count_gdf = pd.concat([conv_well_count_gdf, nei_conv_well_count_df]).reset_index(drop=True) - hf_well_count_gdf = pd.concat([hf_well_count_gdf, nei_hf_well_count_df]).reset_index(drop=True) - # Well-Level Production Volumes - all_well_prod_gdf = pd.concat([all_well_prod_gdf, nei_all_well_prod_df]).reset_index(drop=True) - basin_220_prod_gdf = basin_220_prod_df.reset_index(drop=True) # No IL/IN data to add - basin_395_prod_gdf = basin_395_prod_df.reset_index(drop=True) # No IL/IN data to add - basin_430_prod_gdf = basin_430_prod_df.reset_index(drop=True) # No IL/IN data to add - basin_other_prod_gdf = pd.concat([basin_other_prod_gdf, nei_basin_other_prod_df]).reset_index(drop=True) - # Water Production Volumes - water_prod_gdf = pd.concat([water_prod_gdf, nei_water_prod_df]).reset_index(drop=True) - # Well Completions - conv_well_comp_gdf = pd.concat([conv_well_comp_gdf, nei_conv_well_comp_df]).reset_index(drop=True) - hf_well_comp_gdf = pd.concat([hf_well_comp_gdf, nei_hf_well_comp_df]).reset_index(drop=True) - # Drilled Gas Wells - drilled_well_gdf = pd.concat([drilled_well_gdf, nei_drilled_well_df]).reset_index(drop=True) - # Offshore Well Counts and Production Volumes in State Waters in the Gulf of Mexico - state_gom_offshore_well_count_gdf = state_gom_offshore_well_count_df.reset_index(drop=True) # No IL/IN data to add - state_gom_offshore_well_prod_gdf = state_gom_offshore_well_prod_df.reset_index(drop=True) # No IL/IN data to add - - # Output Proxy Parquet Files - all_well_count_gdf.to_parquet(all_well_count_output_path) - conv_well_count_gdf.to_parquet(conv_well_count_output_path) - hf_well_count_gdf.to_parquet(hf_well_count_output_path) - all_well_prod_gdf.to_parquet(all_well_prod_output_path) - basin_220_prod_gdf.to_parquet(basin_220_prod_output_path) - basin_395_prod_gdf.to_parquet(basin_395_prod_output_path) - basin_430_prod_gdf.to_parquet(basin_430_prod_output_path) - basin_other_prod_gdf.to_parquet(basin_other_prod_output_path) - water_prod_gdf.to_parquet(water_prod_output_path) - conv_well_comp_gdf.to_parquet(conv_well_comp_output_path) - hf_well_comp_gdf.to_parquet(hf_well_comp_output_path) - drilled_well_gdf.to_parquet(drilled_well_output_path) - state_gom_offshore_well_count_gdf.to_parquet(state_gom_offshore_well_count_output_path) - state_gom_offshore_well_prod_gdf.to_parquet(state_gom_offshore_well_prod_output_path) - return None - - - - - diff --git a/gch4i/proxy_processing/task_ng_water_prod_proxy.py b/gch4i/proxy_processing/task_ng_water_prod_proxy.py new file mode 100644 index 0000000..9f4ad24 --- /dev/null +++ b/gch4i/proxy_processing/task_ng_water_prod_proxy.py @@ -0,0 +1,194 @@ +# %% +from pathlib import Path +import os +from typing import Annotated +from zipfile import ZipFile +import calendar +import datetime + +from pyarrow import parquet +import pandas as pd +import osgeo +import geopandas as gpd +import numpy as np +import 
seaborn as sns +import shapefile as shp +from pytask import Product, task, mark + +from gch4i.config import ( + V3_DATA_PATH, + proxy_data_dir_path, + global_data_dir_path, + sector_data_dir_path, + max_year, + min_year, + years, +) + +from gch4i.utils import us_state_to_abbrev +from gch4i.proxy_processing.ng_oil_production_utils import ( + calc_enverus_rel_emi, + enverus_df_to_gdf, + nei_data_years, + get_nei_file_name, + ng_water_prod_file_names, + get_raw_NEI_data, +) + +# %% +@mark.persist +@task(id="ng_water_prod_proxy") +def task_get_ng_water_prod_proxy_data( + state_path: Path = global_data_dir_path / "tl_2020_us_state.zip", + enverus_production_path: Path = sector_data_dir_path / "enverus/production", + intermediate_outputs_path: Path = enverus_production_path / "intermediate_outputs", + nei_path: Path = sector_data_dir_path / "nei_og", + water_prod_output_path: Annotated[Path, Product] = proxy_data_dir_path / "ng_water_prod_proxy.parquet", + ): + """ + Data come from Enverus, both Drilling Info (DI) and Prism. + Two datasets are used because Prism does not include all states; the remaining + states, or those with better DI coverage, are taken from DI. + + DI: KS, MD, MI, MO, OK, TN + + Prism: AK, AL, AR, AZ, CA, CAO (California Offshore), CO, FL, KY, LA, MS, MT, ND, + NE, NGOM (federal offshore waters in the Gulf of Mexico), NM, NV, NY, OH, OR, PA, + SD, TX, UT, VA, WV, WY + + States with no Enverus Data: CT, DE, DC, GA, HI, ID, IL*, IN*, IA, ME, MA, MN, NH, + NJ, NC, RI, SC, VT, WA, WI, US territories. These states are assumed to have no oil + and gas production, with an exception for IL and IN. + + *IL and IN do not report to Enverus, but do have oil and gas production. Production + data is taken from the Energy Information Administration (EIA). + + """ + + # Load in State ANSI data + state_gdf = ( + gpd.read_file(state_path) + .loc[:, ["NAME", "STATEFP", "STUSPS", "geometry"]] + .rename(columns=str.lower) + .rename(columns={"stusps": "state_code", "name": "state_name"}) + .astype({"statefp": int}) + # get only lower 48 + DC + .query("(statefp < 60) & (statefp != 2) & (statefp != 15)") + .reset_index(drop=True) + .to_crs(4326) + )
+ + # Make annual gridded arrays (maps) of well data (a well will be counted every month if there is any production that year) + # Includes NA gas wells and production onshore in the CONUS region + # Source emissions are related to the presence of a well and its production status (no emission if no production) + # Details: ERG does not include a well in the national count if there is no (cumulative) oil or gas production from that well. + # Wells are not considered active for a given year if there is no production data that year. + # This may cause wells that are completed but not yet producing to be dropped from the national count. + # ERG has developed their own logic to determine if a well is an HF well or not, and that result is included in the + # HF variable in this dataset. This method does not rely on the Enverus well 'Producing Status'. + # Well type (e.g., non-associated gas well) is determined based on the annual production GOR at that well (CUM_GAS / CUM_OIL), + # but the presence of a well will only be included in maps in months where monthly gas prod > 0 + + # Proxy Data Dataframes: + water_prod_df = pd.DataFrame() + + ## Enverus DI and Prism Data: + # Read in and query the formatted and corrected Enverus data to create dictionaries of + # proxy data (Enverus data is from task_enverus_di_prism_data_processing.py) + for iyear in years: + enverus_file_name_iyear = f"formatted_raw_enverus_tempoutput_{iyear}.csv" + enverus_file_path_iyear = os.path.join(intermediate_outputs_path, enverus_file_name_iyear) + ng_data_temp = (pd.read_csv(enverus_file_path_iyear, dtype={3:'str', 'spud_year': str, 'first_prod_year': str}) + .query("STATE_CODE.isin(@state_gdf['state_code'])") + .query("OFFSHORE == 'N'") + .query("CUM_GAS > 0") + .assign(gas_to_oil_ratio=lambda df: df['CUM_GAS']/df['CUM_OIL']) + .assign(year=str(iyear)) + .replace(np.inf, 0) + .astype({"spud_year": str, "first_prod_year": str}) + .query("gas_to_oil_ratio > 100 | GOR_QUAL == 'Gas only'") + ) + + # Include wells in the map only for months where there is gas production (emissions ~ when production is occurring) + for imonth in range(1,13): + imonth_str = f"{imonth:02}" # convert to 2-digit months + year_month_str = str(iyear)+'-'+imonth_str + gas_prod_str = 'GASPROD_'+imonth_str + water_prod_str = 'WATERPROD_'+imonth_str + # Onshore data for imonth + ng_data_imonth_temp = (ng_data_temp + .query(f"{gas_prod_str} > 0") + .assign(year_month=str(iyear)+'-'+imonth_str) + ) + ng_data_imonth_temp = (ng_data_imonth_temp[[ + 'year', 'year_month','STATE_CODE','AAPG_CODE_ERG','LATITUDE','LONGITUDE', + 'HF','WELL_COUNT',gas_prod_str,water_prod_str, + 'comp_year_month','spud_year','first_prod_year']] + ) + # Water Production + # Data source by state is defined in the Enverus DrillingInfo Processing - Produced + # Water_2023-11-14_forGridding.xlsx file.
+ if iyear < 2016: # WV uses NEI data + water_prod_enverus_states = ['AK','AL','AR','AZ','CA','CO','FL','LA', + 'MI','MO','MS','MT','ND','NE','NM','NV', + 'NY','OH','SD','TX','UT','VA','WY' + ] + # States using NEI for reference: ['IL','IN','KS','OK','PA','WV'] + else: # 2016 and beyond; WV uses Enverus data + water_prod_enverus_states = ['AK','AL','AR','AZ','CA','CO','FL','LA', + 'MI','MO','MS','MT','ND','NE','NM','NV', + 'NY','OH','SD','TX','UT','VA','WY','WV' + ] # WV uses Enverus + # States using NEI for reference: ['IL','IN','KS','OK','PA'] + # Enverus water production for applicable states (NEI water production will + # be added in the NEI section of the code below) + water_prod_imonth = (ng_data_imonth_temp[['year','year_month','STATE_CODE','LATITUDE','LONGITUDE',water_prod_str]] + .query("STATE_CODE.isin(@water_prod_enverus_states)") + .assign(proxy_data=lambda df: df[water_prod_str]) + .drop(columns=[water_prod_str]) + .rename(columns=lambda x: str(x).lower()) + .reset_index(drop=True) + ) + water_prod_df = pd.concat([water_prod_df,water_prod_imonth]) + + # Delete unused temp data + del ng_data_temp + del ng_data_imonth_temp + del water_prod_imonth + + # Calculate relative emissions and convert to a geodataframe + water_prod_df = calc_enverus_rel_emi(water_prod_df) + water_prod_df = enverus_df_to_gdf(water_prod_df) + + # NEI Data: + nei_df = pd.DataFrame() + + for iyear in years: + nei_data_year = nei_data_years[nei_data_years['year'] == iyear]['nei_data'].values[0] + # Water Production + ifile_name = get_nei_file_name(nei_data_year, ng_water_prod_file_names) + nei_iyear = get_raw_NEI_data(iyear, nei_data_year, ifile_name) + nei_df = pd.concat([nei_df, nei_iyear]) + + # Convert NEI Data to a GDF and each polygon to a centroid point + nei_df = gpd.GeoDataFrame(nei_df, crs=4326) + nei_df = nei_df.to_crs(3857) # projected CRS for centroid calculation + nei_df.loc[:, 'geometry'] = nei_df.loc[:, 'geometry'].centroid + nei_df = nei_df.to_crs(4326) + + # Add NEI Data to Enverus Data + water_prod_df = pd.concat([water_prod_df, nei_df]).reset_index(drop=True) + + # Delete unused temp data + del nei_iyear + del nei_df + + # Check that relative emissions sum to 1.0 for each state/year combination + sums = water_prod_df.groupby(["state_code", "year"])["rel_emi"].sum() # get sums to check normalization + assert np.isclose(sums, 1.0, atol=1e-8).all(), f"Relative emissions do not sum to 1 for each year and state; {sums}" # assert that the sums are close to 1 + + # Output Proxy Parquet Files + water_prod_df = water_prod_df.astype({'year':str}) + water_prod_df.to_parquet(water_prod_output_path) + + return None
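For reference, calc_enverus_rel_emi and enverus_df_to_gdf, imported above from ng_oil_production_utils, are assumed to mirror the helpers removed from the monolithic script earlier in this patch; a minimal sketch:

import geopandas as gpd
import pandas as pd

def calc_enverus_rel_emi(df: pd.DataFrame) -> pd.DataFrame:
    # Normalize proxy_data within each state/year so the values sum to 1
    # (groups with zero total are left at 0)
    df["rel_emi"] = df.groupby(["state_code", "year"])["proxy_data"].transform(
        lambda x: x / x.sum() if x.sum() > 0 else 0
    )
    return df.drop(columns="proxy_data")

def enverus_df_to_gdf(df: pd.DataFrame) -> gpd.GeoDataFrame:
    # Convert latitude/longitude columns into EPSG:4326 point geometries
    return (
        gpd.GeoDataFrame(df, geometry=gpd.points_from_xy(df["longitude"], df["latitude"], crs=4326))
        .drop(columns=["latitude", "longitude"])
        .loc[:, ["year", "year_month", "state_code", "rel_emi", "geometry"]]
    )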
diff --git a/gch4i/proxy_processing/task_ng_well_blowout_proxy.py b/gch4i/proxy_processing/task_ng_well_blowout_proxy.py new file mode 100644 index 0000000..4d7604d --- /dev/null +++ b/gch4i/proxy_processing/task_ng_well_blowout_proxy.py @@ -0,0 +1,75 @@ +# %% +import calendar +import datetime +from pathlib import Path +from typing import Annotated +from zipfile import ZipFile + +import geopandas as gpd +import numpy as np +import osgeo +import pandas as pd +import seaborn as sns +from pyarrow import parquet +from pytask import Product, mark, task + +from gch4i.config import ( + V3_DATA_PATH, + ghgi_data_dir_path, + global_data_dir_path, + max_year, + min_year, + proxy_data_dir_path, +) +from gch4i.utils import name_formatter + +# %% + + +@mark.persist +@task(id="ng_well_blowout_proxy") +def task_get_ng_well_blowout_proxy_data( + output_path: Annotated[Path, Product] = (proxy_data_dir_path / "ng_well_blowout_proxy.parquet"), +): + """ + Three well blowouts occurred over 2012-2022. Their locations and emissions are + provided directly by the GHGI sector leads and manually coded into the proxy. + + 1. LA in 2019 + state_code: LA; year: 2019; emi: 49 kt; lat: 32.1; lon: -93.4 + 2. OH in 2018 + state_code: OH; year: 2018; emi: 60 kt; lat: 39.864; lon: -80.861 + 3. TX in 2019 + state_code: TX; year: 2019; emi: 4.8 kt; lat: 28.9; lon: -97.6 + + """ + + well_blowout_df = pd.DataFrame( + {'state_code': ['LA', 'OH', 'TX'], + 'year': [2019, 2018, 2019], + 'rel_emi': [1.0, 1.0, 1.0], # assign 100% of the emissions to each state/year combination + 'lat': [32.1, 39.864, 28.9], + 'lon': [-93.4, -80.861, -97.6], + }) + + well_blowout_gdf = (gpd.GeoDataFrame( + well_blowout_df, + geometry=gpd.points_from_xy( + well_blowout_df["lon"], + well_blowout_df["lat"], + crs=4326 + ) + ) + .drop(columns=["lat", "lon"]) + .loc[:, ["year", "state_code", "rel_emi", "geometry"]] + ) + + # Check that relative emissions sum to 1.0 for each state/year combination + sums = well_blowout_gdf.groupby(["state_code", "year"])["rel_emi"].sum() # get sums to check normalization + assert np.isclose(sums, 1.0, atol=1e-8).all(), f"Relative emissions do not sum to 1 for each year and state; {sums}" # assert that the sums are close to 1 + + # Output Proxy Parquet Files + well_blowout_gdf = well_blowout_gdf.astype({'year':str}) + well_blowout_gdf.to_parquet(output_path) + + return None
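A note on the CRS handling used by the NEI sections of the surrounding proxy tasks: polygons are projected to EPSG:3857 before taking centroids and then converted back to EPSG:4326, because centroids computed directly in a geographic CRS trigger warnings and can be slightly distorted. A small self-contained sketch of the pattern (toy polygon, made-up coordinates):

import geopandas as gpd
from shapely.geometry import Polygon

cells = gpd.GeoDataFrame(
    geometry=[Polygon([(-88.0, 40.0), (-87.9, 40.0), (-87.9, 40.1), (-88.0, 40.1)])],
    crs=4326,
)
pts = cells.to_crs(3857)                  # project to a planar CRS first
pts["geometry"] = pts.geometry.centroid   # centroid is well-defined here
pts = pts.to_crs(4326)                    # back to lat/lon for the proxy output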
diff --git a/gch4i/proxy_processing/task_oil_all_well_count_proxy.py b/gch4i/proxy_processing/task_oil_all_well_count_proxy.py new file mode 100644 index 0000000..299afc9 --- /dev/null +++ b/gch4i/proxy_processing/task_oil_all_well_count_proxy.py @@ -0,0 +1,176 @@ +# %% +from pathlib import Path +import os +from typing import Annotated +from zipfile import ZipFile +import calendar +import datetime + +from pyarrow import parquet +import pandas as pd +import osgeo +import geopandas as gpd +import numpy as np +import seaborn as sns +import shapefile as shp +from pytask import Product, task, mark + +from gch4i.config import ( + V3_DATA_PATH, + proxy_data_dir_path, + global_data_dir_path, + sector_data_dir_path, + max_year, + min_year, + years, +) + +from gch4i.utils import us_state_to_abbrev +from gch4i.proxy_processing.ng_oil_production_utils import ( + calc_enverus_rel_emi, + enverus_df_to_gdf, + nei_data_years, + get_nei_file_name, + oil_well_count_file_names, + get_raw_NEI_data, +) + +# %% +@mark.persist +@task(id="oil_all_well_count_proxy") +def task_get_oil_all_well_count_proxy_data( + state_path: Path = global_data_dir_path / "tl_2020_us_state.zip", + enverus_production_path: Path = sector_data_dir_path / "enverus/production", + intermediate_outputs_path: Path = enverus_production_path / "intermediate_outputs", + nei_path: Path = sector_data_dir_path / "nei_og", + all_well_count_output_path: Annotated[Path, Product] = proxy_data_dir_path / "oil_all_well_count_proxy.parquet", + ): + """ + Data come from Enverus, both Drilling Info (DI) and Prism. + Two datasets are used because Prism does not include all states; the remaining + states, or those with better DI coverage, are taken from DI. + + DI: KS, MD, MI, MO, OK, TN + + Prism: AK, AL, AR, AZ, CA, CAO (California Offshore), CO, FL, KY, LA, MS, MT, ND, + NE, NGOM (federal offshore waters in the Gulf of Mexico), NM, NV, NY, OH, OR, PA, + SD, TX, UT, VA, WV, WY + + States with no Enverus Data: CT, DE, DC, GA, HI, ID, IL*, IN*, IA, ME, MA, MN, NH, + NJ, NC, RI, SC, VT, WA, WI, US territories. These states are assumed to have no oil + and gas production, with an exception for IL and IN. + + *IL and IN do not report to Enverus, but do have oil and gas production. Production + data is taken from the Energy Information Administration (EIA). + + """ + + # Load in State ANSI data + state_gdf = ( + gpd.read_file(state_path) + .loc[:, ["NAME", "STATEFP", "STUSPS", "geometry"]] + .rename(columns=str.lower) + .rename(columns={"stusps": "state_code", "name": "state_name"}) + .astype({"statefp": int}) + # get only lower 48 + DC + .query("(statefp < 60) & (statefp != 2) & (statefp != 15)") + .reset_index(drop=True) + .to_crs(4326) + ) + + # Make annual gridded arrays (maps) of well data (a well will be counted every month if there is any production that year) + # Includes NA wells and production onshore in the CONUS region + # Source emissions are related to the presence of a well and its production status (no emission if no production) + # Details: ERG does not include a well in the national count if there is no (cumulative) oil or gas production from that well. + # Wells are not considered active for a given year if there is no production data that year. + # This may cause wells that are completed but not yet producing to be dropped from the national count. + # ERG has developed their own logic to determine if a well is an HF well or not, and that result is included in the + # HF variable in this dataset. This method does not rely on the Enverus well 'Producing Status'. + # Well type (e.g., non-associated oil well) is determined based on the annual production GOR at that well (CUM_GAS / CUM_OIL), + # but the presence of a well will only be included in maps in months where monthly oil or ng prod > 0 + + # Proxy Data Dataframes: + all_well_count_df = pd.DataFrame() # Active well (conventional + HF) counts in a given month + + ## Enverus DI and Prism Data: + # Read in and query the formatted and corrected Enverus data to create dictionaries of + # proxy data (Enverus data is from task_enverus_di_prism_data_processing.py) + for iyear in years: + enverus_file_name_iyear = f"formatted_raw_enverus_tempoutput_{iyear}.csv" + enverus_file_path_iyear = os.path.join(intermediate_outputs_path, enverus_file_name_iyear) + oil_data_temp = (pd.read_csv(enverus_file_path_iyear, dtype={3:'str', 'spud_year': str, 'first_prod_year': str}) + .query("STATE_CODE.isin(@state_gdf['state_code'])") + .query("OFFSHORE == 'N'") + .query("CUM_OIL > 0") + .assign(gas_to_oil_ratio=lambda df: df['CUM_GAS']/df['CUM_OIL']) + .assign(year=str(iyear)) + .replace(np.inf, 0) + .astype({"spud_year": str, "first_prod_year": str}) + .query("gas_to_oil_ratio <= 100") + .query("GOR_QUAL == 'Liq only' | GOR_QUAL == 'Liq+Gas'") + ) + + # Include wells in the map only for months where there is oil production (emissions ~ when production is occurring) + for imonth in range(1,13): + imonth_str = f"{imonth:02}" # convert to 2-digit months + year_month_str = str(iyear)+'-'+imonth_str + oil_prod_str = 'OILPROD_'+imonth_str + # Onshore data for imonth + oil_data_imonth_temp = (oil_data_temp + .query(f"{oil_prod_str} > 0") + .assign(year_month=str(iyear)+'-'+imonth_str) + ) + oil_data_imonth_temp = (oil_data_imonth_temp[[ + 'year', 'year_month','STATE_CODE','AAPG_CODE_ERG','LATITUDE','LONGITUDE', + 'HF','WELL_COUNT',oil_prod_str, + 'comp_year_month','spud_year','first_prod_year']] + ) + # All Oil Well Count + all_well_count_imonth = (oil_data_imonth_temp[['year','year_month','STATE_CODE','LATITUDE','LONGITUDE','WELL_COUNT']] + .rename(columns=lambda x:
str(x).lower()) + .rename(columns={"well_count":"proxy_data"}) + .reset_index(drop=True) + ) + all_well_count_df = pd.concat([all_well_count_df,all_well_count_imonth]) + + # Delete unused temp data + del oil_data_temp + del oil_data_imonth_temp + del all_well_count_imonth + + # Calculate relative emissions and convert to a geodataframe + all_well_count_df = calc_enverus_rel_emi(all_well_count_df) + all_well_count_df = enverus_df_to_gdf(all_well_count_df) + + # NEI Data: + nei_df = pd.DataFrame() + + for iyear in years: + nei_data_year = nei_data_years[nei_data_years['year'] == iyear]['nei_data'].values[0] + # Well Count + ifile_name = get_nei_file_name(nei_data_year, oil_well_count_file_names) + nei_iyear = get_raw_NEI_data(iyear, nei_data_year, ifile_name) + nei_df = pd.concat([nei_df, nei_iyear]) + + # Convert NEI Data to GDF and polygon to centroid point + nei_df = gpd.GeoDataFrame(nei_df, crs=4326) + nei_df = nei_df.to_crs(3857) # projected CRS for centroid calculation + nei_df.loc[:, 'geometry'] = nei_df.loc[:, 'geometry'].centroid + nei_df = nei_df.to_crs(4326) + + # Add NEI Data to Enverus Data + all_well_count_df = pd.concat([all_well_count_df, nei_df]).reset_index(drop=True) + + # Delete unused temp data + del nei_iyear + del nei_df + + # Check that relative emissions sum to 1.0 each state/year combination + sums = all_well_count_df.groupby(["state_code", "year"])["rel_emi"].sum() # get sums to check normalization + assert np.isclose(sums, 1.0, atol=1e-8).all(), f"Relative emissions do not sum to 1 for each year and state; {sums}" # assert that the sums are close to 1 + + # Output Proxy Parquet Files + all_well_count_df = all_well_count_df.astype({'year':str}) + all_well_count_df.to_parquet(all_well_count_output_path) + + return None diff --git a/gch4i/proxy_processing/task_oil_all_well_prod_proxy.py b/gch4i/proxy_processing/task_oil_all_well_prod_proxy.py new file mode 100644 index 0000000..bee4145 --- /dev/null +++ b/gch4i/proxy_processing/task_oil_all_well_prod_proxy.py @@ -0,0 +1,177 @@ +# %% +from pathlib import Path +import os +from typing import Annotated +from zipfile import ZipFile +import calendar +import datetime + +from pyarrow import parquet +import pandas as pd +import osgeo +import geopandas as gpd +import numpy as np +import seaborn as sns +import shapefile as shp +from pytask import Product, task, mark + +from gch4i.config import ( + V3_DATA_PATH, + proxy_data_dir_path, + global_data_dir_path, + sector_data_dir_path, + max_year, + min_year, + years, +) + +from gch4i.utils import us_state_to_abbrev +from gch4i.proxy_processing.ng_oil_production_utils import ( + calc_enverus_rel_emi, + enverus_df_to_gdf, + nei_data_years, + get_nei_file_name, + oil_oil_prod_file_names, + get_raw_NEI_data, +) + +# %% +@mark.persist +@task(id="oil_all_well_prod_proxy") +def task_get_oil_all_well_prod_proxy_data( + state_path: Path = global_data_dir_path / "tl_2020_us_state.zip", + enverus_production_path: Path = sector_data_dir_path / "enverus/production", + intermediate_outputs_path: Path = enverus_production_path / "intermediate_outputs", + nei_path: Path = sector_data_dir_path / "nei_og", + all_well_prod_output_path: Annotated[Path, Product] = proxy_data_dir_path / "oil_all_well_prod_proxy.parquet", + ): + """ + Data come from Enverus, both Drilling Info and Prism + The reason 2 datasets are used is because Prism does not include all states + So remaining states, or those with more DI coverage are taken from DI + + DI: KS, MD, MI, MO, OK, TN + + Prism: AK, AL, AR, AZ, CA, CAO 
(California Offshore), CO, FL, KY, LA, MS, MT, ND, + NE, NGOM (federal offshore waters in the Gulf of Mexico), NM, NV, NY, OH, OR, PA, + SD, TX, UT, VA, WV, WY + + States with no Enverus Data: CT, DE, DC, GA, HI, ID, IL*, IN*, IA, ME, MA, MN, NH, + NJ, NC, RI, SC, VT, WA, WI, US territories. These states are assumed to have no oil + and gas production with an exception for IL and IN. + + *IL and IN do not report to Enverus, but do have oil and gas production. Production + data is taken from the Energy Information Administration (EIA). + + """ + + # Load in State ANSI data + state_gdf = ( + gpd.read_file(state_path) + .loc[:, ["NAME", "STATEFP", "STUSPS", "geometry"]] + .rename(columns=str.lower) + .rename(columns={"stusps": "state_code", "name": "state_name"}) + .astype({"statefp": int}) + # get only lower 48 + DC + .query("(statefp < 60) & (statefp != 2) & (statefp != 15)") + .reset_index(drop=True) + .to_crs(4326) + ) + + # Make Annual gridded arrays (maps) of well data (a well will be counted every month if there is any production that year) + # Includes NA Gas Wells and Production onshore in the CONUS region + # source emissions are related to the presence of a well and its production status (no emission if no production) + # Details: ERG does not include a well in the national count if there is no (cummulative) oil or gas production from that well. + # Wells are not considered active for a given year if there is no production data that year + # This may cause wells that are coadmpleted but not yet producing to be dropped from the national count. + # ERG has developed their own logic to determine if a well is an HF well or not and that result is included in the + # HF variable in this dataset. This method does not rely on the Enverus well 'Producing Status' + # Well Type (e.g., non-associated gas well) is determined based on annual production GOR at that well (CUM OIL/ CUM GAS), + # but the presence of a well will only be included in maps in months where monthly gas prod > 0 + + # Proxy Data Dataframes: + all_well_prod_df = pd.DataFrame() # Active oil well (conventional + HF) oil production in a given month + + ## Enverus DI and Prism Data: + # Read in and query formatted and corrrected Enverus data to create dictionaries of + # proxy data (Enverus data is from task_enverus_di_prism_data_processing.py) + for iyear in years: + enverus_file_name_iyear = f"formatted_raw_enverus_tempoutput_{iyear}.csv" + enverus_file_path_iyear = os.path.join(intermediate_outputs_path, enverus_file_name_iyear) + oil_data_temp = (pd.read_csv(enverus_file_path_iyear, dtype={3:'str', 'spud_year': str, 'first_prod_year': str}) + .query("STATE_CODE.isin(@state_gdf['state_code'])") + .query("OFFSHORE == 'N'") + .query("CUM_OIL > 0") + .assign(gas_to_oil_ratio=lambda df: df['CUM_GAS']/df['CUM_OIL']) + .assign(year=str(iyear)) + .replace(np.inf, 0) + .astype({"spud_year": str, "first_prod_year": str}) + .query("gas_to_oil_ratio <= 100") + .query("GOR_QUAL == 'Liq only' | GOR_QUAL == 'Liq+Gas'") + ) + + # Include wells in map only for months where there is oil production (emissions ~ when production is occuring) + for imonth in range(1,13): + imonth_str = f"{imonth:02}" # convert to 2-digit months + year_month_str = str(iyear)+'-'+imonth_str + oil_prod_str = 'OILPROD_'+imonth_str + # Onshore data for imonth + oil_data_imonth_temp = (oil_data_temp + .query(f"{oil_prod_str} > 0") + .assign(year_month=str(iyear)+'-'+imonth_str) + ) + oil_data_imonth_temp = (oil_data_imonth_temp[[ + 'year', 
'year_month','STATE_CODE','AAPG_CODE_ERG','LATITUDE','LONGITUDE', + 'HF','WELL_COUNT',oil_prod_str, + 'comp_year_month','spud_year','first_prod_year']] + ) + # All Well Oil Production + all_well_prod_imonth = (oil_data_imonth_temp[['year','year_month','STATE_CODE','LATITUDE','LONGITUDE',oil_prod_str]] + .assign(proxy_data=lambda df: df[oil_prod_str]) + .drop(columns=[oil_prod_str]) + .rename(columns=lambda x: str(x).lower()) + .reset_index(drop=True) + ) + all_well_prod_df = pd.concat([all_well_prod_df,all_well_prod_imonth]) + + # Delete unused temp data + del oil_data_temp + del oil_data_imonth_temp + del all_well_prod_imonth + + # Calculate relative emissions and convert to a geodataframe + all_well_prod_df = calc_enverus_rel_emi(all_well_prod_df) + all_well_prod_df = enverus_df_to_gdf(all_well_prod_df) + + # NEI Data: + nei_df = pd.DataFrame() + + for iyear in years: + nei_data_year = nei_data_years[nei_data_years['year'] == iyear]['nei_data'].values[0] + # Gas Production + ifile_name = get_nei_file_name(nei_data_year, oil_oil_prod_file_names) + nei_iyear = get_raw_NEI_data(iyear, nei_data_year, ifile_name) + nei_df = pd.concat([nei_df, nei_iyear]) + + # Convert NEI Data to GDF and polygon to centroid point + nei_df = gpd.GeoDataFrame(nei_df, crs=4326) + nei_df = nei_df.to_crs(3857) # projected CRS for centroid calculation + nei_df.loc[:, 'geometry'] = nei_df.loc[:, 'geometry'].centroid + nei_df = nei_df.to_crs(4326) + + # Add NEI Data to Enverus Data + all_well_prod_df = pd.concat([all_well_prod_df, nei_df]).reset_index(drop=True) + + # Delete unused temp data + del nei_iyear + del nei_df + + # Check that relative emissions sum to 1.0 each state/year combination + sums = all_well_prod_df.groupby(["state_code", "year"])["rel_emi"].sum() # get sums to check normalization + assert np.isclose(sums, 1.0, atol=1e-8).all(), f"Relative emissions do not sum to 1 for each year and state; {sums}" # assert that the sums are close to 1 + + # Output Proxy Parquet Files + all_well_prod_df = all_well_prod_df.astype({'year':str}) + all_well_prod_df.to_parquet(all_well_prod_output_path) + + return None diff --git a/gch4i/proxy_processing/task_oil_basin_220_prod_proxy.py b/gch4i/proxy_processing/task_oil_basin_220_prod_proxy.py new file mode 100644 index 0000000..3258c9b --- /dev/null +++ b/gch4i/proxy_processing/task_oil_basin_220_prod_proxy.py @@ -0,0 +1,154 @@ +# %% +from pathlib import Path +import os +from typing import Annotated +from zipfile import ZipFile +import calendar +import datetime + +from pyarrow import parquet +import pandas as pd +import osgeo +import geopandas as gpd +import numpy as np +import seaborn as sns +import shapefile as shp +from pytask import Product, task, mark + +from gch4i.config import ( + V3_DATA_PATH, + proxy_data_dir_path, + global_data_dir_path, + sector_data_dir_path, + max_year, + min_year, + years, +) + +from gch4i.utils import us_state_to_abbrev +from gch4i.proxy_processing.ng_oil_production_utils import ( + calc_enverus_rel_emi, + enverus_df_to_gdf, +) + +# %% +@mark.persist +@task(id="oil_basin_220_prod_proxy") +def task_get_oil_basin_220_prod_proxy_data( + state_path: Path = global_data_dir_path / "tl_2020_us_state.zip", + enverus_production_path: Path = sector_data_dir_path / "enverus/production", + intermediate_outputs_path: Path = enverus_production_path / "intermediate_outputs", + basin_220_prod_output_path: Annotated[Path, Product] = proxy_data_dir_path / "oil_basin_220_prod_proxy.parquet", + ): + """ + Data come from Enverus, both Drilling Info and Prism 
+    The reason two datasets are used is that Prism does not include all states;
+    the remaining states, or those with better DI coverage, are taken from DI.
+
+    DI: KS, MD, MI, MO, OK, TN
+
+    Prism: AK, AL, AR, AZ, CA, CAO (California Offshore), CO, FL, KY, LA, MS, MT, ND,
+    NE, NGOM (federal offshore waters in the Gulf of Mexico), NM, NV, NY, OH, OR, PA,
+    SD, TX, UT, VA, WV, WY
+
+    States with no Enverus Data: CT, DE, DC, GA, HI, ID, IL*, IN*, IA, ME, MA, MN, NH,
+    NJ, NC, RI, SC, VT, WA, WI, US territories. These states are assumed to have no oil
+    and gas production, with the exception of IL and IN.
+
+    *IL and IN do not report to Enverus, but do have oil and gas production. Production
+    data is taken from the Energy Information Administration (EIA).
+
+    """
+
+    # Load in State ANSI data
+    state_gdf = (
+        gpd.read_file(state_path)
+        .loc[:, ["NAME", "STATEFP", "STUSPS", "geometry"]]
+        .rename(columns=str.lower)
+        .rename(columns={"stusps": "state_code", "name": "state_name"})
+        .astype({"statefp": int})
+        # get only lower 48 + DC
+        .query("(statefp < 60) & (statefp != 2) & (statefp != 15)")
+        .reset_index(drop=True)
+        .to_crs(4326)
+    )
+
+    # Make annual gridded arrays (maps) of well data (a well is counted in every month of a year that has any production)
+    # Includes NA oil wells and production onshore in the CONUS region.
+    # Source emissions are related to the presence of a well and its production status (no emission if no production).
+    # Details: ERG does not include a well in the national count if there is no (cumulative) oil or gas production from that well.
+    # Wells are not considered active for a given year if there is no production data that year.
+    # This may cause wells that are completed but not yet producing to be dropped from the national count.
+    # ERG has developed their own logic to determine whether a well is an HF well, and that result is included in the
+    # HF variable in this dataset. 
This method does not rely on the Enverus well 'Producing Status'.
+    # Well type (e.g., non-associated oil well) is determined from the annual production GOR at that well (CUM GAS / CUM OIL),
+    # but the presence of a well is only included in maps for months where monthly oil production > 0.
+
+    # Proxy Data Dataframes:
+    basin_220_prod_df = pd.DataFrame()  # Oil well oil production in Basin 220 in a given month
+
+    ## Enverus DI and Prism Data:
+    # Read in and query formatted and corrected Enverus data to create dictionaries of
+    # proxy data (Enverus data is from task_enverus_di_prism_data_processing.py)
+    for iyear in years:
+        enverus_file_name_iyear = f"formatted_raw_enverus_tempoutput_{iyear}.csv"
+        enverus_file_path_iyear = os.path.join(intermediate_outputs_path, enverus_file_name_iyear)
+        oil_data_temp = (pd.read_csv(enverus_file_path_iyear, dtype={3: 'str', 'spud_year': str, 'first_prod_year': str})
+                         .query("STATE_CODE.isin(@state_gdf['state_code'])")
+                         .query("OFFSHORE == 'N'")
+                         .query("CUM_OIL > 0")
+                         .assign(gas_to_oil_ratio=lambda df: df['CUM_GAS']/df['CUM_OIL'])
+                         .assign(year=str(iyear))
+                         .replace(np.inf, 0)
+                         .astype({"spud_year": str, "first_prod_year": str})
+                         .query("gas_to_oil_ratio <= 100")
+                         .query("GOR_QUAL == 'Liq only' | GOR_QUAL == 'Liq+Gas'")
+                         )
+
+        # Include wells in the map only for months with oil production (emissions occur only while production is occurring)
+        for imonth in range(1, 13):
+            imonth_str = f"{imonth:02}"  # convert to 2-digit months
+            year_month_str = str(iyear)+'-'+imonth_str
+            oil_prod_str = 'OILPROD_'+imonth_str
+            # Onshore data for imonth
+            oil_data_imonth_temp = (oil_data_temp
+                                    .query(f"{oil_prod_str} > 0")
+                                    .assign(year_month=str(iyear)+'-'+imonth_str)
+                                    )
+            oil_data_imonth_temp = (oil_data_imonth_temp[[
+                'year', 'year_month', 'STATE_CODE', 'AAPG_CODE_ERG', 'LATITUDE', 'LONGITUDE',
+                'HF', 'WELL_COUNT', oil_prod_str,
+                'comp_year_month', 'spud_year', 'first_prod_year']]
+                )
+            # Basin 220 Oil Production
+            basin_220_prod_imonth = (oil_data_imonth_temp[['year', 'year_month', 'STATE_CODE', 'LATITUDE', 'LONGITUDE', 'AAPG_CODE_ERG', oil_prod_str]]
+                                     .query("AAPG_CODE_ERG == '220'")
+                                     .assign(proxy_data=lambda df: df[oil_prod_str])
+                                     .drop(columns=[oil_prod_str, 'AAPG_CODE_ERG'])
+                                     .rename(columns=lambda x: str(x).lower())
+                                     .reset_index(drop=True)
+                                     )
+            basin_220_prod_df = pd.concat([basin_220_prod_df, basin_220_prod_imonth])
+
+    # Delete unused temp data
+    del oil_data_temp
+    del oil_data_imonth_temp
+    del basin_220_prod_imonth
+
+    # Calculate relative emissions and convert to a geodataframe
+    basin_220_prod_df = calc_enverus_rel_emi(basin_220_prod_df)
+    basin_220_prod_df = enverus_df_to_gdf(basin_220_prod_df)
+
+    # NEI Data:
+    # No addition of NEI data because IL and IN are not in this basin; their EIA-based
+    # records are added to the "other" basin proxy instead.
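+
+    # For reference, a minimal sketch of the normalization that calc_enverus_rel_emi()
+    # is assumed to perform (its implementation lives in ng_oil_production_utils.py and
+    # is not part of this patch): proxy_data is scaled by its state/year total so that
+    # rel_emi sums to 1.0 within each state/year group, which is exactly what the
+    # assertion below verifies. The helper here is hypothetical and never called.
+    def _sketch_calc_rel_emi(df: pd.DataFrame) -> pd.DataFrame:
+        totals = df.groupby(["state_code", "year"])["proxy_data"].transform("sum")
+        # guard against division by zero for state/year groups with no production
+        return df.assign(rel_emi=np.where(totals > 0, df["proxy_data"] / totals, 0))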
+ + # Check that relative emissions sum to 1.0 each state/year combination + sums = basin_220_prod_df.groupby(["state_code", "year"])["rel_emi"].sum() # get sums to check normalization + assert np.isclose(sums, 1.0, atol=1e-8).all(), f"Relative emissions do not sum to 1 for each year and state; {sums}" # assert that the sums are close to 1 + + # Output Proxy Parquet Files + basin_220_prod_df = basin_220_prod_df.astype({'year':str}) + basin_220_prod_df.to_parquet(basin_220_prod_output_path) + + return None diff --git a/gch4i/proxy_processing/task_oil_basin_360_prod_proxy.py b/gch4i/proxy_processing/task_oil_basin_360_prod_proxy.py new file mode 100644 index 0000000..f970fbf --- /dev/null +++ b/gch4i/proxy_processing/task_oil_basin_360_prod_proxy.py @@ -0,0 +1,154 @@ +# %% +from pathlib import Path +import os +from typing import Annotated +from zipfile import ZipFile +import calendar +import datetime + +from pyarrow import parquet +import pandas as pd +import osgeo +import geopandas as gpd +import numpy as np +import seaborn as sns +import shapefile as shp +from pytask import Product, task, mark + +from gch4i.config import ( + V3_DATA_PATH, + proxy_data_dir_path, + global_data_dir_path, + sector_data_dir_path, + max_year, + min_year, + years, +) + +from gch4i.utils import us_state_to_abbrev +from gch4i.proxy_processing.ng_oil_production_utils import ( + calc_enverus_rel_emi, + enverus_df_to_gdf, +) + +# %% +@mark.persist +@task(id="oil_basin_360_prod_proxy") +def task_get_oil_basin_360_prod_proxy_data( + state_path: Path = global_data_dir_path / "tl_2020_us_state.zip", + enverus_production_path: Path = sector_data_dir_path / "enverus/production", + intermediate_outputs_path: Path = enverus_production_path / "intermediate_outputs", + basin_360_prod_output_path: Annotated[Path, Product] = proxy_data_dir_path / "oil_basin_360_prod_proxy.parquet", + ): + """ + Data come from Enverus, both Drilling Info and Prism + The reason 2 datasets are used is because Prism does not include all states + So remaining states, or those with more DI coverage are taken from DI + + DI: KS, MD, MI, MO, OK, TN + + Prism: AK, AL, AR, AZ, CA, CAO (California Offshore), CO, FL, KY, LA, MS, MT, ND, + NE, NGOM (federal offshore waters in the Gulf of Mexico), NM, NV, NY, OH, OR, PA, + SD, TX, UT, VA, WV, WY + + States with no Enverus Data: CT, DE, DC, GA, HI, ID, IL*, IN*, IA, ME, MA, MN, NH, + NJ, NC, RI, SC, VT, WA, WI, US territories. These states are assumed to have no oil + and gas production with an exception for IL and IN. + + *IL and IN do not report to Enverus, but do have oil and gas production. Production + data is taken from the Energy Information Administration (EIA). + + """ + + # Load in State ANSI data + state_gdf = ( + gpd.read_file(state_path) + .loc[:, ["NAME", "STATEFP", "STUSPS", "geometry"]] + .rename(columns=str.lower) + .rename(columns={"stusps": "state_code", "name": "state_name"}) + .astype({"statefp": int}) + # get only lower 48 + DC + .query("(statefp < 60) & (statefp != 2) & (statefp != 15)") + .reset_index(drop=True) + .to_crs(4326) + ) + + # Make Annual gridded arrays (maps) of well data (a well will be counted every month if there is any production that year) + # Includes NA Gas Wells and Production onshore in the CONUS region + # source emissions are related to the presence of a well and its production status (no emission if no production) + # Details: ERG does not include a well in the national count if there is no (cummulative) oil or gas production from that well. 
+    # Wells are not considered active for a given year if there is no production data that year.
+    # This may cause wells that are completed but not yet producing to be dropped from the national count.
+    # ERG has developed their own logic to determine whether a well is an HF well, and that result is included in the
+    # HF variable in this dataset. This method does not rely on the Enverus well 'Producing Status'.
+    # Well type (e.g., non-associated oil well) is determined from the annual production GOR at that well (CUM GAS / CUM OIL),
+    # but the presence of a well is only included in maps for months where monthly oil production > 0.
+
+    # Proxy Data Dataframes:
+    basin_360_prod_df = pd.DataFrame()  # Oil well oil production in Basin 360 in a given month
+
+    ## Enverus DI and Prism Data:
+    # Read in and query formatted and corrected Enverus data to create dictionaries of
+    # proxy data (Enverus data is from task_enverus_di_prism_data_processing.py)
+    for iyear in years:
+        enverus_file_name_iyear = f"formatted_raw_enverus_tempoutput_{iyear}.csv"
+        enverus_file_path_iyear = os.path.join(intermediate_outputs_path, enverus_file_name_iyear)
+        oil_data_temp = (pd.read_csv(enverus_file_path_iyear, dtype={3: 'str', 'spud_year': str, 'first_prod_year': str})
+                         .query("STATE_CODE.isin(@state_gdf['state_code'])")
+                         .query("OFFSHORE == 'N'")
+                         .query("CUM_OIL > 0")
+                         .assign(gas_to_oil_ratio=lambda df: df['CUM_GAS']/df['CUM_OIL'])
+                         .assign(year=str(iyear))
+                         .replace(np.inf, 0)
+                         .astype({"spud_year": str, "first_prod_year": str})
+                         .query("gas_to_oil_ratio <= 100")
+                         .query("GOR_QUAL == 'Liq only' | GOR_QUAL == 'Liq+Gas'")
+                         )
+
+        # Include wells in the map only for months with oil production (emissions occur only while production is occurring)
+        for imonth in range(1, 13):
+            imonth_str = f"{imonth:02}"  # convert to 2-digit months
+            year_month_str = str(iyear)+'-'+imonth_str
+            oil_prod_str = 'OILPROD_'+imonth_str
+            # Onshore data for imonth
+            oil_data_imonth_temp = (oil_data_temp
+                                    .query(f"{oil_prod_str} > 0")
+                                    .assign(year_month=str(iyear)+'-'+imonth_str)
+                                    )
+            oil_data_imonth_temp = (oil_data_imonth_temp[[
+                'year', 'year_month', 'STATE_CODE', 'AAPG_CODE_ERG', 'LATITUDE', 'LONGITUDE',
+                'HF', 'WELL_COUNT', oil_prod_str,
+                'comp_year_month', 'spud_year', 'first_prod_year']]
+                )
+            # Basin 360 Oil Production
+            basin_360_prod_imonth = (oil_data_imonth_temp[['year', 'year_month', 'STATE_CODE', 'LATITUDE', 'LONGITUDE', 'AAPG_CODE_ERG', oil_prod_str]]
+                                     .query("AAPG_CODE_ERG == '360'")
+                                     .assign(proxy_data=lambda df: df[oil_prod_str])
+                                     .drop(columns=[oil_prod_str, 'AAPG_CODE_ERG'])
+                                     .rename(columns=lambda x: str(x).lower())
+                                     .reset_index(drop=True)
+                                     )
+            basin_360_prod_df = pd.concat([basin_360_prod_df, basin_360_prod_imonth])
+
+    # Delete unused temp data
+    del oil_data_temp
+    del oil_data_imonth_temp
+    del basin_360_prod_imonth
+
+    # Calculate relative emissions and convert to a geodataframe
+    basin_360_prod_df = calc_enverus_rel_emi(basin_360_prod_df)
+    basin_360_prod_df = enverus_df_to_gdf(basin_360_prod_df)
+
+    # NEI Data:
+    # No addition of NEI data because IL and IN are not in this basin; their EIA-based
+    # records are added to the "other" basin proxy instead.
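+
+    # For reference, enverus_df_to_gdf() is also imported from
+    # ng_oil_production_utils.py and its body is not part of this patch. Given how its
+    # output is used (a GeoDataFrame written straight to parquet), it presumably builds
+    # point geometries from the lowercased latitude/longitude columns, roughly like
+    # this hypothetical, never-called sketch:
+    def _sketch_enverus_df_to_gdf(df: pd.DataFrame) -> gpd.GeoDataFrame:
+        return gpd.GeoDataFrame(
+            df.drop(columns=["latitude", "longitude"]),
+            geometry=gpd.points_from_xy(df["longitude"], df["latitude"]),
+            crs=4326,  # assuming Enverus coordinates are WGS84 lat/lon
+        )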
+ + # Check that relative emissions sum to 1.0 each state/year combination + sums = basin_360_prod_df.groupby(["state_code", "year"])["rel_emi"].sum() # get sums to check normalization + assert np.isclose(sums, 1.0, atol=1e-8).all(), f"Relative emissions do not sum to 1 for each year and state; {sums}" # assert that the sums are close to 1 + + # Output Proxy Parquet Files + basin_360_prod_df = basin_360_prod_df.astype({'year':str}) + basin_360_prod_df.to_parquet(basin_360_prod_output_path) + + return None diff --git a/gch4i/proxy_processing/task_oil_basin_395_prod_proxy.py b/gch4i/proxy_processing/task_oil_basin_395_prod_proxy.py new file mode 100644 index 0000000..c5bea89 --- /dev/null +++ b/gch4i/proxy_processing/task_oil_basin_395_prod_proxy.py @@ -0,0 +1,154 @@ +# %% +from pathlib import Path +import os +from typing import Annotated +from zipfile import ZipFile +import calendar +import datetime + +from pyarrow import parquet +import pandas as pd +import osgeo +import geopandas as gpd +import numpy as np +import seaborn as sns +import shapefile as shp +from pytask import Product, task, mark + +from gch4i.config import ( + V3_DATA_PATH, + proxy_data_dir_path, + global_data_dir_path, + sector_data_dir_path, + max_year, + min_year, + years, +) + +from gch4i.utils import us_state_to_abbrev +from gch4i.proxy_processing.ng_oil_production_utils import ( + calc_enverus_rel_emi, + enverus_df_to_gdf, +) + +# %% +@mark.persist +@task(id="oil_basin_395_prod_proxy") +def task_get_oil_basin_395_prod_proxy_data( + state_path: Path = global_data_dir_path / "tl_2020_us_state.zip", + enverus_production_path: Path = sector_data_dir_path / "enverus/production", + intermediate_outputs_path: Path = enverus_production_path / "intermediate_outputs", + basin_395_prod_output_path: Annotated[Path, Product] = proxy_data_dir_path / "oil_basin_395_prod_proxy.parquet", + ): + """ + Data come from Enverus, both Drilling Info and Prism + The reason 2 datasets are used is because Prism does not include all states + So remaining states, or those with more DI coverage are taken from DI + + DI: KS, MD, MI, MO, OK, TN + + Prism: AK, AL, AR, AZ, CA, CAO (California Offshore), CO, FL, KY, LA, MS, MT, ND, + NE, NGOM (federal offshore waters in the Gulf of Mexico), NM, NV, NY, OH, OR, PA, + SD, TX, UT, VA, WV, WY + + States with no Enverus Data: CT, DE, DC, GA, HI, ID, IL*, IN*, IA, ME, MA, MN, NH, + NJ, NC, RI, SC, VT, WA, WI, US territories. These states are assumed to have no oil + and gas production with an exception for IL and IN. + + *IL and IN do not report to Enverus, but do have oil and gas production. Production + data is taken from the Energy Information Administration (EIA). + + """ + + # Load in State ANSI data + state_gdf = ( + gpd.read_file(state_path) + .loc[:, ["NAME", "STATEFP", "STUSPS", "geometry"]] + .rename(columns=str.lower) + .rename(columns={"stusps": "state_code", "name": "state_name"}) + .astype({"statefp": int}) + # get only lower 48 + DC + .query("(statefp < 60) & (statefp != 2) & (statefp != 15)") + .reset_index(drop=True) + .to_crs(4326) + ) + + # Make Annual gridded arrays (maps) of well data (a well will be counted every month if there is any production that year) + # Includes NA Gas Wells and Production onshore in the CONUS region + # source emissions are related to the presence of a well and its production status (no emission if no production) + # Details: ERG does not include a well in the national count if there is no (cummulative) oil or gas production from that well. 
+    # Wells are not considered active for a given year if there is no production data that year.
+    # This may cause wells that are completed but not yet producing to be dropped from the national count.
+    # ERG has developed their own logic to determine whether a well is an HF well, and that result is included in the
+    # HF variable in this dataset. This method does not rely on the Enverus well 'Producing Status'.
+    # Well type (e.g., non-associated oil well) is determined from the annual production GOR at that well (CUM GAS / CUM OIL),
+    # but the presence of a well is only included in maps for months where monthly oil production > 0.
+
+    # Proxy Data Dataframes:
+    basin_395_prod_df = pd.DataFrame()  # Oil well oil production in Basin 395 in a given month
+
+    ## Enverus DI and Prism Data:
+    # Read in and query formatted and corrected Enverus data to create dictionaries of
+    # proxy data (Enverus data is from task_enverus_di_prism_data_processing.py)
+    for iyear in years:
+        enverus_file_name_iyear = f"formatted_raw_enverus_tempoutput_{iyear}.csv"
+        enverus_file_path_iyear = os.path.join(intermediate_outputs_path, enverus_file_name_iyear)
+        oil_data_temp = (pd.read_csv(enverus_file_path_iyear, dtype={3: 'str', 'spud_year': str, 'first_prod_year': str})
+                         .query("STATE_CODE.isin(@state_gdf['state_code'])")
+                         .query("OFFSHORE == 'N'")
+                         .query("CUM_OIL > 0")
+                         .assign(gas_to_oil_ratio=lambda df: df['CUM_GAS']/df['CUM_OIL'])
+                         .assign(year=str(iyear))
+                         .replace(np.inf, 0)
+                         .astype({"spud_year": str, "first_prod_year": str})
+                         .query("gas_to_oil_ratio <= 100")
+                         .query("GOR_QUAL == 'Liq only' | GOR_QUAL == 'Liq+Gas'")
+                         )
+
+        # Include wells in the map only for months with oil production (emissions occur only while production is occurring)
+        for imonth in range(1, 13):
+            imonth_str = f"{imonth:02}"  # convert to 2-digit months
+            year_month_str = str(iyear)+'-'+imonth_str
+            oil_prod_str = 'OILPROD_'+imonth_str
+            # Onshore data for imonth
+            oil_data_imonth_temp = (oil_data_temp
+                                    .query(f"{oil_prod_str} > 0")
+                                    .assign(year_month=str(iyear)+'-'+imonth_str)
+                                    )
+            oil_data_imonth_temp = (oil_data_imonth_temp[[
+                'year', 'year_month', 'STATE_CODE', 'AAPG_CODE_ERG', 'LATITUDE', 'LONGITUDE',
+                'HF', 'WELL_COUNT', oil_prod_str,
+                'comp_year_month', 'spud_year', 'first_prod_year']]
+                )
+            # Basin 395 Oil Production
+            basin_395_prod_imonth = (oil_data_imonth_temp[['year', 'year_month', 'STATE_CODE', 'LATITUDE', 'LONGITUDE', 'AAPG_CODE_ERG', oil_prod_str]]
+                                     .query("AAPG_CODE_ERG == '395'")
+                                     .assign(proxy_data=lambda df: df[oil_prod_str])
+                                     .drop(columns=[oil_prod_str, 'AAPG_CODE_ERG'])
+                                     .rename(columns=lambda x: str(x).lower())
+                                     .reset_index(drop=True)
+                                     )
+            basin_395_prod_df = pd.concat([basin_395_prod_df, basin_395_prod_imonth])
+
+    # Delete unused temp data
+    del oil_data_temp
+    del oil_data_imonth_temp
+    del basin_395_prod_imonth
+
+    # Calculate relative emissions and convert to a geodataframe
+    basin_395_prod_df = calc_enverus_rel_emi(basin_395_prod_df)
+    basin_395_prod_df = enverus_df_to_gdf(basin_395_prod_df)
+
+    # NEI Data:
+    # No addition of NEI data because IL and IN are not in this basin; their EIA-based
+    # records are added to the "other" basin proxy instead.
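+
+    # For reference, a worked example of the gas_to_oil_ratio screen applied when the
+    # Enverus data are read in above (units assumed to be Enverus' defaults, Mcf of
+    # gas and bbl of oil; the variable below is illustrative only):
+    _gor_example = pd.DataFrame({"CUM_GAS": [5_000.0, 50_000.0], "CUM_OIL": [200.0, 200.0]})
+    _gor_example["gas_to_oil_ratio"] = _gor_example["CUM_GAS"] / _gor_example["CUM_OIL"]
+    # ratios come out to [25.0, 250.0]: the first well passes the <= 100 screen and is
+    # kept as an oil well; the second is treated as a gas well and excluded here.
+    del _gor_example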
+ + # Check that relative emissions sum to 1.0 each state/year combination + sums = basin_395_prod_df.groupby(["state_code", "year"])["rel_emi"].sum() # get sums to check normalization + assert np.isclose(sums, 1.0, atol=1e-8).all(), f"Relative emissions do not sum to 1 for each year and state; {sums}" # assert that the sums are close to 1 + + # Output Proxy Parquet Files + basin_395_prod_df = basin_395_prod_df.astype({'year':str}) + basin_395_prod_df.to_parquet(basin_395_prod_output_path) + + return None diff --git a/gch4i/proxy_processing/task_oil_basin_430_prod_proxy.py b/gch4i/proxy_processing/task_oil_basin_430_prod_proxy.py new file mode 100644 index 0000000..f5083d6 --- /dev/null +++ b/gch4i/proxy_processing/task_oil_basin_430_prod_proxy.py @@ -0,0 +1,154 @@ +# %% +from pathlib import Path +import os +from typing import Annotated +from zipfile import ZipFile +import calendar +import datetime + +from pyarrow import parquet +import pandas as pd +import osgeo +import geopandas as gpd +import numpy as np +import seaborn as sns +import shapefile as shp +from pytask import Product, task, mark + +from gch4i.config import ( + V3_DATA_PATH, + proxy_data_dir_path, + global_data_dir_path, + sector_data_dir_path, + max_year, + min_year, + years, +) + +from gch4i.utils import us_state_to_abbrev +from gch4i.proxy_processing.ng_oil_production_utils import ( + calc_enverus_rel_emi, + enverus_df_to_gdf, +) + +# %% +@mark.persist +@task(id="oil_basin_430_prod_proxy") +def task_get_oil_basin_430_prod_proxy_data( + state_path: Path = global_data_dir_path / "tl_2020_us_state.zip", + enverus_production_path: Path = sector_data_dir_path / "enverus/production", + intermediate_outputs_path: Path = enverus_production_path / "intermediate_outputs", + basin_430_prod_output_path: Annotated[Path, Product] = proxy_data_dir_path / "oil_basin_430_prod_proxy.parquet", + ): + """ + Data come from Enverus, both Drilling Info and Prism + The reason 2 datasets are used is because Prism does not include all states + So remaining states, or those with more DI coverage are taken from DI + + DI: KS, MD, MI, MO, OK, TN + + Prism: AK, AL, AR, AZ, CA, CAO (California Offshore), CO, FL, KY, LA, MS, MT, ND, + NE, NGOM (federal offshore waters in the Gulf of Mexico), NM, NV, NY, OH, OR, PA, + SD, TX, UT, VA, WV, WY + + States with no Enverus Data: CT, DE, DC, GA, HI, ID, IL*, IN*, IA, ME, MA, MN, NH, + NJ, NC, RI, SC, VT, WA, WI, US territories. These states are assumed to have no oil + and gas production with an exception for IL and IN. + + *IL and IN do not report to Enverus, but do have oil and gas production. Production + data is taken from the Energy Information Administration (EIA). + + """ + + # Load in State ANSI data + state_gdf = ( + gpd.read_file(state_path) + .loc[:, ["NAME", "STATEFP", "STUSPS", "geometry"]] + .rename(columns=str.lower) + .rename(columns={"stusps": "state_code", "name": "state_name"}) + .astype({"statefp": int}) + # get only lower 48 + DC + .query("(statefp < 60) & (statefp != 2) & (statefp != 15)") + .reset_index(drop=True) + .to_crs(4326) + ) + + # Make Annual gridded arrays (maps) of well data (a well will be counted every month if there is any production that year) + # Includes NA Gas Wells and Production onshore in the CONUS region + # source emissions are related to the presence of a well and its production status (no emission if no production) + # Details: ERG does not include a well in the national count if there is no (cummulative) oil or gas production from that well. 
+    # Wells are not considered active for a given year if there is no production data that year.
+    # This may cause wells that are completed but not yet producing to be dropped from the national count.
+    # ERG has developed their own logic to determine whether a well is an HF well, and that result is included in the
+    # HF variable in this dataset. This method does not rely on the Enverus well 'Producing Status'.
+    # Well type (e.g., non-associated oil well) is determined from the annual production GOR at that well (CUM GAS / CUM OIL),
+    # but the presence of a well is only included in maps for months where monthly oil production > 0.
+
+    # Proxy Data Dataframes:
+    basin_430_prod_df = pd.DataFrame()  # Oil well oil production in Basin 430 in a given month
+
+    ## Enverus DI and Prism Data:
+    # Read in and query formatted and corrected Enverus data to create dictionaries of
+    # proxy data (Enverus data is from task_enverus_di_prism_data_processing.py)
+    for iyear in years:
+        enverus_file_name_iyear = f"formatted_raw_enverus_tempoutput_{iyear}.csv"
+        enverus_file_path_iyear = os.path.join(intermediate_outputs_path, enverus_file_name_iyear)
+        oil_data_temp = (pd.read_csv(enverus_file_path_iyear, dtype={3: 'str', 'spud_year': str, 'first_prod_year': str})
+                         .query("STATE_CODE.isin(@state_gdf['state_code'])")
+                         .query("OFFSHORE == 'N'")
+                         .query("CUM_OIL > 0")
+                         .assign(gas_to_oil_ratio=lambda df: df['CUM_GAS']/df['CUM_OIL'])
+                         .assign(year=str(iyear))
+                         .replace(np.inf, 0)
+                         .astype({"spud_year": str, "first_prod_year": str})
+                         .query("gas_to_oil_ratio <= 100")
+                         .query("GOR_QUAL == 'Liq only' | GOR_QUAL == 'Liq+Gas'")
+                         )
+
+        # Include wells in the map only for months with oil production (emissions occur only while production is occurring)
+        for imonth in range(1, 13):
+            imonth_str = f"{imonth:02}"  # convert to 2-digit months
+            year_month_str = str(iyear)+'-'+imonth_str
+            oil_prod_str = 'OILPROD_'+imonth_str
+            # Onshore data for imonth
+            oil_data_imonth_temp = (oil_data_temp
+                                    .query(f"{oil_prod_str} > 0")
+                                    .assign(year_month=str(iyear)+'-'+imonth_str)
+                                    )
+            oil_data_imonth_temp = (oil_data_imonth_temp[[
+                'year', 'year_month', 'STATE_CODE', 'AAPG_CODE_ERG', 'LATITUDE', 'LONGITUDE',
+                'HF', 'WELL_COUNT', oil_prod_str,
+                'comp_year_month', 'spud_year', 'first_prod_year']]
+                )
+            # Basin 430 Oil Production
+            basin_430_prod_imonth = (oil_data_imonth_temp[['year', 'year_month', 'STATE_CODE', 'LATITUDE', 'LONGITUDE', 'AAPG_CODE_ERG', oil_prod_str]]
+                                     .query("AAPG_CODE_ERG == '430'")
+                                     .assign(proxy_data=lambda df: df[oil_prod_str])
+                                     .drop(columns=[oil_prod_str, 'AAPG_CODE_ERG'])
+                                     .rename(columns=lambda x: str(x).lower())
+                                     .reset_index(drop=True)
+                                     )
+            basin_430_prod_df = pd.concat([basin_430_prod_df, basin_430_prod_imonth])
+
+    # Delete unused temp data
+    del oil_data_temp
+    del oil_data_imonth_temp
+    del basin_430_prod_imonth
+
+    # Calculate relative emissions and convert to a geodataframe
+    basin_430_prod_df = calc_enverus_rel_emi(basin_430_prod_df)
+    basin_430_prod_df = enverus_df_to_gdf(basin_430_prod_df)
+
+    # NEI Data:
+    # No addition of NEI data because IL and IN are not in this basin; their EIA-based
+    # records are added to the "other" basin proxy instead.
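+
+    # For reference, the basin proxies partition wells by AAPG basin code: 220, 360,
+    # 395, and 430 each get a dedicated proxy, and every other code falls into the
+    # "other" basin proxy, so each well-month lands in exactly one output file. A
+    # hypothetical, never-called helper that routes a code to its bucket:
+    def _sketch_basin_bucket(aapg_code: str) -> str:
+        named_basins = {"220", "360", "395", "430"}
+        return f"basin_{aapg_code}" if aapg_code in named_basins else "basin_other"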
+ + # Check that relative emissions sum to 1.0 each state/year combination + sums = basin_430_prod_df.groupby(["state_code", "year"])["rel_emi"].sum() # get sums to check normalization + assert np.isclose(sums, 1.0, atol=1e-8).all(), f"Relative emissions do not sum to 1 for each year and state; {sums}" # assert that the sums are close to 1 + + # Output Proxy Parquet Files + basin_430_prod_df = basin_430_prod_df.astype({'year':str}) + basin_430_prod_df.to_parquet(basin_430_prod_output_path) + + return None diff --git a/gch4i/proxy_processing/task_oil_basin_other_prod_proxy.py b/gch4i/proxy_processing/task_oil_basin_other_prod_proxy.py new file mode 100644 index 0000000..e7fde25 --- /dev/null +++ b/gch4i/proxy_processing/task_oil_basin_other_prod_proxy.py @@ -0,0 +1,178 @@ +# %% +from pathlib import Path +import os +from typing import Annotated +from zipfile import ZipFile +import calendar +import datetime + +from pyarrow import parquet +import pandas as pd +import osgeo +import geopandas as gpd +import numpy as np +import seaborn as sns +import shapefile as shp +from pytask import Product, task, mark + +from gch4i.config import ( + V3_DATA_PATH, + proxy_data_dir_path, + global_data_dir_path, + sector_data_dir_path, + max_year, + min_year, + years, +) + +from gch4i.utils import us_state_to_abbrev +from gch4i.proxy_processing.ng_oil_production_utils import ( + calc_enverus_rel_emi, + enverus_df_to_gdf, + nei_data_years, + get_nei_file_name, + oil_oil_prod_file_names, + get_raw_NEI_data, +) + +# %% +@mark.persist +@task(id="oil_basin_other_prod_proxy") +def task_get_oil_basin_other_prod_proxy_data( + state_path: Path = global_data_dir_path / "tl_2020_us_state.zip", + enverus_production_path: Path = sector_data_dir_path / "enverus/production", + intermediate_outputs_path: Path = enverus_production_path / "intermediate_outputs", + nei_path: Path = sector_data_dir_path / "nei_og", + basin_other_prod_output_path: Annotated[Path, Product] = proxy_data_dir_path / "oil_basin_other_prod_proxy.parquet", + ): + """ + Data come from Enverus, both Drilling Info and Prism + The reason 2 datasets are used is because Prism does not include all states + So remaining states, or those with more DI coverage are taken from DI + + DI: KS, MD, MI, MO, OK, TN + + Prism: AK, AL, AR, AZ, CA, CAO (California Offshore), CO, FL, KY, LA, MS, MT, ND, + NE, NGOM (federal offshore waters in the Gulf of Mexico), NM, NV, NY, OH, OR, PA, + SD, TX, UT, VA, WV, WY + + States with no Enverus Data: CT, DE, DC, GA, HI, ID, IL*, IN*, IA, ME, MA, MN, NH, + NJ, NC, RI, SC, VT, WA, WI, US territories. These states are assumed to have no oil + and gas production with an exception for IL and IN. + + *IL and IN do not report to Enverus, but do have oil and gas production. Production + data is taken from the Energy Information Administration (EIA). 
+ + """ + + # Load in State ANSI data + state_gdf = ( + gpd.read_file(state_path) + .loc[:, ["NAME", "STATEFP", "STUSPS", "geometry"]] + .rename(columns=str.lower) + .rename(columns={"stusps": "state_code", "name": "state_name"}) + .astype({"statefp": int}) + # get only lower 48 + DC + .query("(statefp < 60) & (statefp != 2) & (statefp != 15)") + .reset_index(drop=True) + .to_crs(4326) + ) + + # Make Annual gridded arrays (maps) of well data (a well will be counted every month if there is any production that year) + # Includes NA Gas Wells and Production onshore in the CONUS region + # source emissions are related to the presence of a well and its production status (no emission if no production) + # Details: ERG does not include a well in the national count if there is no (cummulative) oil or gas production from that well. + # Wells are not considered active for a given year if there is no production data that year + # This may cause wells that are coadmpleted but not yet producing to be dropped from the national count. + # ERG has developed their own logic to determine if a well is an HF well or not and that result is included in the + # HF variable in this dataset. This method does not rely on the Enverus well 'Producing Status' + # Well Type (e.g., non-associated gas well) is determined based on annual production GOR at that well (CUM OIL/ CUM GAS), + # but the presence of a well will only be included in maps in months where monthly gas prod > 0 + + # Proxy Data Dataframes: + basin_other_prod_df = pd.DataFrame() # Oil well oil production in Other Basins in a given month + + ## Enverus DI and Prism Data: + # Read in and query formatted and corrrected Enverus data to create dictionaries of + # proxy data (Enverus data is from task_enverus_di_prism_data_processing.py) + for iyear in years: + enverus_file_name_iyear = f"formatted_raw_enverus_tempoutput_{iyear}.csv" + enverus_file_path_iyear = os.path.join(intermediate_outputs_path, enverus_file_name_iyear) + oil_data_temp = (pd.read_csv(enverus_file_path_iyear, dtype={3:'str', 'spud_year': str, 'first_prod_year': str}) + .query("STATE_CODE.isin(@state_gdf['state_code'])") + .query("OFFSHORE == 'N'") + .query("CUM_OIL > 0") + .assign(gas_to_oil_ratio=lambda df: df['CUM_GAS']/df['CUM_OIL']) + .assign(year=str(iyear)) + .replace(np.inf, 0) + .astype({"spud_year": str, "first_prod_year": str}) + .query("gas_to_oil_ratio <= 100") + .query("GOR_QUAL == 'Liq only' | GOR_QUAL == 'Liq+Gas'") + ) + + # Include wells in map only for months where there is gas production (emissions ~ when production is occuring) + for imonth in range(1,13): + imonth_str = f"{imonth:02}" # convert to 2-digit months + year_month_str = str(iyear)+'-'+imonth_str + oil_prod_str = 'OILPROD_'+imonth_str + # Onshore data for imonth + oil_data_imonth_temp = (oil_data_temp + .query(f"{oil_prod_str} > 0") + .assign(year_month=str(iyear)+'-'+imonth_str) + ) + oil_data_imonth_temp = (oil_data_imonth_temp[[ + 'year', 'year_month','STATE_CODE','AAPG_CODE_ERG','LATITUDE','LONGITUDE', + 'HF','WELL_COUNT',oil_prod_str, + 'comp_year_month','spud_year','first_prod_year']] + ) + # "Other" Basin Gas Production + basin_other_prod_imonth = (oil_data_imonth_temp[['year','year_month','STATE_CODE','LATITUDE','LONGITUDE','AAPG_CODE_ERG',oil_prod_str]] + .query("AAPG_CODE_ERG != '220' & AAPG_CODE_ERG != '360' & AAPG_CODE_ERG != '395' & AAPG_CODE_ERG != '430'") + .assign(proxy_data=lambda df: df[oil_prod_str]) + .drop(columns=[oil_prod_str, 'AAPG_CODE_ERG']) + .rename(columns=lambda x: 
str(x).lower()) + .reset_index(drop=True) + ) + basin_other_prod_df = pd.concat([basin_other_prod_df,basin_other_prod_imonth]) + + # Delete unused temp data + del oil_data_temp + del oil_data_imonth_temp + del basin_other_prod_imonth + + # Calculate relative emissions and convert to a geodataframe + basin_other_prod_df = calc_enverus_rel_emi(basin_other_prod_df) + basin_other_prod_df = enverus_df_to_gdf(basin_other_prod_df) + + # NEI Data: + nei_df = pd.DataFrame() + + for iyear in years: + nei_data_year = nei_data_years[nei_data_years['year'] == iyear]['nei_data'].values[0] + # Gas Production + ifile_name = get_nei_file_name(nei_data_year, oil_oil_prod_file_names) + nei_iyear = get_raw_NEI_data(iyear, nei_data_year, ifile_name) + nei_df = pd.concat([nei_df, nei_iyear]) + + # Convert NEI Data to GDF and polygon to centroid point + nei_df = gpd.GeoDataFrame(nei_df, crs=4326) + nei_df = nei_df.to_crs(3857) # projected CRS for centroid calculation + nei_df.loc[:, 'geometry'] = nei_df.loc[:, 'geometry'].centroid + nei_df = nei_df.to_crs(4326) + + # Add NEI Data to Enverus Data + basin_other_prod_df = pd.concat([basin_other_prod_df, nei_df]).reset_index(drop=True) + + # Delete unused temp data + del nei_iyear + del nei_df + + # Check that relative emissions sum to 1.0 each state/year combination + sums = basin_other_prod_df.groupby(["state_code", "year"])["rel_emi"].sum() # get sums to check normalization + assert np.isclose(sums, 1.0, atol=1e-8).all(), f"Relative emissions do not sum to 1 for each year and state; {sums}" # assert that the sums are close to 1 + + # Output Proxy Parquet Files + basin_other_prod_df = basin_other_prod_df.astype({'year':str}) + basin_other_prod_df.to_parquet(basin_other_prod_output_path) + + return None diff --git a/gch4i/proxy_processing/task_oil_conv_well_comp_proxy.py b/gch4i/proxy_processing/task_oil_conv_well_comp_proxy.py new file mode 100644 index 0000000..6129357 --- /dev/null +++ b/gch4i/proxy_processing/task_oil_conv_well_comp_proxy.py @@ -0,0 +1,180 @@ +# %% +from pathlib import Path +import os +from typing import Annotated +from zipfile import ZipFile +import calendar +import datetime + +from pyarrow import parquet +import pandas as pd +import osgeo +import geopandas as gpd +import numpy as np +import seaborn as sns +import shapefile as shp +from pytask import Product, task, mark + +from gch4i.config import ( + V3_DATA_PATH, + proxy_data_dir_path, + global_data_dir_path, + sector_data_dir_path, + max_year, + min_year, + years, +) + +from gch4i.utils import us_state_to_abbrev +from gch4i.proxy_processing.ng_oil_production_utils import ( + calc_enverus_rel_emi, + enverus_df_to_gdf, + nei_data_years, + get_nei_file_name, + oil_comp_count_file_names, + get_raw_NEI_data, +) + +# %% +@mark.persist +@task(id="oil_conv_well_comp_proxy") +def task_get_oil_conv_well_comp_proxy_data( + state_path: Path = global_data_dir_path / "tl_2020_us_state.zip", + enverus_production_path: Path = sector_data_dir_path / "enverus/production", + intermediate_outputs_path: Path = enverus_production_path / "intermediate_outputs", + nei_path: Path = sector_data_dir_path / "nei_og", + conv_well_comp_output_path: Annotated[Path, Product] = proxy_data_dir_path / "oil_conv_well_comp_proxy.parquet", + ): + """ + Data come from Enverus, both Drilling Info and Prism + The reason 2 datasets are used is because Prism does not include all states + So remaining states, or those with more DI coverage are taken from DI + + DI: KS, MD, MI, MO, OK, TN + + Prism: AK, AL, AR, AZ, CA, CAO (California 
Offshore), CO, FL, KY, LA, MS, MT, ND, + NE, NGOM (federal offshore waters in the Gulf of Mexico), NM, NV, NY, OH, OR, PA, + SD, TX, UT, VA, WV, WY + + States with no Enverus Data: CT, DE, DC, GA, HI, ID, IL*, IN*, IA, ME, MA, MN, NH, + NJ, NC, RI, SC, VT, WA, WI, US territories. These states are assumed to have no oil + and gas production with an exception for IL and IN. + + *IL and IN do not report to Enverus, but do have oil and gas production. Production + data is taken from the Energy Information Administration (EIA). + + """ + + # Load in State ANSI data + state_gdf = ( + gpd.read_file(state_path) + .loc[:, ["NAME", "STATEFP", "STUSPS", "geometry"]] + .rename(columns=str.lower) + .rename(columns={"stusps": "state_code", "name": "state_name"}) + .astype({"statefp": int}) + # get only lower 48 + DC + .query("(statefp < 60) & (statefp != 2) & (statefp != 15)") + .reset_index(drop=True) + .to_crs(4326) + ) + + # Make Annual gridded arrays (maps) of well data (a well will be counted every month if there is any production that year) + # Includes NA Gas Wells and Production onshore in the CONUS region + # source emissions are related to the presence of a well and its production status (no emission if no production) + # Details: ERG does not include a well in the national count if there is no (cummulative) oil or gas production from that well. + # Wells are not considered active for a given year if there is no production data that year + # This may cause wells that are coadmpleted but not yet producing to be dropped from the national count. + # ERG has developed their own logic to determine if a well is an HF well or not and that result is included in the + # HF variable in this dataset. This method does not rely on the Enverus well 'Producing Status' + # Well Type (e.g., non-associated gas well) is determined based on annual production GOR at that well (CUM OIL/ CUM GAS), + # but the presence of a well will only be included in maps in months where monthly gas prod > 0 + + # Proxy Data Dataframes: + conv_well_comp_df = pd.DataFrame() # Conventional well completions + + ## Enverus DI and Prism Data: + # Read in and query formatted and corrrected Enverus data to create dictionaries of + # proxy data (Enverus data is from task_enverus_di_prism_data_processing.py) + for iyear in years: + enverus_file_name_iyear = f"formatted_raw_enverus_tempoutput_{iyear}.csv" + enverus_file_path_iyear = os.path.join(intermediate_outputs_path, enverus_file_name_iyear) + oil_data_temp = (pd.read_csv(enverus_file_path_iyear, dtype={3:'str', 'spud_year': str, 'first_prod_year': str}) + .query("STATE_CODE.isin(@state_gdf['state_code'])") + .query("OFFSHORE == 'N'") + .query("CUM_OIL > 0") + .assign(gas_to_oil_ratio=lambda df: df['CUM_GAS']/df['CUM_OIL']) + .assign(year=str(iyear)) + .replace(np.inf, 0) + .astype({"spud_year": str, "first_prod_year": str}) + .query("gas_to_oil_ratio <= 100") + .query("GOR_QUAL == 'Liq only' | GOR_QUAL == 'Liq+Gas'") + ) + + # Include wells in map only for months where there is gas production (emissions ~ when production is occuring) + for imonth in range(1,13): + imonth_str = f"{imonth:02}" # convert to 2-digit months + year_month_str = str(iyear)+'-'+imonth_str + oil_prod_str = 'OILPROD_'+imonth_str + # Onshore data for imonth + oil_data_imonth_temp = (oil_data_temp + .query(f"{oil_prod_str} > 0") + .assign(year_month=str(iyear)+'-'+imonth_str) + ) + oil_data_imonth_temp = (oil_data_imonth_temp[[ + 'year', 'year_month','STATE_CODE','AAPG_CODE_ERG','LATITUDE','LONGITUDE', + 
'HF','WELL_COUNT',oil_prod_str, + 'comp_year_month','spud_year','first_prod_year']] + ) + # Conventional Well Completions + conv_well_comp_imonth = (oil_data_imonth_temp[['year','year_month','STATE_CODE','LATITUDE','LONGITUDE','WELL_COUNT','HF','comp_year_month']] + .query("HF != 'Y'") + .drop(columns=["HF"]) + .rename(columns=lambda x: str(x).lower()) + .rename(columns={"well_count":"proxy_data"}) + .query(f"comp_year_month == '{year_month_str}'") + .drop(columns=["comp_year_month"]) + .reset_index(drop=True) + ) + conv_well_comp_df = pd.concat([conv_well_comp_df,conv_well_comp_imonth]) + + # Delete unused temp data + del oil_data_temp + del oil_data_imonth_temp + del conv_well_comp_imonth + + # Calculate relative emissions and convert to a geodataframe + conv_well_comp_df = calc_enverus_rel_emi(conv_well_comp_df ) + conv_well_comp_df = enverus_df_to_gdf(conv_well_comp_df ) + + # NEI Data: + nei_df = pd.DataFrame() + + for iyear in years: + nei_data_year = nei_data_years[nei_data_years['year'] == iyear]['nei_data'].values[0] + # Well Count + ifile_name = get_nei_file_name(nei_data_year, oil_comp_count_file_names) + nei_iyear = get_raw_NEI_data(iyear, nei_data_year, ifile_name) + nei_df = pd.concat([nei_df, nei_iyear]) + + # Convert NEI Data to GDF and polygon to centroid point + nei_df = gpd.GeoDataFrame(nei_df, crs=4326) + nei_df = nei_df.to_crs(3857) # projected CRS for centroid calculation + nei_df.loc[:, 'geometry'] = nei_df.loc[:, 'geometry'].centroid + nei_df = nei_df.to_crs(4326) + + # Add NEI Data to Enverus Data + conv_well_comp_df = pd.concat([conv_well_comp_df, nei_df]).reset_index(drop=True) + + # Delete unused temp data + del nei_iyear + del nei_df + + # Check that relative emissions sum to 1.0 each state/year combination + sums = conv_well_comp_df.groupby(["state_code", "year"])["rel_emi"].sum() # get sums to check normalization + assert np.isclose(sums, 1.0, atol=1e-8).all(), f"Relative emissions do not sum to 1 for each year and state; {sums}" # assert that the sums are close to 1 + + # Output Proxy Parquet Files + conv_well_comp_df = conv_well_comp_df.astype({'year':str}) + conv_well_comp_df.to_parquet(conv_well_comp_output_path) + + return None diff --git a/gch4i/proxy_processing/task_oil_conv_well_count_proxy.py b/gch4i/proxy_processing/task_oil_conv_well_count_proxy.py new file mode 100644 index 0000000..52fd54a --- /dev/null +++ b/gch4i/proxy_processing/task_oil_conv_well_count_proxy.py @@ -0,0 +1,177 @@ +# %% +from pathlib import Path +import os +from typing import Annotated +from zipfile import ZipFile +import calendar +import datetime + +from pyarrow import parquet +import pandas as pd +import osgeo +import geopandas as gpd +import numpy as np +import seaborn as sns +import shapefile as shp +from pytask import Product, task, mark + +from gch4i.config import ( + V3_DATA_PATH, + proxy_data_dir_path, + global_data_dir_path, + sector_data_dir_path, + max_year, + min_year, + years, +) + +from gch4i.utils import us_state_to_abbrev +from gch4i.proxy_processing.ng_oil_production_utils import ( + calc_enverus_rel_emi, + enverus_df_to_gdf, + nei_data_years, + get_nei_file_name, + oil_well_count_file_names, + get_raw_NEI_data, +) + +# %% +@mark.persist +@task(id="oil_conv_well_count_proxy") +def task_get_oil_conv_well_count_proxy_data( + state_path: Path = global_data_dir_path / "tl_2020_us_state.zip", + enverus_production_path: Path = sector_data_dir_path / "enverus/production", + intermediate_outputs_path: Path = enverus_production_path / "intermediate_outputs", + nei_path: 
Path = sector_data_dir_path / "nei_og", + conv_well_count_output_path: Annotated[Path, Product] = proxy_data_dir_path / "oil_conv_well_count_proxy.parquet", + ): + """ + Data come from Enverus, both Drilling Info and Prism + The reason 2 datasets are used is because Prism does not include all states + So remaining states, or those with more DI coverage are taken from DI + + DI: KS, MD, MI, MO, OK, TN + + Prism: AK, AL, AR, AZ, CA, CAO (California Offshore), CO, FL, KY, LA, MS, MT, ND, + NE, NGOM (federal offshore waters in the Gulf of Mexico), NM, NV, NY, OH, OR, PA, + SD, TX, UT, VA, WV, WY + + States with no Enverus Data: CT, DE, DC, GA, HI, ID, IL*, IN*, IA, ME, MA, MN, NH, + NJ, NC, RI, SC, VT, WA, WI, US territories. These states are assumed to have no oil + and gas production with an exception for IL and IN. + + *IL and IN do not report to Enverus, but do have oil and gas production. Production + data is taken from the Energy Information Administration (EIA). + + """ + + # Load in State ANSI data + state_gdf = ( + gpd.read_file(state_path) + .loc[:, ["NAME", "STATEFP", "STUSPS", "geometry"]] + .rename(columns=str.lower) + .rename(columns={"stusps": "state_code", "name": "state_name"}) + .astype({"statefp": int}) + # get only lower 48 + DC + .query("(statefp < 60) & (statefp != 2) & (statefp != 15)") + .reset_index(drop=True) + .to_crs(4326) + ) + + # Make Annual gridded arrays (maps) of well data (a well will be counted every month if there is any production that year) + # Includes NA Wells and Production onshore in the CONUS region + # source emissions are related to the presence of a well and its production status (no emission if no production) + # Details: ERG does not include a well in the national count if there is no (cummulative) oil or gas production from that well. + # Wells are not considered active for a given year if there is no production data that year + # This may cause wells that are coadmpleted but not yet producing to be dropped from the national count. + # ERG has developed their own logic to determine if a well is an HF well or not and that result is included in the + # HF variable in this dataset. 
This method does not rely on the Enverus well 'Producing Status' + # Well Type (e.g., non-associated oil well) is determined based on annual production GOR at that well (CUM OIL/ CUM GAS), + # but the presence of a well will only be included in maps in months where monthly oil or ng prod > 0 + + # Proxy Data Dataframes: + conv_well_count_df = pd.DataFrame() # Active conventional well counts in a given month + + # Read in and query formatted and corrrected Enverus data to create dictionaries of + # proxy data (Enverus data is from task_enverus_di_prism_data_processing.py) + for iyear in years: + enverus_file_name_iyear = f"formatted_raw_enverus_tempoutput_{iyear}.csv" + enverus_file_path_iyear = os.path.join(intermediate_outputs_path, enverus_file_name_iyear) + oil_data_temp = (pd.read_csv(enverus_file_path_iyear, dtype={3:'str', 'spud_year': str, 'first_prod_year': str}) + .query("STATE_CODE.isin(@state_gdf['state_code'])") + .query("OFFSHORE == 'N'") + .query("CUM_OIL > 0") + .assign(gas_to_oil_ratio=lambda df: df['CUM_GAS']/df['CUM_OIL']) + .assign(year=str(iyear)) + .replace(np.inf, 0) + .astype({"spud_year": str, "first_prod_year": str}) + .query("gas_to_oil_ratio <= 100") + .query("GOR_QUAL == 'Liq only' | GOR_QUAL == 'Liq+Gas'") + ) + + # Include wells in map only for months where there is oil production (emissions ~ when production is occuring) + for imonth in range(1,13): + imonth_str = f"{imonth:02}" # convert to 2-digit months + year_month_str = str(iyear)+'-'+imonth_str + oil_prod_str = 'OILPROD_'+imonth_str + # Onshore data for imonth + oil_data_imonth_temp = (oil_data_temp + .query(f"{oil_prod_str} > 0") + .assign(year_month=str(iyear)+'-'+imonth_str) + ) + oil_data_imonth_temp = (oil_data_imonth_temp[[ + 'year', 'year_month','STATE_CODE','AAPG_CODE_ERG','LATITUDE','LONGITUDE', + 'HF','WELL_COUNT',oil_prod_str, + 'comp_year_month','spud_year','first_prod_year']] + ) + # Conventional Well Count + conv_well_count_imonth = (oil_data_imonth_temp[['year','year_month','STATE_CODE','LATITUDE','LONGITUDE','WELL_COUNT','HF']] + .query("HF != 'Y'") + .drop(columns=["HF"]) + .rename(columns=lambda x: str(x).lower()) + .rename(columns={"well_count":"proxy_data"}) + .reset_index(drop=True) + ) + conv_well_count_df = pd.concat([conv_well_count_df,conv_well_count_imonth]) + + # Delete unused temp data + del oil_data_temp + del oil_data_imonth_temp + del conv_well_count_imonth + + # Calculate relative emissions and convert to a geodataframe + conv_well_count_df = calc_enverus_rel_emi(conv_well_count_df) + conv_well_count_df = enverus_df_to_gdf(conv_well_count_df) + + # NEI Data: + nei_df = pd.DataFrame() + + for iyear in years: + nei_data_year = nei_data_years[nei_data_years['year'] == iyear]['nei_data'].values[0] + # Well Count + ifile_name = get_nei_file_name(nei_data_year, oil_well_count_file_names) + nei_iyear = get_raw_NEI_data(iyear, nei_data_year, ifile_name) + nei_df = pd.concat([nei_df, nei_iyear]) + + # Convert NEI Data to GDF and polygon to centroid point + nei_df = gpd.GeoDataFrame(nei_df, crs=4326) + nei_df = nei_df.to_crs(3857) # projected CRS for centroid calculation + nei_df.loc[:, 'geometry'] = nei_df.loc[:, 'geometry'].centroid + nei_df = nei_df.to_crs(4326) + + # Add NEI Data to Enverus Data + conv_well_count_df = pd.concat([conv_well_count_df, nei_df]).reset_index(drop=True) + + # Delete unused temp data + del nei_iyear + del nei_df + + # Check that relative emissions sum to 1.0 each state/year combination + sums = conv_well_count_df.groupby(["state_code", 
"year"])["rel_emi"].sum() # get sums to check normalization + assert np.isclose(sums, 1.0, atol=1e-8).all(), f"Relative emissions do not sum to 1 for each year and state; {sums}" # assert that the sums are close to 1 + + # Output Proxy Parquet Files + conv_well_count_df = conv_well_count_df.astype({'year':str}) + conv_well_count_df.to_parquet(conv_well_count_output_path) + + return None diff --git a/gch4i/proxy_processing/task_oil_drilled_well_proxy.py b/gch4i/proxy_processing/task_oil_drilled_well_proxy.py new file mode 100644 index 0000000..9ba7de2 --- /dev/null +++ b/gch4i/proxy_processing/task_oil_drilled_well_proxy.py @@ -0,0 +1,181 @@ +# %% +from pathlib import Path +import os +from typing import Annotated +from zipfile import ZipFile +import calendar +import datetime + +from pyarrow import parquet +import pandas as pd +import osgeo +import geopandas as gpd +import numpy as np +import seaborn as sns +import shapefile as shp +from pytask import Product, task, mark + +from gch4i.config import ( + V3_DATA_PATH, + proxy_data_dir_path, + global_data_dir_path, + sector_data_dir_path, + max_year, + min_year, + years, +) + +from gch4i.utils import us_state_to_abbrev +from gch4i.proxy_processing.ng_oil_production_utils import ( + calc_enverus_rel_emi, + enverus_df_to_gdf, + nei_data_years, + get_nei_file_name, + oil_spud_count_file_names, + get_raw_NEI_data, +) + +# %% +@mark.persist +@task(id="oil_drilled_well_proxy") +def task_get_oil_drilled_well_proxy_data( + state_path: Path = global_data_dir_path / "tl_2020_us_state.zip", + enverus_production_path: Path = sector_data_dir_path / "enverus/production", + intermediate_outputs_path: Path = enverus_production_path / "intermediate_outputs", + nei_path: Path = sector_data_dir_path / "nei_og", + drilled_well_output_path: Annotated[Path, Product] = proxy_data_dir_path / "oil_drilled_well_proxy.parquet", + ): + """ + Data come from Enverus, both Drilling Info and Prism + The reason 2 datasets are used is because Prism does not include all states + So remaining states, or those with more DI coverage are taken from DI + + DI: KS, MD, MI, MO, OK, TN + + Prism: AK, AL, AR, AZ, CA, CAO (California Offshore), CO, FL, KY, LA, MS, MT, ND, + NE, NGOM (federal offshore waters in the Gulf of Mexico), NM, NV, NY, OH, OR, PA, + SD, TX, UT, VA, WV, WY + + States with no Enverus Data: CT, DE, DC, GA, HI, ID, IL*, IN*, IA, ME, MA, MN, NH, + NJ, NC, RI, SC, VT, WA, WI, US territories. These states are assumed to have no oil + and gas production with an exception for IL and IN. + + *IL and IN do not report to Enverus, but do have oil and gas production. Production + data is taken from the Energy Information Administration (EIA). + + """ + + # Load in State ANSI data + state_gdf = ( + gpd.read_file(state_path) + .loc[:, ["NAME", "STATEFP", "STUSPS", "geometry"]] + .rename(columns=str.lower) + .rename(columns={"stusps": "state_code", "name": "state_name"}) + .astype({"statefp": int}) + # get only lower 48 + DC + .query("(statefp < 60) & (statefp != 2) & (statefp != 15)") + .reset_index(drop=True) + .to_crs(4326) + ) + + # Make Annual gridded arrays (maps) of well data (a well will be counted every month if there is any production that year) + # Includes NA Gas Wells and Production onshore in the CONUS region + # source emissions are related to the presence of a well and its production status (no emission if no production) + # Details: ERG does not include a well in the national count if there is no (cummulative) oil or gas production from that well. 
    # Wells are not considered active for a given year if there is no production data that year
    # This may cause wells that are completed but not yet producing to be dropped from the national count.
    # ERG has developed their own logic to determine if a well is an HF well or not and that result is included in the
    # HF variable in this dataset. This method does not rely on the Enverus well 'Producing Status'.
    # Well Type (e.g., non-associated oil well) is determined based on annual production GOR at that well (CUM OIL/CUM GAS),
    # but the presence of a well will only be included in maps in months where monthly oil production > 0

    # Proxy Data Dataframes:
    drilled_well_df = pd.DataFrame()  # Oil wells drilled

    ## Enverus DI and Prism Data:
    # Read in and query formatted and corrected Enverus data to create dataframes of
    # proxy data (Enverus data is from task_enverus_di_prism_data_processing.py)
    for iyear in years:
        enverus_file_name_iyear = f"formatted_raw_enverus_tempoutput_{iyear}.csv"
        enverus_file_path_iyear = os.path.join(intermediate_outputs_path, enverus_file_name_iyear)
        oil_data_temp = (pd.read_csv(enverus_file_path_iyear, dtype={3: 'str', 'spud_year': str, 'first_prod_year': str})
                         .query("STATE_CODE.isin(@state_gdf['state_code'])")
                         .query("OFFSHORE == 'N'")
                         .query("CUM_OIL > 0")
                         .assign(gas_to_oil_ratio=lambda df: df['CUM_GAS'] / df['CUM_OIL'])
                         .assign(year=str(iyear))
                         .replace(np.inf, 0)
                         .astype({"spud_year": str, "first_prod_year": str})
                         .query("gas_to_oil_ratio <= 100")
                         .query("GOR_QUAL == 'Liq only' | GOR_QUAL == 'Liq+Gas'")
                         )

        # Include wells in map only for months where there is oil production (emissions ~ when production is occurring)
        for imonth in range(1, 13):
            imonth_str = f"{imonth:02}"  # convert to 2-digit months
            year_month_str = str(iyear) + '-' + imonth_str
            oil_prod_str = 'OILPROD_' + imonth_str
            # Onshore data for imonth
            oil_data_imonth_temp = (oil_data_temp
                                    .query(f"{oil_prod_str} > 0")
                                    .assign(year_month=year_month_str)
                                    )
            oil_data_imonth_temp = (oil_data_imonth_temp[[
                'year', 'year_month', 'STATE_CODE', 'AAPG_CODE_ERG', 'LATITUDE', 'LONGITUDE',
                'HF', 'WELL_COUNT', oil_prod_str,
                'comp_year_month', 'spud_year', 'first_prod_year']]
                )
            # Drilled Oil Wells
            drilled_well_imonth = (oil_data_imonth_temp[['year', 'year_month', 'STATE_CODE', 'LATITUDE', 'LONGITUDE', 'WELL_COUNT', 'HF', 'spud_year', 'first_prod_year']]
                                   .rename(columns=lambda x: str(x).lower())
                                   .rename(columns={"well_count": "proxy_data"})
                                   # wells with a spud date or first production date in the current year
                                   .query(f"spud_year == '{iyear}' | first_prod_year == '{iyear}'")
                                   # wells with spud_year == iyear or, if there is no spud date, first_prod_year == iyear
                                   .query(f"spud_year == '{iyear}' | spud_year == 'NaN' | spud_year == 'nan'")
                                   .drop(columns=['hf', 'spud_year', 'first_prod_year'])
                                   .reset_index(drop=True)
                                   )
            drilled_well_df = pd.concat([drilled_well_df, drilled_well_imonth])

    # Delete unused temp data
    del oil_data_temp
    del oil_data_imonth_temp
    del drilled_well_imonth

    # Calculate relative emissions and convert to a geodataframe
    drilled_well_df = calc_enverus_rel_emi(drilled_well_df)
    drilled_well_df = enverus_df_to_gdf(drilled_well_df)

    # NEI Data:
    nei_df = pd.DataFrame()

    for iyear in years:
        nei_data_year = nei_data_years[nei_data_years['year'] == iyear]['nei_data'].values[0]
        # Drilled (spud) well count
        ifile_name = get_nei_file_name(nei_data_year, oil_spud_count_file_names)
        nei_iyear = get_raw_NEI_data(iyear, nei_data_year, ifile_name)
        nei_df = pd.concat([nei_df, nei_iyear])

    # Convert NEI data to a GeoDataFrame and convert polygons to centroid points
    nei_df = gpd.GeoDataFrame(nei_df, crs=4326)
    nei_df = nei_df.to_crs(3857)  # projected CRS for centroid calculation
    nei_df.loc[:, 'geometry'] = nei_df.loc[:, 'geometry'].centroid
    nei_df = nei_df.to_crs(4326)

    # Add NEI Data to Enverus Data
    drilled_well_df = pd.concat([drilled_well_df, nei_df]).reset_index(drop=True)

    # Delete unused temp data
    del nei_iyear
    del nei_df

    # Check that relative emissions sum to 1.0 for each state/year combination
    sums = drilled_well_df.groupby(["state_code", "year"])["rel_emi"].sum()  # get sums to check normalization
    assert np.isclose(sums, 1.0, atol=1e-8).all(), f"Relative emissions do not sum to 1 for each year and state; {sums}"  # assert that the sums are close to 1

    # Output Proxy Parquet Files
    drilled_well_df = drilled_well_df.astype({'year': str})
    drilled_well_df.to_parquet(drilled_well_output_path)

    return None
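Note: each of these proxies converts NEI polygon records to points by projecting to a metric CRS before taking centroids; GeoPandas warns (and returns slightly skewed results) if .centroid is called in a geographic CRS such as EPSG:4326. A self-contained illustration with a toy polygon (not NEI data):

import geopandas as gpd
from shapely.geometry import Polygon

# 1-degree square in lon/lat; the centroid is taken in EPSG:3857, then converted back.
square = Polygon([(-100, 40), (-99, 40), (-99, 41), (-100, 41)])
gdf = gpd.GeoDataFrame(geometry=[square], crs=4326)
centroids = gdf.to_crs(3857).centroid.to_crs(4326)
print(centroids.iloc[0])  # POINT near (-99.5, 40.5)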
diff --git a/gch4i/proxy_processing/task_oil_hf_well_comp_proxy.py b/gch4i/proxy_processing/task_oil_hf_well_comp_proxy.py
new file mode 100644
index 0000000..5b4ab30
--- /dev/null
+++ b/gch4i/proxy_processing/task_oil_hf_well_comp_proxy.py
@@ -0,0 +1,180 @@
# %%
from pathlib import Path
import os
from typing import Annotated
from zipfile import ZipFile
import calendar
import datetime

from pyarrow import parquet
import pandas as pd
import osgeo
import geopandas as gpd
import numpy as np
import seaborn as sns
import shapefile as shp
from pytask import Product, task, mark

from gch4i.config import (
    V3_DATA_PATH,
    proxy_data_dir_path,
    global_data_dir_path,
    sector_data_dir_path,
    max_year,
    min_year,
    years,
)

from gch4i.utils import us_state_to_abbrev
from gch4i.proxy_processing.ng_oil_production_utils import (
    calc_enverus_rel_emi,
    enverus_df_to_gdf,
    nei_data_years,
    get_nei_file_name,
    oil_comp_count_file_names,
    get_raw_NEI_data,
)

# %%
@mark.persist
@task(id="oil_hf_well_comp_proxy")
def task_get_oil_hf_well_comp_proxy_data(
    state_path: Path = global_data_dir_path / "tl_2020_us_state.zip",
    enverus_production_path: Path = sector_data_dir_path / "enverus/production",
    intermediate_outputs_path: Path = enverus_production_path / "intermediate_outputs",
    nei_path: Path = sector_data_dir_path / "nei_og",
    hf_well_comp_output_path: Annotated[Path, Product] = proxy_data_dir_path / "oil_hf_well_comp_proxy.parquet",
    ):
    """
    Data come from Enverus, both Drilling Info (DI) and Prism. Two datasets are used
    because Prism does not include all states, so the remaining states, or those with
    better DI coverage, are taken from DI.

    DI: KS, MD, MI, MO, OK, TN

    Prism: AK, AL, AR, AZ, CA, CAO (California Offshore), CO, FL, KY, LA, MS, MT, ND,
    NE, NGOM (federal offshore waters in the Gulf of Mexico), NM, NV, NY, OH, OR, PA,
    SD, TX, UT, VA, WV, WY

    States with no Enverus Data: CT, DE, DC, GA, HI, ID, IL*, IN*, IA, ME, MA, MN, NH,
    NJ, NC, RI, SC, VT, WA, WI, US territories. These states are assumed to have no oil
    and gas production, with an exception for IL and IN.

    *IL and IN do not report to Enverus but do have oil and gas production. Production
    data is taken from the Energy Information Administration (EIA).
    """

    # Load in State ANSI data
    state_gdf = (
        gpd.read_file(state_path)
        .loc[:, ["NAME", "STATEFP", "STUSPS", "geometry"]]
        .rename(columns=str.lower)
        .rename(columns={"stusps": "state_code", "name": "state_name"})
        .astype({"statefp": int})
        # get only lower 48 + DC
        .query("(statefp < 60) & (statefp != 2) & (statefp != 15)")
        .reset_index(drop=True)
        .to_crs(4326)
    )

    # Make annual gridded arrays (maps) of well data (a well will be counted every month if there is any production that year)
    # Includes NA Oil Wells and Production onshore in the CONUS region
    # source emissions are related to the presence of a well and its production status (no emission if no production)
    # Details: ERG does not include a well in the national count if there is no (cumulative) oil or gas production from that well.
    # Wells are not considered active for a given year if there is no production data that year
    # This may cause wells that are completed but not yet producing to be dropped from the national count.
    # ERG has developed their own logic to determine if a well is an HF well or not and that result is included in the
    # HF variable in this dataset. This method does not rely on the Enverus well 'Producing Status'.
    # Well Type (e.g., non-associated oil well) is determined based on annual production GOR at that well (CUM OIL/CUM GAS),
    # but the presence of a well will only be included in maps in months where monthly oil production > 0

    # Proxy Data Dataframes:
    hf_well_comp_df = pd.DataFrame()  # HF well completions

    ## Enverus DI and Prism Data:
    # Read in and query formatted and corrected Enverus data to create dataframes of
    # proxy data (Enverus data is from task_enverus_di_prism_data_processing.py)
    for iyear in years:
        enverus_file_name_iyear = f"formatted_raw_enverus_tempoutput_{iyear}.csv"
        enverus_file_path_iyear = os.path.join(intermediate_outputs_path, enverus_file_name_iyear)
        oil_data_temp = (pd.read_csv(enverus_file_path_iyear, dtype={3: 'str', 'spud_year': str, 'first_prod_year': str})
                         .query("STATE_CODE.isin(@state_gdf['state_code'])")
                         .query("OFFSHORE == 'N'")
                         .query("CUM_OIL > 0")
                         .assign(gas_to_oil_ratio=lambda df: df['CUM_GAS'] / df['CUM_OIL'])
                         .assign(year=str(iyear))
                         .replace(np.inf, 0)
                         .astype({"spud_year": str, "first_prod_year": str})
                         .query("gas_to_oil_ratio <= 100")
                         .query("GOR_QUAL == 'Liq only' | GOR_QUAL == 'Liq+Gas'")
                         )

        # Include wells in map only for months where there is oil production (emissions ~ when production is occurring)
        for imonth in range(1, 13):
            imonth_str = f"{imonth:02}"  # convert to 2-digit months
            year_month_str = str(iyear) + '-' + imonth_str
            oil_prod_str = 'OILPROD_' + imonth_str
            # Onshore data for imonth
            oil_data_imonth_temp = (oil_data_temp
                                    .query(f"{oil_prod_str} > 0")
                                    .assign(year_month=year_month_str)
                                    )
            oil_data_imonth_temp = (oil_data_imonth_temp[[
                'year', 'year_month', 'STATE_CODE', 'AAPG_CODE_ERG', 'LATITUDE', 'LONGITUDE',
                'HF', 'WELL_COUNT', oil_prod_str,
                'comp_year_month', 'spud_year', 'first_prod_year']]
                )
            # HF Well Completions
            hf_well_comp_imonth = (oil_data_imonth_temp[['year', 'year_month', 'STATE_CODE', 'LATITUDE', 'LONGITUDE', 'WELL_COUNT', 'HF', 'comp_year_month']]
                                   .query("HF == 'Y'")
                                   .drop(columns=["HF"])
                                   .rename(columns=lambda x: str(x).lower())
                                   .rename(columns={"well_count": "proxy_data"})
                                   .query(f"comp_year_month == '{year_month_str}'")
                                   .drop(columns=["comp_year_month"])
                                   .reset_index(drop=True)
                                   )
            hf_well_comp_df = pd.concat([hf_well_comp_df, hf_well_comp_imonth])
    # Delete unused temp data
    del oil_data_temp
    del oil_data_imonth_temp
    del hf_well_comp_imonth

    # Calculate relative emissions and convert to a geodataframe
    hf_well_comp_df = calc_enverus_rel_emi(hf_well_comp_df)
    hf_well_comp_df = enverus_df_to_gdf(hf_well_comp_df)

    # NEI Data:
    nei_df = pd.DataFrame()

    for iyear in years:
        nei_data_year = nei_data_years[nei_data_years['year'] == iyear]['nei_data'].values[0]
        # Completion Count
        ifile_name = get_nei_file_name(nei_data_year, oil_comp_count_file_names)
        nei_iyear = get_raw_NEI_data(iyear, nei_data_year, ifile_name)
        nei_df = pd.concat([nei_df, nei_iyear])

    # Convert NEI data to a GeoDataFrame and convert polygons to centroid points
    nei_df = gpd.GeoDataFrame(nei_df, crs=4326)
    nei_df = nei_df.to_crs(3857)  # projected CRS for centroid calculation
    nei_df.loc[:, 'geometry'] = nei_df.loc[:, 'geometry'].centroid
    nei_df = nei_df.to_crs(4326)

    # Add NEI Data to Enverus Data
    hf_well_comp_df = pd.concat([hf_well_comp_df, nei_df]).reset_index(drop=True)

    # Delete unused temp data
    del nei_iyear
    del nei_df

    # Check that relative emissions sum to 1.0 for each state/year combination
    sums = hf_well_comp_df.groupby(["state_code", "year"])["rel_emi"].sum()  # get sums to check normalization
    assert np.isclose(sums, 1.0, atol=1e-8).all(), f"Relative emissions do not sum to 1 for each year and state; {sums}"  # assert that the sums are close to 1

    # Output Proxy Parquet Files
    hf_well_comp_df = hf_well_comp_df.astype({'year': str})
    hf_well_comp_df.to_parquet(hf_well_comp_output_path)

    return None
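Note: unlike the count proxies, the completion proxy keeps a well only in the single month matching its comp_year_month string, so each completion contributes once rather than in every producing month. A toy illustration of the same string filter (column values are made up):

import pandas as pd

wells = pd.DataFrame({
    "proxy_data": [1, 1, 1],
    "comp_year_month": ["2020-03", "2020-03", "2019-11"],
})
# Mirrors the .query(f"comp_year_month == '{year_month_str}'") step above.
year_month_str = "2020-03"
print(wells.query(f"comp_year_month == '{year_month_str}'"))  # keeps the first two rows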
diff --git a/gch4i/proxy_processing/task_oil_hf_well_count_proxy.py b/gch4i/proxy_processing/task_oil_hf_well_count_proxy.py
new file mode 100644
index 0000000..3d17a08
--- /dev/null
+++ b/gch4i/proxy_processing/task_oil_hf_well_count_proxy.py
@@ -0,0 +1,177 @@
# %%
from pathlib import Path
import os
from typing import Annotated
from zipfile import ZipFile
import calendar
import datetime

from pyarrow import parquet
import pandas as pd
import osgeo
import geopandas as gpd
import numpy as np
import seaborn as sns
import shapefile as shp
from pytask import Product, task, mark

from gch4i.config import (
    V3_DATA_PATH,
    proxy_data_dir_path,
    global_data_dir_path,
    sector_data_dir_path,
    max_year,
    min_year,
    years,
)

from gch4i.utils import us_state_to_abbrev
from gch4i.proxy_processing.ng_oil_production_utils import (
    calc_enverus_rel_emi,
    enverus_df_to_gdf,
    nei_data_years,
    get_nei_file_name,
    oil_well_count_file_names,
    get_raw_NEI_data,
)

# %%
@mark.persist
@task(id="oil_hf_well_count_proxy")
def task_get_oil_hf_well_count_proxy_data(
    state_path: Path = global_data_dir_path / "tl_2020_us_state.zip",
    enverus_production_path: Path = sector_data_dir_path / "enverus/production",
    intermediate_outputs_path: Path = enverus_production_path / "intermediate_outputs",
    nei_path: Path = sector_data_dir_path / "nei_og",
    hf_well_count_output_path: Annotated[Path, Product] = proxy_data_dir_path / "oil_hf_well_count_proxy.parquet",
    ):
    """
    Data come from Enverus, both Drilling Info (DI) and Prism. Two datasets are used
    because Prism does not include all states, so the remaining states, or those with
    better DI coverage, are taken from DI.

    DI: KS, MD, MI, MO, OK, TN

    Prism: AK, AL, AR, AZ, CA, CAO (California Offshore), CO, FL, KY, LA, MS, MT, ND,
    NE, NGOM (federal offshore waters in the Gulf of Mexico), NM, NV, NY, OH, OR, PA,
    SD, TX, UT, VA, WV, WY

    States with no Enverus Data: CT, DE, DC, GA, HI, ID, IL*, IN*, IA, ME, MA, MN, NH,
    NJ, NC, RI, SC, VT, WA, WI, US territories. These states are assumed to have no oil
    and gas production, with an exception for IL and IN.

    *IL and IN do not report to Enverus but do have oil and gas production. Production
    data is taken from the Energy Information Administration (EIA).
    """

    # Load in State ANSI data
    state_gdf = (
        gpd.read_file(state_path)
        .loc[:, ["NAME", "STATEFP", "STUSPS", "geometry"]]
        .rename(columns=str.lower)
        .rename(columns={"stusps": "state_code", "name": "state_name"})
        .astype({"statefp": int})
        # get only lower 48 + DC
        .query("(statefp < 60) & (statefp != 2) & (statefp != 15)")
        .reset_index(drop=True)
        .to_crs(4326)
    )

    # Make annual gridded arrays (maps) of well data (a well will be counted every month if there is any production that year)
    # Includes NA Wells and Production onshore in the CONUS region
    # source emissions are related to the presence of a well and its production status (no emission if no production)
    # Details: ERG does not include a well in the national count if there is no (cumulative) oil or gas production from that well.
    # Wells are not considered active for a given year if there is no production data that year
    # This may cause wells that are completed but not yet producing to be dropped from the national count.
    # ERG has developed their own logic to determine if a well is an HF well or not and that result is included in the
    # HF variable in this dataset. This method does not rely on the Enverus well 'Producing Status'.
    # Well Type (e.g., non-associated oil well) is determined based on annual production GOR at that well (CUM OIL/CUM GAS),
    # but the presence of a well will only be included in maps in months where monthly oil production > 0

    # Proxy Data Dataframes:
    hf_well_count_df = pd.DataFrame()  # Active HF well counts in a given month

    # Read in and query formatted and corrected Enverus data to create dataframes of
    # proxy data (Enverus data is from task_enverus_di_prism_data_processing.py)
    for iyear in years:
        enverus_file_name_iyear = f"formatted_raw_enverus_tempoutput_{iyear}.csv"
        enverus_file_path_iyear = os.path.join(intermediate_outputs_path, enverus_file_name_iyear)
        oil_data_temp = (pd.read_csv(enverus_file_path_iyear, dtype={3: 'str', 'spud_year': str, 'first_prod_year': str})
                         .query("STATE_CODE.isin(@state_gdf['state_code'])")
                         .query("OFFSHORE == 'N'")
                         .query("CUM_OIL > 0")
                         .assign(gas_to_oil_ratio=lambda df: df['CUM_GAS'] / df['CUM_OIL'])
                         .assign(year=str(iyear))
                         .replace(np.inf, 0)
                         .astype({"spud_year": str, "first_prod_year": str})
                         .query("gas_to_oil_ratio <= 100")
                         .query("GOR_QUAL == 'Liq only' | GOR_QUAL == 'Liq+Gas'")
                         )

        # Include wells in map only for months where there is oil production (emissions ~ when production is occurring)
        for imonth in range(1, 13):
            imonth_str = f"{imonth:02}"  # convert to 2-digit months
            year_month_str = str(iyear) + '-' + imonth_str
            oil_prod_str = 'OILPROD_' + imonth_str
            # Onshore data for imonth
            oil_data_imonth_temp = (oil_data_temp
                                    .query(f"{oil_prod_str} > 0")
                                    .assign(year_month=year_month_str)
                                    )
            oil_data_imonth_temp = (oil_data_imonth_temp[[
                'year', 'year_month', 'STATE_CODE', 'AAPG_CODE_ERG', 'LATITUDE', 'LONGITUDE',
                'HF', 'WELL_COUNT', oil_prod_str,
                'comp_year_month', 'spud_year', 'first_prod_year']]
                )
            # HF Well Count
            hf_well_count_imonth = (oil_data_imonth_temp[['year', 'year_month', 'STATE_CODE', 'LATITUDE', 'LONGITUDE', 'WELL_COUNT', 'HF']]
                                    .query("HF == 'Y'")
                                    .drop(columns=["HF"])
                                    .rename(columns=lambda x: str(x).lower())
                                    .rename(columns={"well_count": "proxy_data"})
                                    .reset_index(drop=True)
                                    )
            hf_well_count_df = pd.concat([hf_well_count_df, hf_well_count_imonth])

    # Delete unused temp data
    del oil_data_temp
    del oil_data_imonth_temp
    del hf_well_count_imonth

    # Calculate relative emissions and convert to a geodataframe
    hf_well_count_df = calc_enverus_rel_emi(hf_well_count_df)
    hf_well_count_df = enverus_df_to_gdf(hf_well_count_df)

    # NEI Data:
    nei_df = pd.DataFrame()

    for iyear in years:
        nei_data_year = nei_data_years[nei_data_years['year'] == iyear]['nei_data'].values[0]
        # Well Count
        ifile_name = get_nei_file_name(nei_data_year, oil_well_count_file_names)
        nei_iyear = get_raw_NEI_data(iyear, nei_data_year, ifile_name)
        nei_df = pd.concat([nei_df, nei_iyear])

    # Convert NEI data to a GeoDataFrame and convert polygons to centroid points
    nei_df = gpd.GeoDataFrame(nei_df, crs=4326)
    nei_df = nei_df.to_crs(3857)  # projected CRS for centroid calculation
    nei_df.loc[:, 'geometry'] = nei_df.loc[:, 'geometry'].centroid
    nei_df = nei_df.to_crs(4326)

    # Add NEI Data to Enverus Data
    hf_well_count_df = pd.concat([hf_well_count_df, nei_df]).reset_index(drop=True)

    # Delete unused temp data
    del nei_iyear
    del nei_df

    # Check that relative emissions sum to 1.0 for each state/year combination
    sums = hf_well_count_df.groupby(["state_code", "year"])["rel_emi"].sum()  # get sums to check normalization
    assert np.isclose(sums, 1.0, atol=1e-8).all(), f"Relative emissions do not sum to 1 for each year and state; {sums}"  # assert that the sums are close to 1

    # Output Proxy Parquet Files
    hf_well_count_df = hf_well_count_df.astype({'year': str})
    hf_well_count_df.to_parquet(hf_well_count_output_path)

    return None
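Note: every file in this patch screens wells the same way: cumulative gas-to-oil ratio (CUM_GAS / CUM_OIL) at or below 100. The CUM_OIL > 0 filter runs first, so the .replace(np.inf, 0) is a belt-and-braces guard against division by zero. A toy check of the same expression (the values below are made up):

import numpy as np
import pandas as pd

wells = pd.DataFrame({"CUM_OIL": [1000.0, 10.0], "CUM_GAS": [50_000.0, 5_000.0]})
wells["gas_to_oil_ratio"] = (wells["CUM_GAS"] / wells["CUM_OIL"]).replace(np.inf, 0)
print(wells.query("gas_to_oil_ratio <= 100"))  # keeps the first well (GOR 50), drops the second (GOR 500)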
diff --git a/gch4i/proxy_processing/task_oil_water_prod_proxy.py b/gch4i/proxy_processing/task_oil_water_prod_proxy.py
new file mode 100644
index 0000000..c9fb4a2
--- /dev/null
+++ b/gch4i/proxy_processing/task_oil_water_prod_proxy.py
@@ -0,0 +1,195 @@
# %%
from pathlib import Path
import os
from typing import Annotated
from zipfile import ZipFile
import calendar
import datetime

from pyarrow import parquet
import pandas as pd
import osgeo
import geopandas as gpd
import numpy as np
import seaborn as sns
import shapefile as shp
from pytask import Product, task, mark

from gch4i.config import (
    V3_DATA_PATH,
    proxy_data_dir_path,
    global_data_dir_path,
    sector_data_dir_path,
    max_year,
    min_year,
    years,
)

from gch4i.utils import us_state_to_abbrev
from gch4i.proxy_processing.ng_oil_production_utils import (
    calc_enverus_rel_emi,
    enverus_df_to_gdf,
    nei_data_years,
    get_nei_file_name,
    oil_water_prod_file_names,
    get_raw_NEI_data,
)

# %%
@mark.persist
@task(id="oil_water_prod_proxy")
def task_get_oil_water_prod_proxy_data(
    state_path: Path = global_data_dir_path / "tl_2020_us_state.zip",
    enverus_production_path: Path = sector_data_dir_path / "enverus/production",
    intermediate_outputs_path: Path = enverus_production_path / "intermediate_outputs",
    nei_path: Path = sector_data_dir_path / "nei_og",
    water_prod_output_path: Annotated[Path, Product] = proxy_data_dir_path / "oil_water_prod_proxy.parquet",
    ):
    """
    Data come from Enverus, both Drilling Info (DI) and Prism. Two datasets are used
    because Prism does not include all states, so the remaining states, or those with
    better DI coverage, are taken from DI.

    DI: KS, MD, MI, MO, OK, TN

    Prism: AK, AL, AR, AZ, CA, CAO (California Offshore), CO, FL, KY, LA, MS, MT, ND,
    NE, NGOM (federal offshore waters in the Gulf of Mexico), NM, NV, NY, OH, OR, PA,
    SD, TX, UT, VA, WV, WY

    States with no Enverus Data: CT, DE, DC, GA, HI, ID, IL*, IN*, IA, ME, MA, MN, NH,
    NJ, NC, RI, SC, VT, WA, WI, US territories. These states are assumed to have no oil
    and gas production, with an exception for IL and IN.

    *IL and IN do not report to Enverus but do have oil and gas production. Production
    data is taken from the Energy Information Administration (EIA).
    """

    # Load in State ANSI data
    state_gdf = (
        gpd.read_file(state_path)
        .loc[:, ["NAME", "STATEFP", "STUSPS", "geometry"]]
        .rename(columns=str.lower)
        .rename(columns={"stusps": "state_code", "name": "state_name"})
        .astype({"statefp": int})
        # get only lower 48 + DC
        .query("(statefp < 60) & (statefp != 2) & (statefp != 15)")
        .reset_index(drop=True)
        .to_crs(4326)
    )

    # Make annual gridded arrays (maps) of well data (a well will be counted every month if there is any production that year)
    # Includes NA Oil Wells and Production onshore in the CONUS region
    # source emissions are related to the presence of a well and its production status (no emission if no production)
    # Details: ERG does not include a well in the national count if there is no (cumulative) oil or gas production from that well.
    # Wells are not considered active for a given year if there is no production data that year
    # This may cause wells that are completed but not yet producing to be dropped from the national count.
    # ERG has developed their own logic to determine if a well is an HF well or not and that result is included in the
    # HF variable in this dataset. This method does not rely on the Enverus well 'Producing Status'.
    # Well Type (e.g., non-associated oil well) is determined based on annual production GOR at that well (CUM OIL/CUM GAS),
    # but the presence of a well will only be included in maps in months where monthly oil production > 0

    # Proxy Data Dataframes:
    water_prod_df = pd.DataFrame()

    ## Enverus DI and Prism Data:
    # Read in and query formatted and corrected Enverus data to create dataframes of
    # proxy data (Enverus data is from task_enverus_di_prism_data_processing.py)
    for iyear in years:
        enverus_file_name_iyear = f"formatted_raw_enverus_tempoutput_{iyear}.csv"
        enverus_file_path_iyear = os.path.join(intermediate_outputs_path, enverus_file_name_iyear)
        oil_data_temp = (pd.read_csv(enverus_file_path_iyear, dtype={3: 'str', 'spud_year': str, 'first_prod_year': str})
                         .query("STATE_CODE.isin(@state_gdf['state_code'])")
                         .query("OFFSHORE == 'N'")
                         .query("CUM_OIL > 0")
                         .assign(gas_to_oil_ratio=lambda df: df['CUM_GAS'] / df['CUM_OIL'])
                         .assign(year=str(iyear))
                         .replace(np.inf, 0)
                         .astype({"spud_year": str, "first_prod_year": str})
                         .query("gas_to_oil_ratio <= 100")
                         .query("GOR_QUAL == 'Liq only' | GOR_QUAL == 'Liq+Gas'")
                         )

        # Include wells in map only for months where there is oil production (emissions ~ when production is occurring)
        for imonth in range(1, 13):
            imonth_str = f"{imonth:02}"  # convert to 2-digit months
            year_month_str = str(iyear) + '-' + imonth_str
            oil_prod_str = 'OILPROD_' + imonth_str
            water_prod_str = 'WATERPROD_' + imonth_str
            # Onshore data for imonth
            oil_data_imonth_temp = (oil_data_temp
                                    .query(f"{oil_prod_str} > 0")
                                    .assign(year_month=year_month_str)
                                    )
            oil_data_imonth_temp = (oil_data_imonth_temp[[
                'year', 'year_month', 'STATE_CODE', 'AAPG_CODE_ERG', 'LATITUDE', 'LONGITUDE',
                'HF', 'WELL_COUNT', oil_prod_str, water_prod_str,
                'comp_year_month', 'spud_year', 'first_prod_year']]
                )
            # Water Production
            # Data source by state is defined in the Enverus DrillingInfo Processing - Produced
            # Water_2023-11-14_forGridding.xlsx file.
            if iyear < 2016:  # WV uses NEI data
                water_prod_enverus_states = ['AK', 'AL', 'AR', 'AZ', 'CA', 'CO', 'FL', 'LA',
                                             'MI', 'MO', 'MS', 'MT', 'ND', 'NE', 'NM', 'NV',
                                             'NY', 'OH', 'SD', 'TX', 'UT', 'VA', 'WY'
                                             ]
                # States using NEI for reference: ['IL','IN','KS','OK','PA','WV']
            else:  # 2016 and beyond; WV uses Enverus data
                water_prod_enverus_states = ['AK', 'AL', 'AR', 'AZ', 'CA', 'CO', 'FL', 'LA',
                                             'MI', 'MO', 'MS', 'MT', 'ND', 'NE', 'NM', 'NV',
                                             'NY', 'OH', 'SD', 'TX', 'UT', 'VA', 'WY', 'WV'
                                             ]  # WV uses Enverus
                # States using NEI for reference: ['IL','IN','KS','OK','PA']
            # Enverus water production for applicable states (NEI water production will
            # be added in the NEI section of the code below)
            water_prod_imonth = (oil_data_imonth_temp[['year', 'year_month', 'STATE_CODE', 'LATITUDE', 'LONGITUDE', water_prod_str]]
                                 .query("STATE_CODE.isin(@water_prod_enverus_states)")
                                 .assign(proxy_data=lambda df: df[water_prod_str])
                                 .drop(columns=[water_prod_str])
                                 .rename(columns=lambda x: str(x).lower())
                                 .reset_index(drop=True)
                                 )
            water_prod_df = pd.concat([water_prod_df, water_prod_imonth])

    # Delete unused temp data
    del oil_data_temp
    del oil_data_imonth_temp
    del water_prod_imonth

    # Calculate relative emissions and convert to a geodataframe
    water_prod_df = calc_enverus_rel_emi(water_prod_df)
    water_prod_df = enverus_df_to_gdf(water_prod_df)

    # NEI Data:
    nei_df = pd.DataFrame()

    for iyear in years:
        nei_data_year = nei_data_years[nei_data_years['year'] == iyear]['nei_data'].values[0]
        # Water Production
        ifile_name = get_nei_file_name(nei_data_year, oil_water_prod_file_names)
        nei_iyear = get_raw_NEI_data(iyear, nei_data_year, ifile_name)
        nei_df = pd.concat([nei_df, nei_iyear])

    # Convert NEI data to a GeoDataFrame and convert polygons to centroid points
    nei_df = gpd.GeoDataFrame(nei_df, crs=4326)
    nei_df = nei_df.to_crs(3857)  # projected CRS for centroid calculation
    nei_df.loc[:, 'geometry'] = nei_df.loc[:, 'geometry'].centroid
    nei_df = nei_df.to_crs(4326)

    # Add NEI Data to Enverus Data
    water_prod_df = pd.concat([water_prod_df, nei_df]).reset_index(drop=True)

    # Delete unused temp data
    del nei_iyear
    del nei_df

    # Check that relative emissions sum to 1.0 for each state/year combination
    sums = water_prod_df.groupby(["state_code", "year"])["rel_emi"].sum()  # get sums to check normalization
    assert np.isclose(sums, 1.0, atol=1e-8).all(), f"Relative emissions do not sum to 1 for each year and state; {sums}"  # assert that the sums are close to 1

    # Output Proxy Parquet Files
    water_prod_df = water_prod_df.astype({'year': str})
    water_prod_df.to_parquet(water_prod_output_path)

    return None
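Note: the pre-/post-2016 branch above differs only in whether WV appears in the Enverus list. A small helper expressing the same switch (the function name is illustrative; the state list and the 2016 cutover are copied from the code above):

def water_prod_enverus_states_for(year: int) -> list:
    # WV water production comes from NEI before 2016 and from Enverus from 2016 on.
    base = ['AK', 'AL', 'AR', 'AZ', 'CA', 'CO', 'FL', 'LA',
            'MI', 'MO', 'MS', 'MT', 'ND', 'NE', 'NM', 'NV',
            'NY', 'OH', 'SD', 'TX', 'UT', 'VA', 'WY']
    return base + ['WV'] if year >= 2016 else base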
diff --git a/gch4i/proxy_processing/task_oil_well_avg_proxy.py b/gch4i/proxy_processing/task_oil_well_avg_proxy.py
new file mode 100644
index 0000000..5336f4f
--- /dev/null
+++ b/gch4i/proxy_processing/task_oil_well_avg_proxy.py
@@ -0,0 +1,69 @@
# %%
from pathlib import Path
from typing import Annotated

from pyarrow import parquet
import pandas as pd
import osgeo
import geopandas as gpd
import numpy as np
import seaborn as sns
from pytask import Product, task, mark

from gch4i.config import (
    V3_DATA_PATH,
    proxy_data_dir_path,
)

# %%
@mark.persist
@task(id="oil_well_avg_proxy")
def task_get_oil_well_avg_proxy_data(
    oil_all_well_prod_proxy_path: Path = proxy_data_dir_path / "oil_all_well_prod_proxy.parquet",  # input, not a Product
    oil_well_avg_output_path: Annotated[Path, Product] = proxy_data_dir_path / "oil_well_avg_proxy.parquet",
    ):
    """
    This proxy is the weighted average of the well count proxy and the oil production
    proxy for all wells. 50% of the relative emission is assigned based on the
    individual well's well count, and 50% of the relative emission is assigned based
    on the individual well's oil production.

    This file takes the relative emissions based on oil production from the
    oil_all_well_prod_proxy, adds a new relative emission column for well count under
    the assumption that each location has WELL_COUNT = 1, and takes the weighted
    average of the two relative emission types to create a new average proxy.
    """

    # Read in the oil production proxy and assign well count
    well_avg_gdf = (gpd.read_parquet(oil_all_well_prod_proxy_path)
                    .rename(columns={'rel_emi': 'prod_rel_emi'})
                    .assign(well_count=1.0)
                    )

    # Convert well count into a relative emission where each state/year combination sums to 1
    well_avg_gdf['count_rel_emi'] = well_avg_gdf.groupby(['state_code', 'year'])['well_count'].transform(lambda x: x / x.sum() if x.sum() > 0 else 0)
    well_avg_gdf = well_avg_gdf.drop(columns='well_count')

    # Check that relative emissions sum to 1.0 for each state/year combination
    prod_sums = well_avg_gdf.groupby(["state_code", "year"])["prod_rel_emi"].sum()  # get sums to check normalization
    assert np.isclose(prod_sums, 1.0, atol=1e-8).all(), f"Relative emissions do not sum to 1 for each year and state; {prod_sums}"  # assert that the sums are close to 1

    count_sums = well_avg_gdf.groupby(["state_code", "year"])["count_rel_emi"].sum()  # get sums to check normalization
    assert np.isclose(count_sums, 1.0, atol=1e-8).all(), f"Relative emissions do not sum to 1 for each year and state; {count_sums}"  # assert that the sums are close to 1

    # Create the average relative emission with 50% weights
    well_avg_gdf = (well_avg_gdf
                    .assign(rel_emi=lambda df: 0.5 * (df['prod_rel_emi'] + df['count_rel_emi']))
                    .drop(columns=['prod_rel_emi', 'count_rel_emi'])
                    )

    # Check that relative emissions sum to 1.0 for each state/year combination
    avg_sums = well_avg_gdf.groupby(["state_code", "year"])["rel_emi"].sum()  # get sums to check normalization
    assert np.isclose(avg_sums, 1.0, atol=1e-8).all(), f"Relative emissions do not sum to 1 for each year and state; {avg_sums}"  # assert that the sums are close to 1

    # Output Proxy Parquet Files
    well_avg_gdf = well_avg_gdf.astype({'year': str})
    well_avg_gdf.to_parquet(oil_well_avg_output_path)

    return None
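Note: because prod_rel_emi and count_rel_emi each sum to 1.0 within a state/year, any convex combination w*a + (1-w)*b also sums to w + (1-w) = 1, so the final assert holds by construction. A toy verification (values made up):

import pandas as pd

df = pd.DataFrame({"prod_rel_emi": [0.7, 0.3], "count_rel_emi": [0.5, 0.5]})
df["rel_emi"] = 0.5 * (df["prod_rel_emi"] + df["count_rel_emi"])
print(df["rel_emi"].sum())  # 1.0 (up to floating point)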