Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Feature/icap mme #166

Merged
merged 23 commits into from
Apr 11, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions environment-dev.yml
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ dependencies:
- xarray
#
# optional
- h5netcdf
- joblib
- lxml
- pyhdf <0.11
Expand Down
1 change: 1 addition & 0 deletions monetio/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,7 @@
"fv3chem",
"hysplit",
"hytraj",
"icap_mme",
"ncep_grib",
"pardump",
"prepchem",
Expand Down
4 changes: 3 additions & 1 deletion monetio/models/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
fv3chem,
hysplit,
hytraj,
icap_mme,
ncep_grib,
pardump,
prepchem,
Expand All @@ -26,9 +27,10 @@
"fv3chem",
"hysplit",
"hytraj",
"icap_mme",
"ncep_grib",
"prepchem",
"pardump",
"prepchem",
"raqms",
]

Expand Down
278 changes: 278 additions & 0 deletions monetio/models/icap_mme.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,278 @@
"""
ICAP-MME
bbakernoaa marked this conversation as resolved.
Show resolved Hide resolved

"`ICAP <https://aero.und.edu/atmos/icap/index.html>`__
(International Cooperative for Aerosol Prediction)
is an international forum for aerosol forecast centres,
remote sensing data providers and lead system developers
to share best practices and discuss pressing issues
facing the operational aerosol community."

The ICAP multi-model ensemble (ICAP-MME) is constructed from
the following aerosol forecast systems:

- ECMWF CAMS
- JMA MASINGAR
- NASA GEOS
- NRL NAAPS
- NOAA GEFS-Aerosols (NGAC before 2022-09)
- Météo‐France MOCAGE
- FMI SILAM
- UKMO dust forecast (Unified Model)
- BSC MONARCH

The C4C product is a "core four multi-model consensus product",
including members that use MODIS and other observational total AOD in assimilation:

- ECMWF CAMS
- JMA MASINGAR
- NASA GEOS
- NRL NAAPS

The data is 6-hourly, each file including multiple time steps (e.g. 21 or 25).

Online visualizer: https://usgodae.org/metools/ensemble/

The files are loaded from: https://usgodae.org/ftp/outgoing/nrl/ICAP-MME/
"""

import pandas as pd

valid_filetypes = ("MMC", "C4", "MME")
valid_data_vars = (
"modeaod550",
"dustaod550",
"pm",
"seasaltaod550",
"smokeaod550",
"totaldustaod550",
)


def build_urls(dates, filetype="MMC", data_var="dustaod550", *, verbose=True):
"""Construct URLs for downloading NEPS data.

Parameters
----------
dates : datetime-like or iterable of datetime-like
Dates to download data for.
filetype : {'MMC', 'C4', 'MME'}, optional
The first MME date is 2014-11-28, and the last 2022-09-11.
The first non-MME (C4 and MMC) is 2022-09-01, and these continue to the present.
data_var : {'modeaod550', 'dustaod550', 'pm', 'seasaltaod550', \
'smokeaod550', 'totaldustaod550'}, optional

Returns
-------
urls : pd.Series
fnames : pd.Series
"""

from collections.abc import Iterable

if isinstance(dates, Iterable):
dates = pd.DatetimeIndex(dates)
else:
dates = pd.DatetimeIndex([dates])

urls = []
fnames = []
if verbose:
print("Building ICAP-MME URLs...")
base_url = "https://usgodae.org/ftp/outgoing/nrl/ICAP-MME/"

for dt in dates:
fname = "icap_{}_{}_{}.nc".format(
dt.strftime(r"%Y%m%d%H"), filetype.upper(), data_var.lower()
)
url = base_url + dt.strftime(r"%Y/%Y%m/") + fname
urls.append(url)
fnames.append(fname)

# Note: files needed for comparison
urls = pd.Series(urls, index=None)
fnames = pd.Series(fnames, index=None)
return urls, fnames


def remote_file_exists(file_url, *, verbose=True):
import requests

r = requests.head(file_url)

if r.status_code == 200:
return True
else:
if verbose:
print(f"HTTP Error {r.status_code} - {r.reason}")
return False


def retrieve(url, fname, *, download=False, verbose=True):
"""Return BytesIO of the file at `url` or
download it to path `fname` if it doesn't exist.

Parameters
----------
url : str
fname : str or path-like
download : bool, optional
If True, download the file (if it doesn't already exist)
and return ``pathlib.Path``.
Otherwise, return ``io.BytesIO`` of the file in memory.

Returns
-------
io.BytesIO or pathlib.Path
The requested netCDF file.
"""
from io import BytesIO
from pathlib import Path

import requests

p = Path(fname).absolute()

if not download:
r = requests.get(url, stream=True)
r.raise_for_status()
return BytesIO(r.content)
else:
if not p.is_file():
if verbose:
print(f"Downloading {url} to {p.as_posix()}")
r = requests.get(url, stream=True)
r.raise_for_status()
with open(p, "wb") as f:
f.write(r.content)
else:
if verbose:
print(f"File Exists: {p.as_posix()}")
return p


def _check_file_url(url, *, verbose=True):
"""
Raises
------
ValueError
If the file URL HEAD request doesn't return 200,
with info about how to check available products / data vars for the date.
"""
if not remote_file_exists(url, verbose=verbose):
raise ValueError(
f"File does not exist on ICAP HTTPS server: {url}. "
f"Check {url[:url.index('icap_')]} to see the available "
"`product` and `data_var`s for this month."
)


def open_dataset(date, product="MMC", data_var="dustaod550", *, download=False, verbose=True):
"""
Parameters
----------
date : str or datetime-like
The date for which to open the dataset.
product : {'MMC', 'C4', 'MME'}, optional
The first MME date is 2014-11-28, and the last 2022-09-11.
The first non-MME (C4 and MMC) is 2022-09-01, and these continue to the present.
data_var : {'modeaod550', 'dustaod550', 'pm', 'seasaltaod550', \
'smokeaod550', 'totaldustaod550'}, optional
Note that which data variables are available
depends on the `product` selection and the date.
download : bool, optional
If True, use files on disk, downloading if necessary.
If False, download and load dataset in memory.

Returns
-------
xarray.Dataset

Raises
------
ValueError
If input parameters are invalid or a file does not exist on the server.
"""
import pandas as pd
import xarray as xr

if not isinstance(date, pd.Timestamp):
d = pd.to_datetime(date)
else:
d = date

if product.upper() not in valid_filetypes:
raise ValueError(f"Invalid input for 'product': Valid values are {valid_filetypes}.")

if data_var.lower() not in valid_data_vars:
raise ValueError(f"Invalid input for 'data_var': Valid values are {valid_data_vars}.")

urls, fnames = build_urls(d, filetype=product, data_var=data_var, verbose=verbose)
url = urls.values[0]
fname = fnames.values[0]
_check_file_url(url, verbose=verbose)
dset = xr.open_dataset(retrieve(url, fname, download=download, verbose=verbose))

return dset


def open_mfdataset(dates, product="MMC", data_var="dustaod550", *, download=False, verbose=True):
"""
.. note::
Depending on the selected product/variable and the provided dates,
the result may have overlapping times.
You may wish to apply ``.drop_duplicates("time", keep="last")``.

Parameters
----------
dates : iterable of datetime-like
The dates for which to open the dataset.
product : {'MMC', 'C4', 'MME'}, optional
The first MME date is 2014-11-28, and the last 2022-09-11.
The first non-MME (C4 and MMC) is 2022-09-01, and these continue to the present.
data_var : {'modeaod550', 'dustaod550', 'pm', 'seasaltaod550', \
'smokeaod550', 'totaldustaod550'}, optional
Note that which data variables are available
depends on the `product` selection and the date.
download : bool, optional
If True, use files on disk, downloading if necessary.
If False, download and load dataset in memory.
In this case, the files are fully loaded instead of being opened lazily.

Returns
-------
xarray.Dataset

Raises
------
ValueError
If input parameters are invalid or a file does not exist on the server.
"""
import pandas as pd
import xarray as xr

d = pd.DatetimeIndex(dates)

if product.upper() not in valid_filetypes:
raise ValueError(f"Invalid input for 'product': Valid values are {valid_filetypes}.")

if data_var.lower() not in valid_data_vars:
raise ValueError(f"Invalid input for 'data_var': Valid values are {valid_data_vars}.")

urls, fnames = build_urls(d, filetype=product, data_var=data_var, verbose=verbose)

if download is True:
paths = []
for url, fname in zip(urls, fnames):
_check_file_url(url, verbose=verbose)
paths.append(retrieve(url, fname, download=True, verbose=verbose))
dset = xr.open_mfdataset(paths, combine="nested", concat_dim="time")
else:
dsets = []
for url, fname in zip(urls, fnames):
_check_file_url(url, verbose=verbose)
o = retrieve(url, fname, download=False, verbose=verbose)
dsets.append(xr.open_dataset(o))
dset = xr.concat(dsets, dim="time")

return dset
60 changes: 60 additions & 0 deletions tests/test_icap_mme.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
import pytest

from monetio.models.icap_mme import open_dataset, open_mfdataset


def test_open_dataset_bad_date():
with pytest.raises(ValueError, match="File does not exist"):
open_dataset("1990-08-01")


def test_open_dataset_invalid_param():
date = "2019-08-01"

with pytest.raises(ValueError, match="Invalid input for 'product'"):
open_dataset(date, product="asdf")
open_mfdataset([date], product="asdf")

with pytest.raises(ValueError, match="Invalid input for 'data_var'"):
open_dataset(date, data_var="asdf")
open_mfdataset([date], data_var="asdf")


@pytest.mark.parametrize(
"date,product,data_var",
[
("2019-08-01", "MME", "totaldustaod550"),
("2024-02-01", "C4", "dustaod550"),
],
)
def test_open_dataset(tmp_path, monkeypatch, date, product, data_var):
ds = open_dataset(date, product=product, data_var=data_var, download=False)
assert set(ds.dims) == {"time", "lat", "lon"}

monkeypatch.chdir(tmp_path)
ds_dl = open_dataset(date, product=product, data_var=data_var, download=True)
assert len(sorted(tmp_path.glob("*.nc"))) == 1
assert set(ds_dl.dims) == {"time", "lat", "lon"}

assert ds_dl.equals(ds)


def test_open_mfdataset(tmp_path, monkeypatch):
dates = ["2023-08-01", "2023-08-02"]
product = "C4"
data_var = "dustaod550"

ds = open_mfdataset(dates, product=product, data_var=data_var, download=False)
assert set(ds.dims) == {"time", "lat", "lon"}
assert ds["dust_aod_mean"].chunks is None, "not Dask-backed"
assert (
~ds.time.to_series().duplicated(keep=False)
).sum() == 8, "all overlap except first and last day"

monkeypatch.chdir(tmp_path)
ds_dl = open_mfdataset(dates, product=product, data_var=data_var, download=True)
assert len(sorted(tmp_path.glob("*.nc"))) == 2
assert set(ds_dl.dims) == {"time", "lat", "lon"}
assert ds_dl["dust_aod_mean"].chunks is not None

assert ds_dl.equals(ds)