Merge branch 'main' into ian
ifenty authored Nov 13, 2024
2 parents 61eb644 + 5c7a29d commit 499c3ea
Showing 19 changed files with 72,700 additions and 362 deletions.
1,352 changes: 1,352 additions & 0 deletions AWS/NetCDF_to_JSON/generate_ecco_v4r4_jsons.ipynb

Large diffs are not rendered by default.

10 changes: 5 additions & 5 deletions README.md
@@ -1,8 +1,8 @@
 
 # ECCO Dataset Production
 
-Tools and utilities, per the initial release, "to turn raw model
-output into glorious self-describing granules for widespread
+Tools and utilities, as stated in the initial release, "to turn raw
+model output into glorious self-describing granules for widespread
 distribution".
 
 Original release by Duncan Bark, in collaboration with Ian Fenty,
@@ -20,7 +20,7 @@ output in [MITgcm file
 formats](https://mitgcm.readthedocs.io/en/latest/).
 
 For general distribution however (e.g., via
-[PO.DAAC](https://podaac.jpl.nasa.gov/), it's useful to convert these
+[PO.DAAC](https://podaac.jpl.nasa.gov/)), it's useful to convert these
 raw output files to more intuitive day/date-stamped snapshot, daily mean,
 and monthly mean files, with appropriate metadata, in both native and
 latitude-longitude grid formats. Though the operations themselves are
@@ -33,7 +33,7 @@ distributed and cloud-based workflows, a central theme of ECCO Dataset
 Production.
 
 Though ECCO Dataset Production is capable of operating on extremely
-large data collections, it's equally applicable to smaller, limited
+large data collections, it applies just as well to smaller, limited
 production data sets; in short, anything for which wider distribution
 in easier-to-process formats is desired.

@@ -64,7 +64,7 @@ Dataset Production is configured and run:
 Each will be discussed separately.
 
 
-### Configuring ECCO Dataset Production to Run Locally
+### Configuring ECCO Dataset Production to run locally
 
 Installing ECCO Dataset Production locally, with input and output to
 local storage, is perhaps the quickest and easiest way to get started
Expand Up @@ -95,19 +95,14 @@ def ecco_make_granule( task, cfg,
merged_variable_dataset_with_all_metadata.to_netcdf(
this_task['granule'], encoding=encoding)
else:

with tempfile.TemporaryDirectory() as tmpdir:
log.info('temporary directory created: %s ', tmpdir)

# temporary directory will self-destruct at end of with block
_src = os.path.basename(this_task['granule'])
_dest = this_task['granule']

merged_variable_dataset_with_all_metadata.to_netcdf(
os.path.join(tmpdir,_src), encoding=encoding)

log.info('uploading %s to %s', os.path.join(tmpdir,_src), _dest)
ecco_aws_s3_cp.aws_s3_cp( src=os.path.join(tmpdir,_src), dest=_dest, **kwargs)
# temporary directory will self-destruct at end of with block

log.info('... done')

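The `else` branch above stages each granule in a temporary directory and hands it to `ecco_aws_s3_cp.aws_s3_cp` for upload; the helper's internals are not shown in this diff. As a point of reference, a minimal, self-contained sketch of the same write-then-upload pattern, assuming boto3 is available; the function, bucket, and key names are hypothetical, not part of the package:

```python
# Sketch only: stages an xarray.Dataset as NetCDF in a temporary directory,
# then uploads it to S3. Assumes boto3; bucket and key names are hypothetical.
import os
import tempfile

import boto3
import xarray as xr

def upload_dataset_as_netcdf(ds: xr.Dataset, bucket: str, key: str) -> None:
    with tempfile.TemporaryDirectory() as tmpdir:
        # the temporary directory self-destructs at the end of the with
        # block, mirroring the comment in the diff above
        local_file = os.path.join(tmpdir, os.path.basename(key))
        ds.to_netcdf(local_file)
        boto3.client('s3').upload_file(local_file, bucket, key)

# hypothetical usage:
# upload_dataset_as_netcdf(ds, 'ecco-processing', 'V4r4/SSH/SSH_mon_mean.nc')
```
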
54 changes: 36 additions & 18 deletions processing/src/ecco_dataset_production/ecco_grid.py
@@ -3,6 +3,7 @@
 
 import fnmatch
 import glob
+import logging
 import numpy as np
 import os
 import tarfile
@@ -18,6 +19,9 @@
 ZIPFILE_GLOBSTR = '*.gz'
 
 
+log = logging.getLogger('edp.'+__name__)
+
+
 class ECCOGrid(object):
     """Container class for ECCO grid access. Primarily intended to optimize i/o
     performance by allowing operations, e.g. collections of ECCOMDSDataset
@@ -77,9 +81,9 @@ def __init__( self, task=None, grid_loc=None, **kwargs):
         """
         self.task = None
-        self.__latlon_grid = None
-        self.__native_grid = None
-        self.__native_wet_point_indices = None
+        self._latlon_grid = None
+        self._native_grid = None
+        self._native_wet_point_indices = None
 
         if task:
             if not isinstance(task,ecco_task.ECCOTask):
@@ -123,24 +127,38 @@ def __init__( self, task=None, grid_loc=None, **kwargs):
 
     @property
     def latlon_grid(self):
         """Returns latlon grid xarray.Dataset instance, if found. Raises
         RuntimeError exception if not.
         """
-        if not self.__latlon_grid:
-            self.__latlon_grid = xr.open_dataset(
-                glob.glob(os.path.join(self.grid_dir,NETCDF_LATLON_GLOBSTR))[0],
-                chunks='auto')
-        return self.__latlon_grid
+        if not self._latlon_grid:
+            try:
+                self._latlon_grid = xr.open_dataset(
+                    glob.glob(os.path.join(self.grid_dir,NETCDF_LATLON_GLOBSTR))[0],
+                    chunks='auto')
+            except Exception as e:
+                log.error("latlon file with name matching '%s' could not be found or opened in grid directory '%s'",
+                          NETCDF_LATLON_GLOBSTR, self.grid_dir)
+                raise RuntimeError(e)
+        return self._latlon_grid
 
 
     @property
     def native_grid(self):
         """Returns native grid xarray.Dataset instance, if found. Raises
         RuntimeError exception if not.
         """
-        if not self.__native_grid:
-            self.__native_grid = xr.open_dataset(
-                glob.glob(os.path.join(self.grid_dir,NETCDF_NATIVE_GLOBSTR))[0],
-                chunks='auto')
-        return self.__native_grid
+        if not self._native_grid:
+            try:
+                self._native_grid = xr.open_dataset(
+                    glob.glob(os.path.join(self.grid_dir,NETCDF_NATIVE_GLOBSTR))[0],
+                    chunks='auto')
+            except Exception as e:
+                log.error("native file with name matching '%s' could not be found or opened in grid directory '%s'",
+                          NETCDF_NATIVE_GLOBSTR, self.grid_dir)
+                raise RuntimeError(e)
+        return self._native_grid
 
 
     @property
@@ -150,13 +168,13 @@ def native_wet_point_indices(self):
         native grid "wet" points (hFacC>0).
         """
-        if not self.__native_wet_point_indices:
+        if not self._native_wet_point_indices:
             native_wet_point_indices = {}
             for z in range(self.native_grid['hFacC'].shape[0]):
                 native_wet_point_indices[z] = \
                     np.where(self.native_grid['hFacC'][z,:]>0)
-            self.__native_wet_point_indices = native_wet_point_indices
-        return self.__native_wet_point_indices
+            self._native_wet_point_indices = native_wet_point_indices
+        return self._native_wet_point_indices
 
 
     def __del__(self):
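
Two things happen in this file's diff: the cached attributes lose their double leading underscores (`self.__latlon_grid` becomes `self._latlon_grid`), presumably to avoid Python's name mangling of `__`-prefixed attributes to `_ClassName__attr`, and the grid-opening properties gain logging plus a `RuntimeError` on failure, as their docstrings already promised. A generic sketch of the resulting lazy open-once-and-cache property pattern; the class and attribute names here are illustrative, not part of the package:

```python
import glob
import logging
import os

import xarray as xr

log = logging.getLogger(__name__)

class LazyGridFile:
    """Illustrative stand-in for the ECCOGrid caching properties."""

    def __init__(self, grid_dir: str, globstr: str):
        self.grid_dir = grid_dir
        self.globstr = globstr
        self._dataset = None    # single underscore: no name mangling

    @property
    def dataset(self) -> xr.Dataset:
        """Open the first file matching globstr in grid_dir, caching the result."""
        if self._dataset is None:
            try:
                matches = glob.glob(os.path.join(self.grid_dir, self.globstr))
                self._dataset = xr.open_dataset(matches[0], chunks='auto')
            except Exception as e:
                # IndexError (no match) and xarray/I/O errors all land here
                log.error("no file matching '%s' could be found or opened in '%s'",
                          self.globstr, self.grid_dir)
                raise RuntimeError(e)
        return self._dataset
```

Testing `is None` instead of `if not self._dataset` is a deliberate tweak in this sketch, not a change made by the diff: it avoids relying on the truth-value semantics of an xarray.Dataset for the caching decision.
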
52 changes: 32 additions & 20 deletions tests/README.md
@@ -1,31 +1,43 @@
 
-ECCO Dataset Production system-level tests
-==========================================
+# ECCO Dataset Production system-level tests
 
-Functionality
--------------
+## End-to-end test/demonstration examples:
 
-- edp_aws_s3_sync: Tests various AWS S3 data sync operations: upload,
-  download, and copy within AWS.
+The following examples can be used as a basis for creating dataset
+production pipelines for other datatypes:
 
-- edp_create_job_task_list: Tests creation of task lists based on high-level
-  job definition files.
+- SSH\_native\_latlon\_local: Generation of native and latlon format
+  granules from local input, with output to local directory. See
+  "Configuring ECCO Dataset Production to run locally" in ../README.md
+  for description.
 
-Test data:
-----------
+- SSH\_native\_latlon\_local\_remote: Generation of native and latlon
+  format granules from remote input data, with output to local
+  directory. See "Configuring ECCO Dataset Production to run in
+  local/remote mode" in ../README.md for description.
 
-- ./data/config/: ECCO Dataset Production configuration file examples.
+## Functionality
 
-- ./data/ecco_granules/: A selection of representative ECCO Dataset
-  Production results data.
+- edp\_aws\_s3\_sync: Tests various AWS S3 data sync operations:
+  upload, download, and copy within AWS.
 
-- ./data/ecco_grids/: ECCO grid definitions.
+- edp\_create\_job\_task\_list: Tests creation of task lists based on
+  high-level job definition files.
 
-- ./data/ecco_results/: Sample ECCO MDS results data. See also
-  download_selected_data.sh helper script that downloads selected MDS files
-  per the data requirements in edp_create_job_task_list (above).
+## Test data:
 
-Notes
------
+- ./data/config/: ECCO Dataset Production configuration file examples.
 
-- Tests that rely on AWS S3 access require an AWS account with login privileges.
+- ./data/ecco_grids/: ECCO grid definitions and download script.
+
+- ./data/ecco\_mapping\_factors: ECCO mapping factors (interpolation
+  to latlon grids) and download script.
+
+- ./data/ecco\_results/: Sample ECCO MDS results data. See also
+  download\_selected\_data.sh helper script that downloads selected
+  MDS files used in edp\_create\_job\_task\_list (above).
+
+## Remarks
+
+- Tests that rely on AWS S3 access require an AWS account with login
+  privileges.
18 changes: 18 additions & 0 deletions tests/SSH_native_latlon_local/README.md
@@ -0,0 +1,18 @@
+
+Complete example illustrating generation of native and latlon format
+granules from local input, with output to local directory.
+
+See "Configuring ECCO Dataset Production to run locally" in top-level
+../../README.md for details.
+
+Steps include:
+
+- Task list generation:
+  `$ ./edp_create_job_task_list_SSH_native_latlon_mon_mean.sh`
+  Output file `SSH_native_latlon_mon_mean_tasks.json` can be compared
+  with saved results in `SSH_native_latlon_mon_mean_tasks.json.sav`
+
+- Granule generation based on generated task list:
+  `$ ./edp_generate_dataproducts_SSH_native_latlon_mon_mean.sh`
+  Resulting granules in `./V4r4` can be compared with saved results in
+  `./V4r4_sav`
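
The README leaves the comparison step to the reader. One possible way to compare the generated granules against the saved reference tree, assuming matching relative paths under the two directories; the recursive glob and the tolerance-based xarray helper are choices of this sketch, not something the test scripts prescribe:

```python
# Compare granules in ./V4r4 against reference copies in ./V4r4_sav.
# Paths and the use of assert_allclose are assumptions, not part of the tests.
import glob
import os

import xarray as xr

for new_path in glob.glob('V4r4/**/*.nc', recursive=True):
    ref_path = os.path.join('V4r4_sav', os.path.relpath(new_path, 'V4r4'))
    with xr.open_dataset(new_path) as ds_new, xr.open_dataset(ref_path) as ds_ref:
        # checks variables, coordinates, and values to floating-point tolerance
        xr.testing.assert_allclose(ds_new, ds_ref)
    print(f'{new_path}: matches reference')
```
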
18 changes: 18 additions & 0 deletions tests/SSH_native_latlon_local_remote/README.md
@@ -0,0 +1,18 @@
+
+Complete example illustrating generation of native and latlon format
+granules from remote input, with output to local directory.
+
+See "Configuring ECCO Dataset Production to run in local/remote mode"
+in top-level ../../README.md for details.
+
+Steps include:
+
+- Task list generation:
+  `$ ./edp_create_job_task_list_SSH_native_latlon_mon_mean.sh`
+  Output file `SSH_native_latlon_mon_mean_tasks.json` can be compared
+  with saved results in `SSH_native_latlon_mon_mean_tasks.json.sav`
+
+- Granule generation based on generated task list:
+  `$ ./edp_generate_dataproducts_SSH_native_latlon_mon_mean.sh`
+  Resulting granules in `./V4r4` can be compared with saved results in
+  `./V4r4_sav`
2 changes: 1 addition & 1 deletion tests/data/ecco_grids/README.md
@@ -1,6 +1,6 @@
 ## ECCO grid test data.
 
 Although already included here for test purposes, the ECCO grids in this directory
-could have also been retrieved vi the included dowload.sh script, e.g.:
+could have also been retrieved via the included download.sh script, e.g.:
 
     $ download.sh -k /usr/local/bin/aws-login.darwin.amd64 -p saml-pub
11 changes: 5 additions & 6 deletions tests/edp_aws_s3_sync/README.md
@@ -1,16 +1,15 @@
 
-AWS S3 sync tests
------------------
+# AWS S3 sync tests
 
 Tests `aws_s3_sync` utility modes: upload (local->remote), download
 (remote->local) and S3 copy (remote->remote).
 
-A circularity test can be performed by running the three test cases in sequence:
-upload, copy within AWS, and download:
+A circularity test can be performed by running the three test cases in
+sequence: upload, copy within AWS, and download:
 
 - `edp_aws_s3_sync_local_remote.sh`
 - `edp_aws_s3_sync_remote_remote.sh`
 - `edp_aws_s3_sync_remote_local.sh`
 
-Note that the above tests require an AWS account with login privileges.
+Note that the above tests require an AWS account with login
+privileges.
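
If the circularity test succeeds, the tree downloaded in the final step should be identical to the one originally uploaded. A possible spot check using only the standard library; both directory names are hypothetical:

```python
# Byte-for-byte circularity check between the original upload source and the
# final download destination; both directory names are hypothetical.
import filecmp

SRC, DST = 'upload_source_dir', 'download_dest_dir'

cmp = filecmp.dircmp(SRC, DST)
assert not cmp.left_only and not cmp.right_only, 'file sets differ'

# shallow=False compares file contents, not just os.stat() signatures;
# this checks top-level files only (recurse via cmp.subdirs if needed)
_, mismatch, errors = filecmp.cmpfiles(SRC, DST, cmp.common_files, shallow=False)
assert not mismatch and not errors, f'content differs: {mismatch or errors}'
print('circularity test passed')
```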

33 changes: 20 additions & 13 deletions tests/edp_create_job_task_list/README.md
@@ -1,23 +1,30 @@
 
-ECCO task list creation tests
------------------------------
+# ECCO task list creation (only) tests
 
 Each of the included tests generates a task list from a jobs description file
 (`jobs_example.txt`), and outputs the resulting list to a corresponding json
 file.
 
-- `edp_create_job_file_list_test_1.sh`: creates a task list based on
-  locally-stored ECCO results
+- `edp_create_job_file_list_test_1.sh`: creates a task list based on
+  locally-stored ECCO results. Compare with results saved in
+  `edp_create_job_file_list_test_1.json.sav`.
 
-- `edp_create_job_file_list_test_2.sh`: creates a task list based on limited
-  AWS S3-stored ECCO results (requires a prior data upload; see comments)
+- `edp_create_job_file_list_test_2.sh`: creates a task list based on
+  limited AWS S3-stored ECCO results (requires a prior data upload;
+  see Remarks). Compare with results saved in
+  `edp_create_job_file_list_test_2.json.sav`.
 
-- `edp_create_job_file_list_test_3.sh`: creates a task list based on full
-  AWS S3-stored ECCO results (requires presence of ECCO results, and
-  retrieving S3 bucket object list may take some time (tens of minutes)
-  depending on network connection speed; see comments)
+- `edp_create_job_file_list_test_3.sh`: creates a task list based on
+  full AWS S3-stored ECCO results (requires presence of ECCO results,
+  and retrieving the S3 bucket object list may take some time (tens of
+  minutes) depending on network connection speed; see Remarks).
+  Compare with results saved in
+  `edp_create_job_file_list_test_3.json.sav`.
 
-Note that all examples reference a local config file,
-`product_generation_config.yaml`, and those that reference AWS S3-stored data
-require an AWS account with login privileges.
+## Remarks:
+
+- All examples reference a local config file (currently
+  `../../processing/configs/product_generation_config_updated.yaml`),
+  and those that reference AWS S3-stored data require an AWS account
+  with login privileges.
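
Since each script writes its task list as JSON, a fresh run can be compared against the saved reference programmatically. A quick sketch, assuming the script's output file is the `.sav` name minus the suffix and that the top-level JSON value is a list of tasks (both assumptions, inferred from the naming above):

```python
# Compare a freshly generated task list with its saved counterpart.
# The output filename is inferred from the .sav name; adjust as needed.
import json

with open('edp_create_job_file_list_test_1.json') as f_new, \
     open('edp_create_job_file_list_test_1.json.sav') as f_sav:
    new_tasks, saved_tasks = json.load(f_new), json.load(f_sav)

# deep, order-sensitive comparison of the parsed JSON structures
assert new_tasks == saved_tasks, 'task lists differ from saved reference'
print(f'{len(new_tasks)} tasks match the saved reference')
```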
