Merge branch 'main' into ian
ifenty authored Nov 13, 2024
2 parents 61eb644 + 5c7a29d commit 499c3ea
Showing 19 changed files with 72,700 additions and 362 deletions.
1,352 changes: 1,352 additions & 0 deletions AWS/NetCDF_to_JSON/generate_ecco_v4r4_jsons.ipynb

Large diffs are not rendered by default.

10 changes: 5 additions & 5 deletions README.md
@@ -1,8 +1,8 @@
 
 # ECCO Dataset Production
 
-Tools and utilities, per the initial release, "to turn raw model
-output into glorious self-describing granules for widespread
+Tools and utilities, as stated in the initial release, "to turn raw
+model output into glorious self-describing granules for widespread
 distribution".
 
 Original release by Duncan Bark, in collaboration with Ian Fenty,
@@ -20,7 +20,7 @@ output in [MITgcm file
 formats](https://mitgcm.readthedocs.io/en/latest/).
 
 For general distribution however (e.g., via
-[PO.DAAC](https://podaac.jpl.nasa.gov/), it's useful to convert these
+[PO.DAAC](https://podaac.jpl.nasa.gov/)), it's useful to convert these
 raw output files to more intuitive day/date-stamped snapshot, daily mean,
 and monthly mean files, with appropriate metadata, in both native and
 latitude-longitude grid formats. Though the operations themselves are
@@ -33,7 +33,7 @@ distributed and cloud-based workflows, a central theme of ECCO Dataset
 Production.
 
 Though ECCO Dataset Production is capable of operating on extremely
-large data collections, it's equally applicable to smaller, limited
+large data collections, it applies just as well to smaller, limited
 production data sets; in short, anything for which wider distribution
 in easier-to-process formats is desired.

@@ -64,7 +64,7 @@ Dataset Production is configured and run:
 Each will be discussed separately.
 
 
-### Configuring ECCO Dataset Production to Run Locally
+### Configuring ECCO Dataset Production to run locally
 
 Installing ECCO Dataset Production locally, with input and output to
 local storage, is perhaps the quickest and easiest way to get started
Expand Up @@ -95,19 +95,14 @@ def ecco_make_granule( task, cfg,
merged_variable_dataset_with_all_metadata.to_netcdf(
this_task['granule'], encoding=encoding)
else:

with tempfile.TemporaryDirectory() as tmpdir:
log.info('temporary directory created: %s ', tmpdir)

# temporary directory will self-destruct at end of with block
_src = os.path.basename(this_task['granule'])
_dest = this_task['granule']

merged_variable_dataset_with_all_metadata.to_netcdf(
os.path.join(tmpdir,_src), encoding=encoding)

log.info('uploading %s to %s', os.path.join(tmpdir,_src), _dest)
ecco_aws_s3_cp.aws_s3_cp( src=os.path.join(tmpdir,_src), dest=_dest, **kwargs)
# temporary directory will self-destruct at end of with block

log.info('... done')

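The `else` branch above stages each granule in a temporary directory and hands it to `ecco_aws_s3_cp.aws_s3_cp` for upload; the helper's internals are not shown in this diff. As a point of reference, a minimal, self-contained sketch of the same write-then-upload pattern, assuming boto3 is available; the function, bucket, and key names are hypothetical, not part of the package:

```python
# Sketch only: stages an xarray.Dataset as NetCDF in a temporary directory,
# then uploads it to S3. Assumes boto3; bucket and key names are hypothetical.
import os
import tempfile

import boto3
import xarray as xr

def upload_dataset_as_netcdf(ds: xr.Dataset, bucket: str, key: str) -> None:
    with tempfile.TemporaryDirectory() as tmpdir:
        # the temporary directory self-destructs at the end of the with
        # block, mirroring the comment in the diff above
        local_file = os.path.join(tmpdir, os.path.basename(key))
        ds.to_netcdf(local_file)
        boto3.client('s3').upload_file(local_file, bucket, key)

# hypothetical usage:
# upload_dataset_as_netcdf(ds, 'ecco-processing', 'V4r4/SSH/SSH_mon_mean.nc')
```
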
54 changes: 36 additions & 18 deletions processing/src/ecco_dataset_production/ecco_grid.py
@@ -3,6 +3,7 @@
 
 import fnmatch
 import glob
+import logging
 import numpy as np
 import os
 import tarfile
@@ -18,6 +19,9 @@
 ZIPFILE_GLOBSTR = '*.gz'
 
 
+log = logging.getLogger('edp.'+__name__)
+
+
 class ECCOGrid(object):
     """Container class for ECCO grid access. Primarily intended to optimize i/o
     performance by allowing operations, e.g. collections of ECCOMDSDataset
@@ -77,9 +81,9 @@ def __init__( self, task=None, grid_loc=None, **kwargs):
         """
         self.task = None
-        self.__latlon_grid = None
-        self.__native_grid = None
-        self.__native_wet_point_indices = None
+        self._latlon_grid = None
+        self._native_grid = None
+        self._native_wet_point_indices = None
 
         if task:
             if not isinstance(task,ecco_task.ECCOTask):
@@ -123,24 +127,38 @@ def __init__( self, task=None, grid_loc=None, **kwargs):
 
     @property
     def latlon_grid(self):
         """Returns latlon grid xarray.Dataset instance, if found. Raises
         RuntimeError exception if not.
         """
-        if not self.__latlon_grid:
-            self.__latlon_grid = xr.open_dataset(
-                glob.glob(os.path.join(self.grid_dir,NETCDF_LATLON_GLOBSTR))[0],
-                chunks='auto')
-        return self.__latlon_grid
+        if not self._latlon_grid:
+            try:
+                self._latlon_grid = xr.open_dataset(
+                    glob.glob(os.path.join(self.grid_dir,NETCDF_LATLON_GLOBSTR))[0],
+                    chunks='auto')
+            except Exception as e:
+                log.error("latlon file with name matching '%s' could not be found or opened in grid directory '%s'",
+                          NETCDF_LATLON_GLOBSTR, self.grid_dir)
+                raise RuntimeError(e)
+        return self._latlon_grid
 
 
     @property
     def native_grid(self):
         """Returns native grid xarray.Dataset instance, if found. Raises
         RuntimeError exception if not.
         """
-        if not self.__native_grid:
-            self.__native_grid = xr.open_dataset(
-                glob.glob(os.path.join(self.grid_dir,NETCDF_NATIVE_GLOBSTR))[0],
-                chunks='auto')
-        return self.__native_grid
+        if not self._native_grid:
+            try:
+                self._native_grid = xr.open_dataset(
+                    glob.glob(os.path.join(self.grid_dir,NETCDF_NATIVE_GLOBSTR))[0],
+                    chunks='auto')
+            except Exception as e:
+                log.error("native file with name matching '%s' could not be found or opened in grid directory '%s'",
+                          NETCDF_NATIVE_GLOBSTR, self.grid_dir)
+                raise RuntimeError(e)
+        return self._native_grid
 
 
     @property
@@ -150,13 +168,13 @@ def native_wet_point_indices(self):
         native grid "wet" points (hFacC>0).
         """
-        if not self.__native_wet_point_indices:
+        if not self._native_wet_point_indices:
             native_wet_point_indices = {}
             for z in range(self.native_grid['hFacC'].shape[0]):
                 native_wet_point_indices[z] = \
                     np.where(self.native_grid['hFacC'][z,:]>0)
-            self.__native_wet_point_indices = native_wet_point_indices
-        return self.__native_wet_point_indices
+            self._native_wet_point_indices = native_wet_point_indices
+        return self._native_wet_point_indices
 
 
     def __del__(self):
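
Two things happen in this file's diff: the cached attributes lose their double leading underscores (`self.__latlon_grid` becomes `self._latlon_grid`), presumably to avoid Python's name mangling of `__`-prefixed attributes to `_ClassName__attr`, and the grid-opening properties gain logging plus a `RuntimeError` on failure, as their docstrings already promised. A generic sketch of the resulting lazy open-once-and-cache property pattern; the class and attribute names here are illustrative, not part of the package:

```python
import glob
import logging
import os

import xarray as xr

log = logging.getLogger(__name__)

class LazyGridFile:
    """Illustrative stand-in for the ECCOGrid caching properties."""

    def __init__(self, grid_dir: str, globstr: str):
        self.grid_dir = grid_dir
        self.globstr = globstr
        self._dataset = None    # single underscore: no name mangling

    @property
    def dataset(self) -> xr.Dataset:
        """Open the first file matching globstr in grid_dir, caching the result."""
        if self._dataset is None:
            try:
                matches = glob.glob(os.path.join(self.grid_dir, self.globstr))
                self._dataset = xr.open_dataset(matches[0], chunks='auto')
            except Exception as e:
                # IndexError (no match) and xarray/I/O errors all land here
                log.error("no file matching '%s' could be found or opened in '%s'",
                          self.globstr, self.grid_dir)
                raise RuntimeError(e)
        return self._dataset
```

Testing `is None` instead of `if not self._dataset` is a deliberate tweak in this sketch, not a change made by the diff: it avoids relying on the truth-value semantics of an xarray.Dataset for the caching decision.
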
52 changes: 32 additions & 20 deletions tests/README.md
@@ -1,31 +1,43 @@
 
-ECCO Dataset Production system-level tests
-==========================================
+# ECCO Dataset Production system-level tests
 
-Functionality
--------------
+## End-to-end test/demonstration examples:
 
-- edp_aws_s3_sync: Tests various AWS S3 data sync operations: upload,
-  download, and copy within AWS.
+The following examples can be used as a basis for creating dataset
+production pipelines for other datatypes:
 
-- edp_create_job_task_list: Tests creation of task lists based on high-level
-  job definition files.
+- SSH\_native\_latlon\_local: Generation of native and latlon format
+  granules from local input, with output to local directory. See
+  "Configuring ECCO Dataset Production to run locally" in ../README.md
+  for description.
 
-Test data:
-----------
+- SSH\_native\_latlon\_local\_remote: Generation of native and latlon
+  format granules from remote input data, with output to local
+  directory. See "Configuring ECCO Dataset Production to run in
+  local/remote mode" in ../README.md for description.
 
-- ./data/config/: ECCO Dataset Production configuration file examples.
+## Functionality
 
-- ./data/ecco_granules/: A selection of representative ECCO Dataset
-  Production results data.
+- edp\_aws\_s3\_sync: Tests various AWS S3 data sync operations:
+  upload, download, and copy within AWS.
 
-- ./data/ecco_grids/: ECCO grid definitions.
+- edp\_create\_job\_task\_list: Tests creation of task lists based on
+  high-level job definition files.
 
-- ./data/ecco_results/: Sample ECCO MDS results data. See also
-  download_selected_data.sh helper script that downloads selected MDS files
-  per the data requirements in edp_create_job_task_list (above).
+## Test data:
 
-Notes
------
+- ./data/config/: ECCO Dataset Production configuration file examples.
 
-- Tests that rely on AWS S3 access require an AWS account with login privileges.
+- ./data/ecco_grids/: ECCO grid definitions and download script.
+
+- ./data/ecco\_mapping\_factors: ECCO mapping factors (interpolation
+  to latlon grids) and download script.
+
+- ./data/ecco\_results/: Sample ECCO MDS results data. See also
+  download\_selected\_data.sh helper script that downloads selected
+  MDS files used in edp\_create\_job\_task\_list (above).
+
+## Remarks
+
+- Tests that rely on AWS S3 access require an AWS account with login
+  privileges.
18 changes: 18 additions & 0 deletions tests/SSH_native_latlon_local/README.md
@@ -0,0 +1,18 @@
+
+Complete example illustrating generation of native and latlon format
+granules from local input, with output to local directory.
+
+See "Configuring ECCO Dataset Production to run locally" in top-level
+../../README.md for details.
+
+Steps include:
+
+- Task list generation:
+  `$ ./edp_create_job_task_list_SSH_native_latlon_mon_mean.sh`
+  Output file `SSH_native_latlon_mon_mean_tasks.json` can be compared
+  with saved results in `SSH_native_latlon_mon_mean_tasks.json.sav`
+
+- Granule generation based on generated task list:
+  `$ ./edp_generate_dataproducts_SSH_native_latlon_mon_mean.sh`
+  Resulting granules in `./V4r4` can be compared with saved results in
+  `./V4r4_sav`
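
The README leaves the comparison step to the reader. One possible way to compare the generated granules against the saved reference tree, assuming matching relative paths under the two directories; the recursive glob and the tolerance-based xarray helper are choices of this sketch, not something the test scripts prescribe:

```python
# Compare granules in ./V4r4 against reference copies in ./V4r4_sav.
# Paths and the use of assert_allclose are assumptions, not part of the tests.
import glob
import os

import xarray as xr

for new_path in glob.glob('V4r4/**/*.nc', recursive=True):
    ref_path = os.path.join('V4r4_sav', os.path.relpath(new_path, 'V4r4'))
    with xr.open_dataset(new_path) as ds_new, xr.open_dataset(ref_path) as ds_ref:
        # checks variables, coordinates, and values to floating-point tolerance
        xr.testing.assert_allclose(ds_new, ds_ref)
    print(f'{new_path}: matches reference')
```
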
18 changes: 18 additions & 0 deletions tests/SSH_native_latlon_local_remote/README.md
@@ -0,0 +1,18 @@
+
+Complete example illustrating generation of native and latlon format
+granules from remote input, with output to local directory.
+
+See "Configuring ECCO Dataset Production to run in local/remote mode"
+in top-level ../../README.md for details.
+
+Steps include:
+
+- Task list generation:
+  `$ ./edp_create_job_task_list_SSH_native_latlon_mon_mean.sh`
+  Output file `SSH_native_latlon_mon_mean_tasks.json` can be compared
+  with saved results in `SSH_native_latlon_mon_mean_tasks.json.sav`
+
+- Granule generation based on generated task list:
+  `$ ./edp_generate_dataproducts_SSH_native_latlon_mon_mean.sh`
+  Resulting granules in `./V4r4` can be compared with saved results in
+  `./V4r4_sav`
2 changes: 1 addition & 1 deletion tests/data/ecco_grids/README.md
@@ -1,6 +1,6 @@
 ## ECCO grid test data.
 
 Although already included here for test purposes, the ECCO grids in this directory
-could have also been retrieved vi the included dowload.sh script, e.g.:
+could have also been retrieved via the included download.sh script, e.g.:
 
     $ download.sh -k /usr/local/bin/aws-login.darwin.amd64 -p saml-pub
11 changes: 5 additions & 6 deletions tests/edp_aws_s3_sync/README.md
@@ -1,16 +1,15 @@
 
-AWS S3 sync tests
------------------
+# AWS S3 sync tests
 
 Tests `aws_s3_sync` utility modes: upload (local->remote), download
 (remote->local) and S3 copy (remote->remote).
 
-A circularity test can be performed by running the three test cases in sequence:
-upload, copy within AWS, and download:
+A circularity test can be performed by running the three test cases in
+sequence: upload, copy within AWS, and download:
 
 - `edp_aws_s3_sync_local_remote.sh`
 - `edp_aws_s3_sync_remote_remote.sh`
 - `edp_aws_s3_sync_remote_local.sh`
 
-Note that the above tests require an AWS account with login privileges.
+Note that the above tests require an AWS account with login
+privileges.
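
If the circularity test succeeds, the tree downloaded in the final step should be identical to the one originally uploaded. A possible spot check using only the standard library; both directory names are hypothetical:

```python
# Byte-for-byte circularity check between the original upload source and the
# final download destination; both directory names are hypothetical.
import filecmp

SRC, DST = 'upload_source_dir', 'download_dest_dir'

cmp = filecmp.dircmp(SRC, DST)
assert not cmp.left_only and not cmp.right_only, 'file sets differ'

# shallow=False compares file contents, not just os.stat() signatures;
# this checks top-level files only (recurse via cmp.subdirs if needed)
_, mismatch, errors = filecmp.cmpfiles(SRC, DST, cmp.common_files, shallow=False)
assert not mismatch and not errors, f'content differs: {mismatch or errors}'
print('circularity test passed')
```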

33 changes: 20 additions & 13 deletions tests/edp_create_job_task_list/README.md
@@ -1,23 +1,30 @@
 
-ECCO task list creation tests
------------------------------
+# ECCO task list creation (only) tests
 
 Each of the included tests generates a task list from a jobs description file
 (`jobs_example.txt`), and outputs the resulting list to a corresponding json
 file.
 
-- `edp_create_job_file_list_test_1.sh`: creates a task list based on
-  locally-stored ECCO results
+- `edp_create_job_file_list_test_1.sh`: creates a task list based on
+  locally-stored ECCO results. Compare with results saved in
+  `edp_create_job_file_list_test_1.json.sav`.
 
-- `edp_create_job_file_list_test_2.sh`: creates a task list based on limited
-  AWS S3-stored ECCO results (requires a prior data upload; see comments)
+- `edp_create_job_file_list_test_2.sh`: creates a task list based on
+  limited AWS S3-stored ECCO results (requires a prior data upload;
+  see Remarks). Compare with results saved in
+  `edp_create_job_file_list_test_2.json.sav`.
 
-- `edp_create_job_file_list_test_3.sh`: creates a task list based on full
-  AWS S3-stored ECCO results (requires presence of ECCO results, and
-  retrieving S3 bucket object list may take some time (tens of minutes)
-  depending on network connection speed; see comments)
+- `edp_create_job_file_list_test_3.sh`: creates a task list based on
+  full AWS S3-stored ECCO results (requires presence of ECCO results,
+  and retrieving the S3 bucket object list may take some time (tens of
+  minutes) depending on network connection speed; see Remarks).
+  Compare with results saved in
+  `edp_create_job_file_list_test_3.json.sav`.
 
-Note that all examples reference a local config file,
-`product_generation_config.yaml`, and those that reference AWS S3-stored data
-require an AWS account with login privileges.
+## Remarks:
+
+- All examples reference a local config file (currently
+  `../../processing/configs/product_generation_config_updated.yaml`),
+  and those that reference AWS S3-stored data require an AWS account
+  with login privileges.
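
Since each script writes its task list as JSON, a fresh run can be compared against the saved reference programmatically. A quick sketch, assuming the script's output file is the `.sav` name minus the suffix and that the top-level JSON value is a list of tasks (both assumptions, inferred from the naming above):

```python
# Compare a freshly generated task list with its saved counterpart.
# The output filename is inferred from the .sav name; adjust as needed.
import json

with open('edp_create_job_file_list_test_1.json') as f_new, \
     open('edp_create_job_file_list_test_1.json.sav') as f_sav:
    new_tasks, saved_tasks = json.load(f_new), json.load(f_sav)

# deep, order-sensitive comparison of the parsed JSON structures
assert new_tasks == saved_tasks, 'task lists differ from saved reference'
print(f'{len(new_tasks)} tasks match the saved reference')
```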
