Skip to content

Commit

Permalink
Expand Variables class to read s3 urls (#464)
Browse files Browse the repository at this point in the history
* expand extract_product and extract_version to check for s3 url

* add cloud notes to variables notebook

---------

Co-authored-by: Jessica Scheick <[email protected]>
  • Loading branch information
rwegener2 and JessicaS11 committed Jan 5, 2024
1 parent 93b0f1a commit 8d7db9c
Show file tree
Hide file tree
Showing 5 changed files with 122 additions and 51 deletions.
13 changes: 8 additions & 5 deletions doc/source/example_notebooks/IS2_data_variables.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@
"\n",
"A given ICESat-2 product may have over 200 variable + path combinations.\n",
"icepyx includes a custom `Variables` module that is \"aware\" of the ATLAS sensor and how the ICESat-2 data products are stored.\n",
"The module can be accessed independently, and can also be accessed as a component of a `Query` object or `Read` object.\n",
"The module can be accessed independently and can also be accessed as a component of a `Query` object or `Read` object.\n",
"\n",
"This notebook illustrates in detail how the `Variables` module behaves. We use the module independently and also show how powerful it is directly in the icepyx workflow using a `Query` data access example.\n",
"Module usage using `Query` is analogous through an icepyx ICESat-2 `Read` object.\n",
Expand Down Expand Up @@ -75,7 +75,7 @@
"There are three ways to create or access an ICESat-2 Variables object in icepyx:\n",
"1. Access via the `.order_vars` property of a Query object\n",
"2. Access via the `.vars` property of a Read object\n",
"3. Create a stand-alone ICESat-2 Variables object using a local file or a product name\n",
"3. Create a stand-alone ICESat-2 Variables object using a local file, cloud file, or a product name\n",
"\n",
"An example of each of these is shown below."
]
Expand Down Expand Up @@ -180,8 +180,11 @@
"### 3. Create a stand-alone Variables object\n",
"\n",
"You can also generate an independent Variables object. This can be done using either:\n",
"1. The filepath to a file you'd like a variables list for\n",
"2. The product name (and optionally version) of a an ICESat-2 product"
"1. The filepath to a local or cloud file you'd like a variables list for\n",
"2. The product name (and optionally version) of an ICESat-2 product\n",
"\n",
"*Note: Cloud data access requires a valid Earthdata login; \n",
"you will be prompted to log in if you are not already authenticated.*"
]
},
{
Expand Down Expand Up @@ -255,7 +258,7 @@
},
"outputs": [],
"source": [
"v = ipx.Variables(product='ATL03', version='004')"
"v = ipx.Variables(product='ATL03', version='006')"
]
},
{
Expand Down
110 changes: 78 additions & 32 deletions icepyx/core/is2ref.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,11 +5,10 @@
import warnings
from xml.etree import ElementTree as ET

import earthaccess

import icepyx

# ICESat-2 specific reference functions
# options to get customization options for ICESat-2 data (though could be used generally)


def _validate_product(product):
Expand Down Expand Up @@ -48,9 +47,6 @@ def _validate_product(product):
return product


# DevGoal: See if there's a way to dynamically get this list so it's automatically updated


def _validate_OA_product(product):
"""
Confirm a valid ICESat-2 product was specified
Expand Down Expand Up @@ -87,6 +83,7 @@ def about_product(prod):


# DevGoal: use a mock of this output to test later functions, such as displaying options and widgets, etc.
# options to get customization options for ICESat-2 data (though could be used generally)
def _get_custom_options(session, product, version):
"""
Get lists of what customization options are available for the product from NSIDC.
Expand Down Expand Up @@ -330,6 +327,7 @@ def gt2spot(gt, sc_orient):

return np.uint8(spot)


def latest_version(product):
"""
Determine the most recent version available for the given product.
Expand All @@ -340,38 +338,86 @@ def latest_version(product):
'006'
"""
_about_product = about_product(product)
return max(
[entry["version_id"] for entry in _about_product["feed"]["entry"]]
)
return max([entry["version_id"] for entry in _about_product["feed"]["entry"]])

def extract_product(filepath, auth=None):
    """
    Read the product type from the metadata of the file. Valid for local or s3 files, but must
    provide an auth object if reading from s3. Return the product as a string.

    Parameters
    ----------
    filepath: string
        local or remote location of a file. Could be a local string or an s3 filepath
    auth: earthaccess.auth.Auth, default None
        An earthaccess authentication object. Optional, but necessary if accessing data in an
        s3 bucket.

    Returns
    -------
    product : string
        The product short name read from the file's root attributes, after being passed
        through `_validate_product`.

    Raises
    ------
    AttributeError
        If `filepath` starts with "s3" and no `auth` object is given.
    KeyError
        If the `short_name` attribute cannot be found in the file metadata.
    """
    # For s3 paths, open a remote file-like object that h5py can read from
    remote_file = None
    if filepath.startswith("s3"):
        if not auth:
            raise AttributeError(
                "Must provide credentials to `auth` if accessing s3 data"
            )
        # NOTE(review): `provider=auth` is kept from the original call; confirm that
        # earthaccess expects the auth object (rather than a provider name) here.
        s3 = earthaccess.get_s3fs_session(daac="NSIDC", provider=auth)
        remote_file = s3.open(filepath, "rb")

    try:
        # Anything that is not an s3 path is assumed to be a local filepath.
        # The context manager closes the h5py handle even if parsing fails.
        with h5py.File(filepath if remote_file is None else remote_file, "r") as f:
            try:
                product = f.attrs["short_name"]
                if isinstance(product, bytes):
                    # For most products the short name is stored in a bytes string
                    product = product.decode()
                elif isinstance(product, np.ndarray):
                    # ATL14 saves the short_name as an array ['ATL14']
                    product = product[0]
                product = _validate_product(product)
            except KeyError as err:
                # Raising a bare string is itself a TypeError in Python 3, so raise a
                # real exception and keep the original lookup failure as its cause.
                raise KeyError(
                    "Unable to parse the product name from file metadata"
                ) from err
    finally:
        # The s3 file object is opened separately from h5py, so close it explicitly
        if remote_file is not None:
            remote_file.close()
    return product

def extract_version(filepath, auth=None):
    """
    Read the version from the metadata of the file. Valid for local or s3 files, but must
    provide an auth object if reading from s3. Return the version as a string.

    Parameters
    ----------
    filepath: string
        local or remote location of a file. Could be a local string or an s3 filepath
    auth: earthaccess.auth.Auth, default None
        An earthaccess authentication object. Optional, but necessary if accessing data in an
        s3 bucket.

    Returns
    -------
    version
        The `VersionID` attribute of the `METADATA/DatasetIdentification` group. If it is
        stored as a numpy array, the first element is returned.

    Raises
    ------
    AttributeError
        If `filepath` starts with "s3" and no `auth` object is given.
    KeyError
        If the version group or attribute cannot be found in the file metadata.
    """
    # For s3 paths, open a remote file-like object that h5py can read from
    remote_file = None
    if filepath.startswith("s3"):
        if not auth:
            raise AttributeError(
                "Must provide credentials to `auth` if accessing s3 data"
            )
        # NOTE(review): `provider=auth` is kept from the original call; confirm that
        # earthaccess expects the auth object (rather than a provider name) here.
        s3 = earthaccess.get_s3fs_session(daac="NSIDC", provider=auth)
        remote_file = s3.open(filepath, "rb")

    try:
        # Anything that is not an s3 path is assumed to be a local filepath.
        # The context manager closes the h5py handle even if parsing fails.
        with h5py.File(filepath if remote_file is None else remote_file, "r") as f:
            try:
                version = f["METADATA"]["DatasetIdentification"].attrs["VersionID"]
                if isinstance(version, np.ndarray):
                    # ATL14 stores the version as an array ['00x']
                    version = version[0]
            except KeyError as err:
                # Raising a bare string is itself a TypeError in Python 3, so raise a
                # real exception and keep the original lookup failure as its cause.
                raise KeyError(
                    "Unable to parse the version from file metadata"
                ) from err
    finally:
        # The s3 file object is opened separately from h5py, so close it explicitly
        if remote_file is not None:
            remote_file.close()
    return version
4 changes: 4 additions & 0 deletions icepyx/core/query.py
Original file line number Diff line number Diff line change
Expand Up @@ -350,6 +350,10 @@ class Query(GenQuery, EarthdataAuthMixin):
reference ground tracks are used. Example: "0594"
files : string, default None
A placeholder for future development. Not used for any purposes yet.
auth : earthaccess.auth.Auth, default None
An earthaccess authentication object. Available as an argument so an existing
earthaccess.auth.Auth object can be used for authentication. If not given, a new auth
object will be created whenever authentication is needed.
Returns
-------
Expand Down
13 changes: 13 additions & 0 deletions icepyx/core/validate_inputs.py
Original file line number Diff line number Diff line change
Expand Up @@ -104,3 +104,16 @@ def tracks(track):
warnings.warn("Listed Reference Ground Track is not available")

return track_list

def check_s3bucket(path):
    """
    Check if the given path is an s3 path. Raise a warning if the data being referenced is not
    in the NSIDC bucket.

    Parameters
    ----------
    path : string
        A local filepath or an s3 url of the form "s3://bucket/key".

    Returns
    -------
    path : string
        The input path, unchanged.
    """
    split_path = path.split("/")
    if split_path[0] == "s3:":
        # "s3://bucket/key" splits into ["s3:", "", "bucket", "key"]. A malformed url can
        # have fewer than three parts; it then has no bucket name to compare, and is
        # treated as being outside the NSIDC bucket instead of raising an IndexError.
        bucket = split_path[2] if len(split_path) > 2 else None
        if bucket != "nsidc-cumulus-prod-protected":
            warnings.warn(
                "s3 data being read from outside the NSIDC data bucket. Icepyx can "
                "read this data, but available data lists may not be accurate.",
                stacklevel=2,
            )
    return path
33 changes: 19 additions & 14 deletions icepyx/core/variables.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,11 +48,10 @@ class Variables(EarthdataAuthMixin):
Dictionary (key:values) of available variable names (keys) and paths (values).
wanted : dictionary, default None
As avail, but for the desired list of variables
session : requests.session object
A session object authenticating the user to download data using their Earthdata login information.
The session object will automatically be passed from the query object if you
have successfully logged in there.
auth : earthaccess.auth.Auth, default None
An earthaccess authentication object. Available as an argument so an existing
earthaccess.auth.Auth object can be used for authentication. If not given, a new auth
object will be created whenever authentication is needed.
"""

def __init__(
Expand All @@ -75,27 +74,33 @@ def __init__(

if path and product:
raise TypeError(
'Please provide either a filepath or a product. If a filepath is provided ',
'Please provide either a path or a product. If a path is provided ',
'variables will be read from the file. If a product is provided all available ',
'variables for that product will be returned.'
)

# initialize authentication properties
EarthdataAuthMixin.__init__(self, auth=auth)

# Set the product and version from either the input args or the file
if path:
self._path = path
self._product = is2ref.extract_product(self._path)
self._version = is2ref.extract_version(self._path)
self._path = val.check_s3bucket(path)
# Set up auth
if self._path.startswith('s3'):
auth = self.auth
else:
auth = None
# Read the product and version from the file
self._product = is2ref.extract_product(self._path, auth=auth)
self._version = is2ref.extract_version(self._path, auth=auth)
elif product:
# Check for valid product string
self._product = is2ref._validate_product(product)
# Check for valid version string
# If version is not specified by the user assume the most recent version
self._version = val.prod_version(is2ref.latest_version(self._product), version)
else:
raise TypeError('Either a filepath or a product need to be given as input arguments.')

# initialize authentication properties
EarthdataAuthMixin.__init__(self, auth=auth)
raise TypeError('Either a path or a product need to be given as input arguments.')

self._avail = avail
self.wanted = wanted
Expand Down Expand Up @@ -138,7 +143,7 @@ def avail(self, options=False, internal=False):
"""

if not hasattr(self, "_avail") or self._avail == None:
if not hasattr(self, 'path'):
if not hasattr(self, 'path') or self.path.startswith('s3'):
self._avail = is2ref._get_custom_options(
self.session, self.product, self.version
)["variables"]
Expand Down

0 comments on commit 8d7db9c

Please sign in to comment.