diff --git a/Configuration.md b/Configuration.md index ef30047f..13b4f9c7 100644 --- a/Configuration.md +++ b/Configuration.md @@ -76,12 +76,6 @@ These describe which data source to download, where the data should live, and ho * `dataset`: (optional) Name of the target dataset. Allowed options are dictated by the client. * `target_path`: (required) Download artifact filename template. Can use Python string format symbols. Must have the same number of format symbols as the number of partition keys. -* `target_filename`: (optional) This file name will be appended to `target_path`. - * Like `target_path`, `target_filename` can contain format symbols to be replaced by partition keys; if this is - used, the total number of format symbols in both fields must match the number of partition keys. - * This field is required when generating a date-based directory hierarchy (see below). -* `append_date_dirs`: (optional) A boolean indicating whether a date-based directory hierarchy should be created (see - below); defaults to false if not used. * `partition_keys`: (optional) This determines how download jobs will be divided. * Value can be a single item or a list. * Each value must appear as a key in the `selection` section. @@ -91,29 +85,23 @@ These describe which data source to download, where the data should live, and ho * E.g. `['year', 'month']` will lead to a config set like `[(2015, 01), (2015, 02), (2015, 03), ...]`. * The list of keys will be used to format the `target_path`. -### Creating a date-based directory hierarchy - -The configuration can be set up to automatically generate a date-based directory hierarchy for the output files. +> **NOTE**: `target_path` template is totally compatible with Python's standard string formatting. +> This includes being able to use named arguments (e.g. 'gs://bucket/{year}/{month}/{day}.nc') as well as specifying formats for strings +> (e.g. 'gs://bucket/{year:04d}/{month:02d}/{day:02d}.nc'). 
-To enable this feature, the `append_date_dirs` field has to be set to `true`. In addition, the `target_filename` needs -to be specified, and `date` has to be a `partition_key`; -`date` will not be used as a replacement in `target_template` but will instead be used to create a directory structure. +### Creating a date-based directory hierarchy - -The resulting target path will be `/{year}/{month}/{day}`. The number of format symbols in -this path has to match the number of partition keys excluding `date`. +A date-based directory hierarchy can be created by giving the `date` partition key a date format spec in `target_path`, e.g. `{date:%%Y/%%m/%%d}` (in `.cfg` files each `%` is escaped as `%%`, as in the examples below). +Below are some examples of how to use `target_path` with Python's standard string formatting.
Examples -Below are more examples of how to use `target_path`, `target_filename`, and `append_date_dirs`. - Note that any parameters that are not relevant to the target path have been omitted. ``` [parameters] -target_filename=.nc -target_path=gs://ecmwf-output-test/era5/ -append_date_dirs=true +target_path=gs://ecmwf-output-test/era5/{date:%%Y/%%m/%%d}.nc partition_keys= date [selection] @@ -126,9 +114,7 @@ will create ``` [parameters] -target_filename=-pressure-{}.nc -target_path=gs://ecmwf-output-test/era5/ -append_date_dirs=true +target_path=gs://ecmwf-output-test/era5/{date:%%Y/%%m/%%d}-pressure-{pressure_level}.nc partition_keys= date pressure_level @@ -144,9 +130,7 @@ will create ``` [parameters] -target_filename=.nc -target_path=gs://ecmwf-output-test/pressure-{}/era5/ -append_date_dirs=true +target_path=gs://ecmwf-output-test/pressure-{pressure_level}/era5/{date:%%Y/%%m/%%d}.nc partition_keys= date pressure_level @@ -160,12 +144,9 @@ will create `gs://ecmwf-output-test/pressure-500/era5/2017/01/01.nc` and `gs://ecmwf-output-test/pressure-500/era5/2017/01/02.nc`. -The above example also illustrates how to create a directory structure based on partition keys, even without using the -date-based creation: - ``` [parameters] -target_path=gs://ecmwf-output-test/era5/{}/{}/{}-pressure-{}.nc +target_path=gs://ecmwf-output-test/era5/{year:04d}/{month:02d}/{day:02d}-pressure-{pressure_level}.nc partition_keys= year month @@ -187,6 +168,14 @@ will create `gs://ecmwf-output-test/era5/2017/01/01-pressure-500.nc` and `gs://ecmwf-output-test/era5/2017/01/02-pressure-500.nc`. +> **Note**: Replacing the `target_path` of the above example with the unpadded template +> `target_path=gs://ecmwf-output-test/era5/{year}/{month}/{day}-pressure-{pressure_level}.nc` +> +> will create +> +> `gs://ecmwf-output-test/era5/2017/1/1-pressure-500.nc` and +> `gs://ecmwf-output-test/era5/2017/1/2-pressure-500.nc`. +
### Subsections diff --git a/configs/era5_example_config.cfg b/configs/era5_example_config.cfg index d7cf35b0..89936fab 100644 --- a/configs/era5_example_config.cfg +++ b/configs/era5_example_config.cfg @@ -14,7 +14,7 @@ [parameters] client=cds dataset=reanalysis-era5-pressure-levels -target_path=gs://ecmwf-output-test/era5/{}/{}/{}-pressure-{}.nc +target_path=gs://ecmwf-output-test/era5/{year:04d}/{month:02d}/{day:02d}-pressure-{pressure_level}.nc partition_keys= year month diff --git a/configs/era5_example_config_local_run.cfg b/configs/era5_example_config_local_run.cfg index b3415fc1..95db6287 100644 --- a/configs/era5_example_config_local_run.cfg +++ b/configs/era5_example_config_local_run.cfg @@ -22,7 +22,7 @@ [parameters] client=cds dataset=reanalysis-era5-pressure-levels -target_path=era5-{}{}{}-pressure-{}.nc +target_path=era5-{year:04d}{month:02d}{day:02d}-pressure-{pressure_level}.nc partition_keys= year month diff --git a/configs/era5_example_config_preproc.cfg b/configs/era5_example_config_preproc.cfg index 63c221f2..c143c70a 100644 --- a/configs/era5_example_config_preproc.cfg +++ b/configs/era5_example_config_preproc.cfg @@ -14,7 +14,7 @@ [parameters] client=cds dataset=reanalysis-era5-pressure-levels -target_path=gs://ecmwf-downloads/test/o1280-{}-{}-{}.grib +target_path=gs://ecmwf-downloads/test/o1280-{year:04d}-{month:02d}-{day:02d}.grib partition_keys= year month diff --git a/configs/era5_example_config_using_date.cfg b/configs/era5_example_config_using_date.cfg index 581102ac..223015e0 100644 --- a/configs/era5_example_config_using_date.cfg +++ b/configs/era5_example_config_using_date.cfg @@ -21,9 +21,7 @@ dataset=reanalysis-era5-pressure-levels # gs://ecmwf-output-test/era5/2017/01/02-pressure-500.nc # gs://ecmwf-output-test/era5/2017/01/01-pressure-1000.nc # gs://ecmwf-output-test/era5/2017/01/02-pressure-1000.nc -target_filename=-pressure-{}.nc -target_path=gs://ecmwf-output-test/era5/ -append_date_dirs=true 
+target_path=gs://ecmwf-output-test/era5/{date:%%Y/%%m/%%d}-pressure-{pressure_level}.nc partition_keys= date pressure_level diff --git a/configs/mars_example_config.cfg b/configs/mars_example_config.cfg index 8450309d..725dcdac 100644 --- a/configs/mars_example_config.cfg +++ b/configs/mars_example_config.cfg @@ -15,9 +15,7 @@ [parameters] client=mars dataset=ecmwf-mars-output -target_filename=.nc -target_path=gs://ecmwf-downloads/hres-single-level -append_date_dirs=true +target_path=gs://ecmwf-downloads/hres-single-level/{date:%%Y/%%m/%%d}.nc partition_keys= date diff --git a/configs/mars_example_config.json b/configs/mars_example_config.json index 28041677..a7677988 100644 --- a/configs/mars_example_config.json +++ b/configs/mars_example_config.json @@ -2,9 +2,7 @@ "parameters": { "client": "mars", "dataset": "ecmwf-mars-output", - "target_filename": ".nc", - "target_path": "gs://ecmwf-downloads/hres-single-level", - "append_date_dirs": "true", + "target_path": "gs://ecmwf-downloads/hres-single-level/{:%Y/%m/%d}.nc", "partition_keys": "date" }, diff --git a/configs/seasonal_forecast_example_config.cfg b/configs/seasonal_forecast_example_config.cfg index f58c47bb..d1a8d76c 100644 --- a/configs/seasonal_forecast_example_config.cfg +++ b/configs/seasonal_forecast_example_config.cfg @@ -15,7 +15,7 @@ [parameters] client=cds dataset=seasonal-original-single-levels -target_path=gs://ecmwf-output-test/seasonal-forecast/seasonal-forecast-{}-{}.nc +target_path=gs://ecmwf-output-test/seasonal-forecast/seasonal-forecast-{year:04d}-{month:02d}.nc partition_keys= year month diff --git a/weather_dl/download_pipeline/fetcher_test.py b/weather_dl/download_pipeline/fetcher_test.py index afed07ee..74bc7e1b 100644 --- a/weather_dl/download_pipeline/fetcher_test.py +++ b/weather_dl/download_pipeline/fetcher_test.py @@ -73,7 +73,7 @@ def test_fetch_data(self, mock_retrieve, mock_gcs_file): 'parameters': { 'dataset': 'reanalysis-era5-pressure-levels', 'partition_keys': ['year', 
'month'], - 'target_path': 'gs://weather-dl-unittest/download-{}-{}.nc', + 'target_path': 'gs://weather-dl-unittest/download-{:02d}-{:02d}.nc', 'api_url': 'https//api-url.com/v1/', 'api_key': '12345', }, @@ -104,7 +104,7 @@ def test_fetch_data__manifest__returns_success(self, mock_retrieve, mock_gcs_fil 'parameters': { 'dataset': 'reanalysis-era5-pressure-levels', 'partition_keys': ['year', 'month'], - 'target_path': 'gs://weather-dl-unittest/download-{}-{}.nc', + 'target_path': 'gs://weather-dl-unittest/download-{:02d}-{:02d}.nc', 'api_url': 'https//api-url.com/v1/', 'api_key': '12345', }, @@ -132,7 +132,7 @@ def test_fetch_data__manifest__records_retrieve_failure(self, mock_retrieve): 'parameters': { 'dataset': 'reanalysis-era5-pressure-levels', 'partition_keys': ['year', 'month'], - 'target_path': 'gs://weather-dl-unittest/download-{}-{}.nc', + 'target_path': 'gs://weather-dl-unittest/download-{:02d}-{:02d}.nc', 'api_url': 'https//api-url.com/v1/', 'api_key': '12345', }, @@ -169,7 +169,7 @@ def test_fetch_data__manifest__records_gcs_failure(self, mock_retrieve, mock_gcs 'parameters': { 'dataset': 'reanalysis-era5-pressure-levels', 'partition_keys': ['year', 'month'], - 'target_path': 'gs://weather-dl-unittest/download-{}-{}.nc', + 'target_path': 'gs://weather-dl-unittest/download-{:02d}-{:02d}.nc', 'api_url': 'https//api-url.com/v1/', 'api_key': '12345', }, @@ -205,7 +205,7 @@ def test_fetch_data__skips_existing_download(self, mock_retrieve, mock_gcs_file) 'parameters': { 'dataset': 'reanalysis-era5-pressure-levels', 'partition_keys': ['year', 'month'], - 'target_path': 'gs://weather-dl-unittest/download-{}-{}.nc', + 'target_path': 'gs://weather-dl-unittest/download-{year:02d}-{month:02d}.nc', 'api_url': 'https//api-url.com/v1/', 'api_key': '12345', }, diff --git a/weather_dl/download_pipeline/parsers.py b/weather_dl/download_pipeline/parsers.py index 90c169a1..dd29bac0 100644 --- a/weather_dl/download_pipeline/parsers.py +++ 
b/weather_dl/download_pipeline/parsers.py @@ -17,12 +17,12 @@ import copy as cp import datetime import json -import os +import ast import string import textwrap import typing as t from urllib.parse import urlparse - +from collections import OrderedDict from .clients import CLIENTS from .manifest import MANIFESTS, Manifest, Location, NoOpManifest @@ -69,6 +69,81 @@ def date(candidate: str) -> datetime.date: return converted +def time(candidate: str) -> datetime.time: + """Converts ECMWF-format time strings into a `datetime.time`. + + Accepted time formats: + - HH:MM + - HHMM + - HH + + For example: + - 18:00 + - 1820 + - 18 + + Note: If MM is omitted it defaults to 00. + """ + converted = None + + accepted_formats = ["%H", "%H:%M", "%H%M"] + + for fmt in accepted_formats: + try: + converted = datetime.datetime.strptime(candidate, fmt).time() + break + except ValueError: + pass + + if converted is None: + raise ValueError( + f"Not a valid time: '{candidate}'. Please use valid format." + ) + + return converted + + +def day_month_year(candidate: t.Any) -> int: + """Converts day, month and year values (str or int) into 'int'; raises ValueError for anything else.""" + try: + if isinstance(candidate, str) or isinstance(candidate, int): + return int(candidate) + raise ValueError(f"unsupported type '{type(candidate).__name__}'") + except ValueError as e: + raise ValueError( + f"Not a valid day, month, or year value: {candidate}. Please use valid value." + ) from e + + +def parse_literal(candidate: t.Any) -> t.Any: + try: + return ast.literal_eval(candidate) + except (ValueError, TypeError, SyntaxError, MemoryError, RecursionError): + return candidate + + +def validate(key: str, value: int) -> None: + """Validates value based on the key.""" + if key == "day": + assert 1 <= value <= 31, "Day value must be between 1 to 31." + if key == "month": + assert 1 <= value <= 12, "Month value must be between 1 to 12." 
+ + +def typecast(key: str, value: t.Any) -> t.Any: + """Type the value to its appropriate datatype.""" + SWITCHER = { + 'date': date, + 'time': time, + 'day': day_month_year, + 'month': day_month_year, + 'year': day_month_year, + } + converted = SWITCHER.get(key, parse_literal)(value) + validate(key, converted) + return converted + + def parse_config(file: t.IO) -> Config: """Parses a `*.json` or `*.cfg` file into a configuration dictionary.""" try: @@ -210,7 +285,7 @@ def _parse_lists(config_parser: configparser.ConfigParser, section: str = '') -> config = dict(config_parser.items(section)) for key, val in config.items(): - if '/' in val and 'parameters' not in section: + if ('/' in val or key == 'date') and 'parameters' not in section: config[key] = parse_mars_syntax(val) elif '\n' in val: config[key] = _splitlines(val) @@ -222,13 +297,6 @@ def _number_of_replacements(s: t.Text): return len([v for v in string.Formatter().parse(s) if v[1] is not None]) -def use_date_as_directory(config: t.Dict): - return 'partition_keys' in config['parameters'] \ - and 'date' in config['parameters']['partition_keys'] \ - and config['parameters'].get('append_date_dirs', 'false') == 'true' \ - and 'target_filename' in config['parameters'] - - def parse_subsections(config: t.Dict) -> t.Dict: """Interprets [section.subsection] as nested dictionaries in `.cfg` files. 
@@ -262,10 +330,10 @@ def process_config(file: t.IO) -> Config: """Read the config file and prompt the user if it is improperly structured.""" config = parse_config(file) - def require(condition: bool, message: str) -> None: - """A assert-like helper that wraps text and throws a `ValueError`.""" + def require(condition: bool, message: str, error_type: t.Type[Exception] = ValueError) -> None: + """A assert-like helper that wraps text and throws an error.""" if not condition: - raise ValueError(textwrap.dedent(message)) + raise error_type(textwrap.dedent(message)) require(bool(config), "Unable to parse configuration file.") require('parameters' in config, @@ -303,12 +371,21 @@ def require(condition: bool, message: str) -> None: Supported clients are {} """.format(str(list(CLIENTS.keys())))) - if params.get('append_date_dirs', 'false') == 'true': - require(use_date_as_directory(config), - """ - 'append_date_dirs' set to true, but creating the date directory hierarchy also - requires that 'target_filename' is given and that 'date' is a partition_key. - """) + require('append_date_dirs' not in params, + """ + The current version of 'google-weather-tools' no longer supports 'append_date_dirs'! + + Please refer to documentation for creating date-based directory hierarchy : + https://weather-tools.readthedocs.io/en/latest/Configuration.html#""" + """creating-a-date-based-directory-hierarchy.""", + NotImplementedError) + require('target_filename' not in params, + """ + The current version of 'google-weather-tools' no longer supports 'target_filename'! 
+ + Please refer to documentation : + https://weather-tools.readthedocs.io/en/latest/Configuration.html#parameters-section.""", + NotImplementedError) partition_keys = params.get('partition_keys', list()) if isinstance(partition_keys, str): @@ -323,22 +400,12 @@ def require(condition: bool, message: str) -> None: documentation for more information.""") num_template_replacements = _number_of_replacements(params['target_path']) - if 'target_filename' in params: - num_template_replacements += _number_of_replacements(params['target_filename']) num_partition_keys = len(partition_keys) - if use_date_as_directory(config): - num_partition_keys -= 1 - target_path = t.cast(str, params.get('target_path', '')) - if target_path != '': - params['target_path'] = target_path.rstrip('/') require(num_template_replacements == num_partition_keys, """ 'target_path' has {0} replacements. Expected {1}, since there are {1} partition keys. - - Note: If date is used to create a directory hierarchy - no replacement is needed for 'date') """.format(num_template_replacements, num_partition_keys)) # Ensure consistent lookup. 
@@ -350,22 +417,12 @@ def require(condition: bool, message: str) -> None: def prepare_target_name(config: Config) -> str: """Returns name of target location.""" parameters = config['parameters'] - target_path = t.cast(str, parameters.get('target_path', '')) - target_filename = t.cast(str, parameters.get('target_filename', '')) partition_keys = t.cast(t.List[str], cp.copy(parameters.get('partition_keys', list()))) - if use_date_as_directory(config): - date = t.cast(str, config['selection']['date'][0]) - date_vals = date.split('-') - target_path = os.path.join(target_path, *date_vals) - partition_keys.remove('date') - - target_path += target_filename - - partition_key_values = [config['selection'][key][0] for key in partition_keys] - target = target_path.format(*partition_key_values) + partition_dict = OrderedDict((key, typecast(key, config['selection'][key][0])) for key in partition_keys) + target = target_path.format(*partition_dict.values(), **partition_dict) return target diff --git a/weather_dl/download_pipeline/parsers_test.py b/weather_dl/download_pipeline/parsers_test.py index 9796a684..d6f40373 100644 --- a/weather_dl/download_pipeline/parsers_test.py +++ b/weather_dl/download_pipeline/parsers_test.py @@ -680,15 +680,14 @@ def test_mismatched_template_partition_keys(self): "'target_path' has 1 replacements. Expected 2", ctx.exception.args[0]) - def test_date_as_directory_key_mismatch(self): - with self.assertRaises(ValueError) as ctx: + def test_append_date_dirs_raise_error(self): + with self.assertRaises(NotImplementedError) as ctx: with io.StringIO( """ [parameters] dataset=foo client=cds - target_path=somewhere/ - target_filename=bar-{} + target_path=somewhere/bar-{} append_date_dirs=true partition_keys= date @@ -699,18 +698,21 @@ def test_date_as_directory_key_mismatch(self): process_config(f) self.assertIn( - "'target_path' has 1 replacements. Expected 0", + "The current version of 'google-weather-tools' no longer supports 'append_date_dirs'!" 
+ "\n\nPlease refer to documentation for creating date-based directory hierarchy :\n" + "https://weather-tools.readthedocs.io/en/latest/Configuration.html" + "#creating-a-date-based-directory-hierarchy.", ctx.exception.args[0]) - def test_append_date_dirs_without_filename(self): - with self.assertRaises(ValueError) as ctx: + def test_target_filename_raise_error(self): + with self.assertRaises(NotImplementedError) as ctx: with io.StringIO( """ [parameters] dataset=foo client=cds target_path=somewhere/ - append_date_dirs=true + target_filename=bar-{} partition_keys= date [selection] @@ -720,69 +722,11 @@ def test_append_date_dirs_without_filename(self): process_config(f) self.assertIn( - "'append_date_dirs' set to true, but creating the date directory hierarchy", - ctx.exception.args[0]) - - def test_append_date_dirs_without_date_partition(self): - with self.assertRaises(ValueError) as ctx: - with io.StringIO( - """ - [parameters] - dataset=foo - client=cds - target_path=somewhere/ - target_filename=bar - append_date_dirs=true - partition_keys= - pressure - [selection] - pressure=500 - """ - ) as f: - process_config(f) - - self.assertIn( - "'append_date_dirs' set to true, but creating the date directory hierarchy", - ctx.exception.args[0]) - - def test_append_date_dirs_without_partition_keys(self): - with self.assertRaises(ValueError) as ctx: - with io.StringIO( - """ - [parameters] - dataset=foo - client=cds - target_path=somewhere/ - target_filename=bar - append_date_dirs=true - [selection] - pressure=500 - """ - ) as f: - process_config(f) - - self.assertIn( - "'append_date_dirs' set to true, but creating the date directory hierarchy", + "The current version of 'google-weather-tools' no longer supports 'target_filename'!" 
+ "\n\nPlease refer to documentation :\n" + "https://weather-tools.readthedocs.io/en/latest/Configuration.html#parameters-section.", ctx.exception.args[0]) - def test_date_as_directory_target_directory_ends_in_slash(self): - with io.StringIO( - """ - [parameters] - dataset=foo - client=cds - target_path=somewhere/ - target_filename=bar - append_date_dirs=true - partition_keys= - date - [selection] - date=2017-01-01/to/2017-01-01 - """ - ) as f: - config = process_config(f) - self.assertEqual(config['parameters']['target_path'], "somewhere") - def test_client_not_set(self): with self.assertRaises(ValueError) as ctx: with io.StringIO( @@ -840,7 +784,7 @@ class PrepareTargetNameTest(unittest.TestCase): 'year': ['02'] } }, - expected='download-02-12.nc'), + expected='download-2-12.nc'), dict(case='Has date but no target directory.', config={ 'parameters': { @@ -857,9 +801,8 @@ class PrepareTargetNameTest(unittest.TestCase): dict(case='Has Directory, but no date', config={ 'parameters': { - 'target_path': 'somewhere/', + 'target_path': 'somewhere/download/{:02d}/{:02d}.nc', 'partition_keys': ['year', 'month'], - 'target_filename': 'download/{}/{}.nc', 'force_download': False }, 'selection': { @@ -873,9 +816,7 @@ class PrepareTargetNameTest(unittest.TestCase): config={ 'parameters': { 'partition_keys': ['date'], - 'target_path': 'somewhere', - 'target_filename': '-download.nc', - 'append_date_dirs': 'true', + 'target_path': 'somewhere/{date:%Y/%m/%d}-download.nc', 'force_download': False }, 'selection': { @@ -887,9 +828,7 @@ class PrepareTargetNameTest(unittest.TestCase): config={ 'parameters': { 'partition_keys': ['date', 'pressure_level'], - 'target_path': 'somewhere', - 'target_filename': '-pressure-{}.nc', - 'append_date_dirs': 'true', + 'target_path': 'somewhere/{date:%Y/%m/%d}-pressure-{pressure_level}.nc', 'force_download': False }, 'selection': { @@ -903,9 +842,7 @@ class PrepareTargetNameTest(unittest.TestCase): config={ 'parameters': { 'partition_keys': 
['date', 'expver', 'pressure_level'], - 'target_path': 'somewhere/expver-{}', - 'target_filename': '-pressure-{}.nc', - 'append_date_dirs': 'true', + 'target_path': 'somewhere/expver-{expver}/{date:%Y/%m/%d}-pressure-{pressure_level}.nc', 'force_download': False }, 'selection': { diff --git a/weather_dl/download_pipeline/pipeline_test.py b/weather_dl/download_pipeline/pipeline_test.py index 1191e703..32487600 100644 --- a/weather_dl/download_pipeline/pipeline_test.py +++ b/weather_dl/download_pipeline/pipeline_test.py @@ -38,7 +38,8 @@ config={ 'parameters': {'client': 'cds', 'dataset': 'reanalysis-era5-pressure-levels', - 'target_path': 'gs://ecmwf-output-test/era5/{}/{}/{}-pressure-{}.nc', + 'target_path': 'gs://ecmwf-output-test/era5/{year:04d}/{month:02d}/{day:02d}' + '-pressure-{pressure_level}.nc', 'partition_keys': ['year', 'month', 'day', 'pressure_level'], 'force_download': False, 'user_id': getpass.getuser()}, diff --git a/weather_dl/setup.py b/weather_dl/setup.py index aaa11f82..9f1ba96e 100644 --- a/weather_dl/setup.py +++ b/weather_dl/setup.py @@ -32,7 +32,7 @@ setup( name='download_pipeline', packages=find_packages(), - version='0.1.3', + version='0.1.4', author='Anthromets', author_email='anthromets-ecmwf@google.com', url='https://weather-tools.readthedocs.io/en/latest/weather_dl/',