From 40e72aaa90e15b3c9714f3ae5b71e40ec1201df9 Mon Sep 17 00:00:00 2001 From: Yongming Ding Date: Thu, 9 Jan 2025 14:43:09 -0800 Subject: [PATCH 1/2] Introduce Multi-Storage Client (MSC) as an optional dependency --- CHANGELOG.md | 1 + examples/multi_storage_client/README.md | 73 +++++++++++++++++++ examples/multi_storage_client/msc_config.yaml | 19 +++++ .../multi_storage_client/requirements.txt | 1 + pyproject.toml | 2 + 5 files changed, 96 insertions(+) create mode 100644 examples/multi_storage_client/README.md create mode 100644 examples/multi_storage_client/msc_config.yaml create mode 100644 examples/multi_storage_client/requirements.txt diff --git a/CHANGELOG.md b/CHANGELOG.md index fe4cc80cf..00c293421 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -27,6 +27,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Dependencies - Remove the numpy dependency upper bound. +- Introduce Multi-Storage Client (MSC) as an optional dependency. ## [0.9.0] - 2024-12-04 diff --git a/examples/multi_storage_client/README.md b/examples/multi_storage_client/README.md new file mode 100644 index 000000000..44a9dc3ca --- /dev/null +++ b/examples/multi_storage_client/README.md @@ -0,0 +1,73 @@ +# Training from Object Storage using Multi-Storage Client + +## What is Multi-Storage Client (MSC)? + +[Multi-Storage Client](https://github.com/NVIDIA/multi-storage-client) is a Python +library that provides a unified interface for accessing various object stores and +file systems. It makes it easy for ML workloads to use object stores by providing +a familiar file-like interface without sacrificing performance. The library adds +new functionality, such as caching and client-side observability, and leverages the native +SDKs specific to each object store for optimal performance. 
+ +## Getting Started + +### Installation + +```bash +pip install -r requirements.txt +``` +Or install different extra dependencies based on your object storage backend: +```bash +# POSIX file systems. +pip install multi-storage-client + +# NVIDIA AIStore. +pip install "multi-storage-client[aistore]" + +# Azure Blob Storage. +pip install "multi-storage-client[azure-storage-blob]" + +# AWS S3 and S3-compatible object stores. +pip install "multi-storage-client[boto3]" + +# Google Cloud Storage (GCS). +pip install "multi-storage-client[google-cloud-storage]" + +# Oracle Cloud Infrastructure (OCI) Object Storage. +pip install "multi-storage-client[oci]" +``` + +### Configuration File + +The MSC configuration file defines profiles which include storage provider configurations. +An example MSC configuration file can be found at [msc_config.yaml](./msc_config.yaml). +In this example, the data is stored in the `cwb-diffusions` bucket in a S3-compatible +object store and credentials are inferred from the environment variables `S3_KEY` and `S3_SECRET`. + +## Update Code Path with MSC + +For Modulus’s use cases, where Zarr is commonly used in training workflows, +migrating to MSC is a straightforward process involving only configuration changes. +For example, in the [Corrdiff](../generative/corrdiff/) training example, data +currently accessed from the file system can be updated to MSC by modifying the +input path from `/code/2023-01-24-cwb-4years.zarr` to `msc://cwb-diffusions/2023-01-24-cwb-4years.zarr`, +with the MSC configuration file defined in [msc_config.yaml](./msc_config.yaml). +This assumes the data stored in the local file has been moved to a S3 bucket `cwb-diffusions`. 
+ +### Current code path (Training from File System): + +```bash +input_path = "/code/2023-01-24-cwb-4years.zarr" +zarr.open_consolidated(input_path) +``` + +### Updated code path (Training from Object Store using MSC): + +```bash +input_path = "msc://cwb-diffusions/2023-01-24-cwb-4years.zarr" +zarr.open_consolidated(input_path) +``` + +## Additional Information + +- [Multi-Storage Client Documentation](https://nvidia.github.io/multi-storage-client/) diff --git a/examples/multi_storage_client/msc_config.yaml b/examples/multi_storage_client/msc_config.yaml new file mode 100644 index 000000000..742fb0193 --- /dev/null +++ b/examples/multi_storage_client/msc_config.yaml @@ -0,0 +1,19 @@ +# This is an example MSC configuration file for accessing the cwb datasets stored +# in an S3-compatible bucket cwb-diffusions. +# The credentials are inferred from the environment variables S3_KEY and S3_SECRET. +profiles: + cwb-diffusions: + storage_provider: + type: s3 + options: + region_name: us-east-1 + endpoint_url: https://pbss.s8k.io + base_path: cwb-diffusions + credentials_provider: + type: S3Credentials + options: + access_key: ${S3_KEY} + secret_key: ${S3_SECRET} + cache: + location: /tmp/.cache + size_mb: 5000 diff --git a/examples/multi_storage_client/requirements.txt b/examples/multi_storage_client/requirements.txt new file mode 100644 index 000000000..0c472bafc --- /dev/null +++ b/examples/multi_storage_client/requirements.txt @@ -0,0 +1 @@ +multi-storage-client[boto3] diff --git a/pyproject.toml b/pyproject.toml index 24d4da7cf..790646b21 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -59,6 +59,7 @@ dev = [ "interrogate==1.5.0", "coverage==6.5.0", "ruff==0.0.290", + "multi-storage-client>=0.12.2", ] makani = [ @@ -94,6 +95,7 @@ all = [ "nvidia-modulus[dev]", "nvidia-modulus[makani]", "nvidia-modulus[fignet]", + "multi-storage-client[boto3]", ] From 507cbdff42cd30971e723bb3e05eaaef14ef143d Mon Sep 17 00:00:00 2001 From: Yongming Ding Date: Wed, 29 Jan 2025 
15:14:15 -0800 Subject: [PATCH 2/2] Address comments --- examples/multi_storage_client/README.md | 43 +++++++++++++++---- examples/multi_storage_client/msc_config.yaml | 24 ++++------- pyproject.toml | 7 ++- 3 files changed, 48 insertions(+), 26 deletions(-) diff --git a/examples/multi_storage_client/README.md b/examples/multi_storage_client/README.md index 44a9dc3ca..00fa346ae 100644 --- a/examples/multi_storage_client/README.md +++ b/examples/multi_storage_client/README.md @@ -1,4 +1,4 @@ -# Training from Object Storage using Multi-Storage Client +# Training from Object Storage using Multi-Storage Client ## What is Multi-Storage Client (MSC)? @@ -16,7 +16,9 @@ SDKs specific to each object store for optimal performance. ```bash pip install -r requirements.txt ``` + Or install different extra dependencies based on your object storage backend: + ```bash # POSIX file systems. pip install multi-storage-client @@ -41,27 +43,50 @@ pip install "multi-storage-client[oci]" The MSC configuration file defines profiles which include storage provider configurations. An example MSC configuration file can be found at [msc_config.yaml](./msc_config.yaml). -In this example, the data is stored in the `cwb-diffusions` bucket in a S3-compatible -object store and credentials are inferred from the environment variables `S3_KEY` and `S3_SECRET`. +In this example, we're pointing to the [CMIP6 archive on AWS](https://registry.opendata.aws/cmip6/). + +## Usage Example + +MSC supports fsspec and integrates with frameworks such as Zarr and Xarray via +the fsspec interface. 
The following example demonstrates how to use Zarr to +access the CMIP6 dataset stored in AWS S3: + +```bash +export MSC_CONFIG=./msc_config.yaml +python +>>> import zarr +>>> zarr_group = zarr.open("msc://cmip6-pds/CMIP6/ScenarioMIP/NOAA-GFDL/GFDL-ESM4/ssp119/r1i1p1f1/day/tas/gr1/v20180701") +>>> zarr_group.tree() +/ + ├── bnds (2,) float64 + ├── height () float64 + ├── lat (180,) float64 + ├── lat_bnds (180, 2) float64 + ├── lon (288,) float64 + ├── lon_bnds (288, 2) float64 + ├── tas (31390, 180, 288) float32 + ├── time (31390,) int64 + └── time_bnds (31390, 2) float64 +``` -## Update Code Path with MSC +## Update Existing Code Path with MSC -For Modulus’s use cases, where Zarr is commonly used in training workflows, +For other Modulus examples, where Zarr is commonly used in training workflows, migrating to MSC is a straightforward process involving only configuration changes. For example, in the [Corrdiff](../generative/corrdiff/) training example, data currently accessed from the file system can be updated to MSC by modifying the input path from `/code/2023-01-24-cwb-4years.zarr` to `msc://cwb-diffusions/2023-01-24-cwb-4years.zarr`, -with the MSC configuration file defined in [msc_config.yaml](./msc_config.yaml). -This assumes the data stored in the local file has been moved to a S3 bucket `cwb-diffusions`. +assuming the locally stored data has been moved to an S3 bucket `cwb-diffusions`, +and MSC has a profile `cwb-diffusions` pointing to this S3 bucket. 
-### Current code path (Training from File System): +### Current code path (Training from File System) ```bash input_path = "/code/2023-01-24-cwb-4years.zarr" zarr.open_consolidated(input_path) ``` -### Updated code path (Training from Object Store using MSC): +### Updated code path (Training from Object Store using MSC) ```bash input_path = "msc://cwb-diffusions/2023-01-24-cwb-4years.zarr" diff --git a/examples/multi_storage_client/msc_config.yaml b/examples/multi_storage_client/msc_config.yaml index 742fb0193..74042eea2 100644 --- a/examples/multi_storage_client/msc_config.yaml +++ b/examples/multi_storage_client/msc_config.yaml @@ -1,19 +1,13 @@ -# This is an example MSC configuration file for accessing the cwb datasets stored -# in an S3-compatible bucket cwb-diffusions. -# The credentials are inferred from the environment variables S3_KEY and S3_SECRET. +# This is an example MSC configuration file for accessing the CMIP6 archive on AWS: +# https://registry.opendata.aws/cmip6/ profiles: - cwb-diffusions: + cmip6-pds: storage_provider: type: s3 options: - region_name: us-east-1 - endpoint_url: https://pbss.s8k.io - base_path: cwb-diffusions - credentials_provider: - type: S3Credentials - options: - access_key: ${S3_KEY} - secret_key: ${S3_SECRET} - cache: - location: /tmp/.cache - size_mb: 5000 + region_name: us-west-2 + base_path: cmip6-pds + signature_version: UNSIGNED +cache: + location: /tmp/.cache + size_mb: 5000 diff --git a/pyproject.toml b/pyproject.toml index 790646b21..8a086d88a 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -59,7 +59,6 @@ dev = [ "interrogate==1.5.0", "coverage==6.5.0", "ruff==0.0.290", - "multi-storage-client>=0.12.2", ] makani = [ @@ -78,6 +77,10 @@ fignet = [ "webdataset>=0.2", ] +storage = [ + "multi-storage-client>=0.14.0", +] + all = [ "h5py>=3.7.0", "netcdf4>=1.6.3", @@ -95,7 +98,7 @@ all = [ "nvidia-modulus[dev]", "nvidia-modulus[makani]", "nvidia-modulus[fignet]", - "multi-storage-client[boto3]", + 
"nvidia-modulus[storage]", ]