Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add Foundry client and server #61

Open
wants to merge 40 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
40 commits
Select commit Hold shift + click to select a range
4c81440
Add files for Foundry client and server
wesselb Jan 8, 2025
1176d6f
Fix Docker requirements
wesselb Jan 8, 2025
fe534cf
add swagger3.json
temporaer Jan 8, 2025
18efcdc
place swagger correctly, incorporate feedback
temporaer Jan 8, 2025
deb9955
minor
temporaer Jan 8, 2025
fb30897
reorganize Dockerfile for faster iteration
temporaer Jan 8, 2025
5ffd9f3
dispatch based on HTTP method
temporaer Jan 8, 2025
00e49aa
minor
temporaer Jan 8, 2025
7eee7ed
docstr
temporaer Jan 8, 2025
f0af8d0
linting
temporaer Jan 9, 2025
26a9a98
update api.py
temporaer Jan 9, 2025
bc3a19a
simplify
temporaer Jan 9, 2025
7037138
WIP
temporaer Jan 9, 2025
cb2f8cc
WIP
temporaer Jan 9, 2025
d0fe591
WIP
temporaer Jan 9, 2025
27a8a64
api tests passing
temporaer Jan 9, 2025
b8f473a
fix
temporaer Jan 9, 2025
afdc6a4
minor
temporaer Jan 9, 2025
30c7e35
Add missing dev dependencies
wesselb Jan 9, 2025
c173425
Fix formatting
wesselb Jan 9, 2025
06ee747
Test blob storage protocol also for Docker image
wesselb Jan 9, 2025
9930555
simplify submission spec
temporaer Jan 9, 2025
916d135
linting
temporaer Jan 9, 2025
acbf8db
auto-generate swagger-file
temporaer Jan 9, 2025
e3b670c
add make target to build docker image on ACR
temporaer Jan 10, 2025
386dbcd
Simplify test and use right Docker image
wesselb Jan 10, 2025
1060e4a
Rework interaction between client and host
wesselb Jan 10, 2025
61a543f
Improve URL parsing
wesselb Jan 10, 2025
81f46e1
Only require one communication channel
wesselb Jan 10, 2025
dc29d38
Simplify CI pipeline
wesselb Jan 10, 2025
472cbbf
Fix url parsing
wesselb Jan 10, 2025
23ceb53
Add some types
wesselb Jan 10, 2025
ad87695
Add test that uses a real blob storage container
wesselb Jan 10, 2025
993f8fd
Add Docker test for an actual container
wesselb Jan 10, 2025
d33c147
Remove deps from notebooks
wesselb Jan 10, 2025
27321cd
Add outline of demo
wesselb Jan 10, 2025
842dbc7
Let the demo depend on env vars
wesselb Jan 10, 2025
fd6ef69
Fix plotting
wesselb Jan 10, 2025
6de5ca9
Remove random data
wesselb Jan 10, 2025
22e8fd3
Increment version
wesselb Jan 10, 2025
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 13 additions & 3 deletions .github/workflows/ci.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -17,14 +17,24 @@ jobs:
name: Test with Python ${{ matrix.version }}
steps:
- uses: actions/checkout@v2

- name: Set up Python ${{ matrix.version }}
uses: actions/setup-python@v2
with:
python-version: ${{ matrix.version }}

- name: Set up Docker Buildx
uses: docker/setup-buildx-action@v3

- name: Build Foundry image
run: |
make docker
- name: Install dependencies
run: |
python -m pip install --upgrade pip
python -m pip install --upgrade --no-cache-dir -e '.[dev]'
python -m pip install --upgrade pip
python -m pip install --upgrade --no-cache-dir -e '.[dev]'
- name: Run tests
run: |
pytest -v --cov=aurora --cov-report term-missing
make test
45 changes: 45 additions & 0 deletions Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
# Use the Azure ML OpenMPI 4.1.0 (Ubuntu 20.04) image as the parent image.
FROM mcr.microsoft.com/azureml/openmpi4.1.0-ubuntu20.04:latest

WORKDIR /aurora_foundry
COPY ./pyproject.toml .

# Assuming dependencies are fairly fixed, we can install them first and then copy the rest of the
# code to avoid re-installing dependencies when the code changes.
COPY _docker_requirements.txt .
RUN pip install --upgrade pip virtualenv && \
virtualenv venv -p python3.10 && \
. venv/bin/activate && \
pip install -r _docker_requirements.txt

# Download model weights.
RUN ./venv/bin/python -c 'from huggingface_hub import hf_hub_download; hf_hub_download(repo_id="microsoft/aurora", filename="aurora-0.25-small-pretrained.ckpt")' && \
./venv/bin/python -c 'from huggingface_hub import hf_hub_download; hf_hub_download(repo_id="microsoft/aurora", filename="aurora-0.25-finetuned.ckpt")'

COPY ./LICENSE.txt .
COPY ./README.md .

# Install `azcopy` and the AML inference server.
RUN wget https://aka.ms/downloadazcopy-v10-linux -O azcopy.tar.gz && \
cp $(tar -xvzf azcopy.tar.gz | grep azcopy$) /usr/local/bin/azcopy
RUN . ./venv/bin/activate && \
pip install azureml-inference-server-http

COPY ./aurora ./aurora
ARG AURORA_REPO_VERSION
RUN [ ! -z "${AURORA_REPO_VERSION}" ] || { echo "AURORA_REPO_VERSION must be set."; exit 1; } && \
. venv/bin/activate && \
SETUPTOOLS_SCM_PRETEND_VERSION="$AURORA_REPO_VERSION" pip install -e .

# Make port 5001 available to the world outside this container.
EXPOSE 5001
ENV PORT=5001

# We don't have a `swagger2.json` file, so we "ignore" the version option and always return a
# version 3 file.
RUN cp ./aurora/foundry/server/swagger3.json ./swagger2.json && \
cp ./aurora/foundry/server/swagger3.json ./swagger2.0.json && \
cp ./aurora/foundry/server/swagger3.json ./swagger3.1.json && \
cp ./aurora/foundry/server/swagger3.json ./swagger3.0.json && \
cp ./aurora/foundry/server/swagger3.json ./swagger3.json

CMD ["./venv/bin/azmlinfsrv", "--entry_script", "aurora/foundry/server/score.py"]
24 changes: 22 additions & 2 deletions Makefile
Original file line number Diff line number Diff line change
@@ -1,13 +1,33 @@
.PHONY: install test docs
.PHONY: install test docs docker-requirements docker swagger-file

DOCKER_WS ?= testwsacr
DOCKER_IMAGE ?= aurora-foundry:20250110-2

install:
pip install --upgrade pip
pip install -e ".[dev]"
pre-commit install

test:
pytest tests -v --cov=aurora --cov-report=term --cov-report=html
DOCKER_IMAGE=$(DOCKER_WS).azurecr.io/$(DOCKER_IMAGE) pytest tests -v --cov=aurora --cov-report=term --cov-report=html

docs:
jupyter-book build docs
cp -r docs/_extras/* docs/_build/html/

docker-requirements: pyproject.toml
(pip show pip-tools 1>/dev/null) || pip install pip-tools
pip-compile --verbose --output-file _docker_requirements.txt pyproject.toml

docker:
(pip show setuptools-scm 1>/dev/null) || pip install setuptools-scm
AURORA_REPO_VERSION=`python -m setuptools_scm` docker build --build-arg AURORA_REPO_VERSION -t $(DOCKER_WS).azurecr.io/$(DOCKER_IMAGE) .

docker-acr:
(pip show setuptools-scm 1>/dev/null) || pip install setuptools-scm
[ ! -z "$(ACR)" ]
AURORA_REPO_VERSION=`python -m setuptools_scm` az acr build --build-arg AURORA_REPO_VERSION -r "$(ACR)" -t $(DOCKER_IMAGE) .

swagger-file:
pip install fastapi
python aurora/foundry/server/generate_swagger.py aurora/foundry/server/swagger3.json
137 changes: 137 additions & 0 deletions _docker_requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,137 @@
#
# This file is autogenerated by pip-compile with Python 3.10
# by the following command:
#
# pip-compile --output-file=requirements.txt pyproject.toml
#
annotated-types==0.7.0
# via pydantic
certifi==2024.12.14
# via
# netcdf4
# requests
cftime==1.6.4.post1
# via netcdf4
charset-normalizer==3.4.1
# via requests
einops==0.8.0
# via microsoft-aurora (pyproject.toml)
filelock==3.16.1
# via
# huggingface-hub
# torch
# triton
fsspec==2024.12.0
# via
# huggingface-hub
# torch
huggingface-hub==0.27.1
# via
# microsoft-aurora (pyproject.toml)
# timm
idna==3.10
# via requests
jinja2==3.1.5
# via torch
markupsafe==3.0.2
# via jinja2
mpmath==1.3.0
# via sympy
netcdf4==1.7.2
# via microsoft-aurora (pyproject.toml)
networkx==3.4.2
# via torch
numpy==2.2.1
# via
# cftime
# microsoft-aurora (pyproject.toml)
# netcdf4
# pandas
# scipy
# torchvision
# xarray
nvidia-cublas-cu12==12.4.5.8
# via
# nvidia-cudnn-cu12
# nvidia-cusolver-cu12
# torch
nvidia-cuda-cupti-cu12==12.4.127
# via torch
nvidia-cuda-nvrtc-cu12==12.4.127
# via torch
nvidia-cuda-runtime-cu12==12.4.127
# via torch
nvidia-cudnn-cu12==9.1.0.70
# via torch
nvidia-cufft-cu12==11.2.1.3
# via torch
nvidia-curand-cu12==10.3.5.147
# via torch
nvidia-cusolver-cu12==11.6.1.9
# via torch
nvidia-cusparse-cu12==12.3.1.170
# via
# nvidia-cusolver-cu12
# torch
nvidia-nccl-cu12==2.21.5
# via torch
nvidia-nvjitlink-cu12==12.4.127
# via
# nvidia-cusolver-cu12
# nvidia-cusparse-cu12
# torch
nvidia-nvtx-cu12==12.4.127
# via torch
packaging==24.2
# via
# huggingface-hub
# xarray
pandas==2.2.3
# via xarray
pillow==11.1.0
# via torchvision
pydantic==2.10.4
# via microsoft-aurora (pyproject.toml)
pydantic-core==2.27.2
# via pydantic
python-dateutil==2.9.0.post0
# via pandas
pytz==2024.2
# via pandas
pyyaml==6.0.2
# via
# huggingface-hub
# timm
requests==2.32.3
# via huggingface-hub
scipy==1.15.0
# via microsoft-aurora (pyproject.toml)
six==1.17.0
# via python-dateutil
sympy==1.13.1
# via torch
timm==0.6.13
# via microsoft-aurora (pyproject.toml)
torch==2.5.1
# via
# microsoft-aurora (pyproject.toml)
# timm
# torchvision
torchvision==0.20.1
# via timm
tqdm==4.67.1
# via huggingface-hub
triton==3.1.0
# via torch
typing-extensions==4.12.2
# via
# huggingface-hub
# pydantic
# pydantic-core
# torch
tzdata==2024.2
# via pandas
urllib3==2.3.0
# via requests
xarray==2025.1.0
# via microsoft-aurora (pyproject.toml)
12 changes: 12 additions & 0 deletions aurora/foundry/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
"""Copyright (c) Microsoft Corporation. Licensed under the MIT license."""

from aurora.foundry.client.api import SubmissionError, submit
from aurora.foundry.client.foundry import FoundryClient
from aurora.foundry.common.channel import BlobStorageChannel

__all__ = [
"BlobStorageChannel",
"FoundryClient",
"submit",
"SubmissionError",
]
1 change: 1 addition & 0 deletions aurora/foundry/client/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
"""Copyright (c) Microsoft Corporation. Licensed under the MIT license."""
114 changes: 114 additions & 0 deletions aurora/foundry/client/api.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,114 @@
"""Copyright (c) Microsoft Corporation. Licensed under the MIT license.
This is the API that the end user uses to submit jobs to the model running on Azure AI Foundry.
"""

import logging
from typing import Generator

from pydantic import BaseModel

from aurora import Batch
from aurora.foundry.client.foundry import AbstractFoundryClient
from aurora.foundry.common.channel import CommunicationChannel, iterate_prediction_files
from aurora.foundry.common.model import models

__all__ = ["SubmissionError", "submit"]

logger = logging.getLogger(__name__)


class CreationInfo(BaseModel):
    """Response to a task-creation request, as validated by :func:`submit`."""

    # Identifier of the newly created task. `submit` uses it for all subsequent channel and
    # progress calls.
    task_id: str


class TaskInfo(BaseModel):
    """Progress report of a task, parsed by :func:`submit` from the response of a progress check."""

    # Identifier of the task (presumably the one the progress check was made for).
    task_id: str
    # Whether the task has finished. `submit` stops polling once this is true.
    completed: bool
    # Progress of the task, logged by `submit` as a percentage.
    progress_percentage: int
    # Outcome of the task. `submit` only consults this once `completed` is true; a falsy value
    # (including `None`) is then treated as a failure.
    success: bool | None
    # Whether the task has been submitted. Once true, `submit` expects to be able to read the
    # acknowledgement of the initial condition.
    submitted: bool
    # Status message. `submit` logs it whenever it changes and includes it in the error raised
    # for a failed task.
    status: str


class SubmissionError(Exception):
    """The submission could not be completed for some reason.

    :func:`submit` raises this when the task cannot be created, when the acknowledgement of the
    initial condition cannot be read in time, or when the task completes without success.
    """


def submit(
    batch: Batch,
    model_name: str,
    num_steps: int,
    channel: CommunicationChannel,
    foundry_client: AbstractFoundryClient,
) -> Generator[Batch, None, None]:
    """Submit a request to Azure AI Foundry and retrieve the predictions.

    This function is a generator: nothing is validated or sent until the first prediction is
    requested from it.

    Args:
        batch (:class:`aurora.Batch`): Initial condition.
        model_name (str): Name of the model. This name must be available in
            :mod:`aurora.foundry.common.model`.
        num_steps (int): Number of prediction steps.
        channel (:class:`aurora.foundry.common.channel.CommunicationChannel`): Channel to use for
            sending and receiving data.
        foundry_client (:class:`aurora.foundry.client.foundry.AbstractFoundryClient`): Client to
            communicate with Azure AI Foundry.

    Yields:
        :class:`aurora.Batch`: Predictions.

    Raises:
        KeyError: If `model_name` is not a valid model.
        SubmissionError: If the task could not be created, if the acknowledgement of the initial
            condition could not be read, or if the task failed.
    """
    if model_name not in models:
        raise KeyError(f"Model `{model_name}` is not a valid model.")

    # Create a task at the endpoint.
    task = {
        "model_name": model_name,
        "num_steps": num_steps,
        "data_folder_uri": channel.to_spec(),
    }
    response = foundry_client.submit_task(task)
    try:
        submission_info = CreationInfo(**response)
    except Exception as e:
        raise SubmissionError("Failed to create task.") from e
    task_id = submission_info.task_id
    logger.info(f"Created task `{task_id}` at endpoint.")

    # Send the initial condition over.
    channel.send(batch, task_id, "input.nc")

    previous_status: str = "No Status"
    previous_progress: int = 0
    # The acknowledgement only needs to be observed once. Without this flag, every later
    # progress check would read it again, which is redundant I/O on the channel.
    ack_read: bool = False

    # NOTE(review): there is no delay between progress checks here, so the polling rate is
    # determined entirely by `foundry_client.get_progress` -- confirm that it throttles.
    while True:
        # Check on the progress of the task. The first progress check will trigger the task to be
        # submitted.
        response = foundry_client.get_progress(task_id)
        task_info = TaskInfo(**response)

        if task_info.submitted and not ack_read:
            # If the task has been submitted, we must be able to read the acknowledgement of the
            # initial condition.
            try:
                channel.read(task_id, "input.nc.ack", timeout=120)
            except TimeoutError as e:
                raise SubmissionError("Could not read acknowledgement of initial condition.") from e
            ack_read = True

        if task_info.status != previous_status:
            logger.info(f"Task status update: {task_info.status}")
            previous_status = task_info.status

        if task_info.progress_percentage > previous_progress:
            logger.info(f"Task progress update: {task_info.progress_percentage}%.")
            previous_progress = task_info.progress_percentage

        if task_info.completed:
            if task_info.success:
                logger.info("Task has been successfully completed!")
                break
            else:
                raise SubmissionError(f"Task failed: {task_info.status}")

    logger.info("Retrieving predictions.")
    for prediction_name in iterate_prediction_files("prediction.nc", num_steps):
        yield channel.receive(task_id, prediction_name)
Loading
Loading