Add README with detailed walkthrough, scripts for pipeline deployment.
thvasilo committed Jan 10, 2025
1 parent b64d7ec commit 899479f
Showing 8 changed files with 871 additions and 29 deletions.
9 changes: 6 additions & 3 deletions .github/workflow_scripts/lint_check.sh
@@ -1,9 +1,11 @@
-# Move to parent directory
-cd ../../
-
+#!/usr/bin/env bash
+set -ex
+
+# Move to repo root
+cd ../../

pip install pylint==2.17.5

pylint --rcfile=./tests/lint/pylintrc ./python/graphstorm/*.py
pylint --rcfile=./tests/lint/pylintrc ./python/graphstorm/data/*.py
pylint --rcfile=./tests/lint/pylintrc ./python/graphstorm/distributed/
@@ -21,3 +23,4 @@ pylint --rcfile=./tests/lint/pylintrc ./python/graphstorm/utils.py
pylint --rcfile=./tests/lint/pylintrc ./tools/convert_feat_to_wholegraph.py

pylint --rcfile=./tests/lint/pylintrc ./python/graphstorm/sagemaker/
+pylint --rcfile=./tests/lint/pylintrc ./examples/sagemaker-pipelines-graphbolt/ --recursive y
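
A note on running the updated script locally: it starts by changing to the repo root, so it is meant to be launched from its own directory. A minimal sketch, assuming the repository layout shown above:

cd .github/workflow_scripts
bash lint_check.sh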
10 changes: 5 additions & 5 deletions examples/sagemaker-pipelines-graphbolt/Dockerfile.processing
@@ -4,7 +4,7 @@ FROM public.ecr.aws/ubuntu/ubuntu:22.04
ENV DEBIAN_FRONTEND=noninteractive

# Install Python and other dependencies
-RUN apt update && apt install -y \
+RUN apt-get update && apt-get install -y \
axel \
curl \
python3 \
@@ -13,9 +13,9 @@ RUN apt update && apt install -y \
unzip \
&& rm -rf /var/lib/apt/lists/*


# Copy and install ripunzip
COPY ripunzip_2.0.0-1_amd64.deb ripunzip_2.0.0-1_amd64.deb
-RUN apt install -y ./ripunzip_2.0.0-1_amd64.deb
+RUN apt-get install -y ./ripunzip_2.0.0-1_amd64.deb

RUN python3 -m pip install --no-cache-dir --upgrade pip==24.3.1 && \
python3 -m pip install --no-cache-dir \
@@ -25,14 +25,14 @@ RUN python3 -m pip install --no-cache-dir --upgrade pip==24.3.1 && \
tqdm==4.67.1 \
tqdm-loggable==0.2

# Install aws cli
RUN curl "https://awscli.amazonaws.com/awscli-exe-linux-x86_64.zip" -o "awscliv2.zip" \
&& unzip awscliv2.zip \
&& ./aws/install


# Copy processing scripts
COPY process_papers100M.sh /opt/ml/code/
-COPY convert_ogb_papers100M_to_gconstruct.py /opt/ml/code/
+COPY convert_ogb_papers100m_to_gconstruct.py /opt/ml/code/

WORKDIR /opt/ml/code/

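To build this image locally, the following mirrors what the build-and-push script further down does; the ripunzip .deb must be present in the build context before the COPY instruction can find it:

# fetch the ripunzip package the Dockerfile copies in
curl -L -O https://github.com/google/ripunzip/releases/download/v2.0.0/ripunzip_2.0.0-1_amd64.deb
# build with the same tag the build script uses (IMAGE=papers100m-processor)
docker build -f Dockerfile.processing -t papers100m-processor .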
531 changes: 531 additions & 0 deletions examples/sagemaker-pipelines-graphbolt/README.md

Large diffs are not rendered by default.

17 changes: 1 addition & 16 deletions examples/sagemaker-pipelines-graphbolt/analyze_training_time.py
@@ -28,6 +28,7 @@


def parse_args():
"""Parse log analysis args."""
parser = argparse.ArgumentParser(
description="Analyze training epoch and eval time."
)
@@ -259,17 +260,9 @@ def print_training_summary(
if epochs_data:
total_epochs = len(epochs_data)
avg_time = sum(e["time"] for e in epochs_data) / total_epochs
-min_time = min(epochs_data, key=lambda x: x["time"])
-max_time = max(epochs_data, key=lambda x: x["time"])

print(f"Total epochs completed: {total_epochs}")
print(f"Average epoch time: {avg_time:.2f} seconds")
-print(
-f"Fastest epoch: Epoch {min_time['epoch']} ({min_time['time']:.2f} seconds)"
-)
-print(
-f"Slowest epoch: Epoch {max_time['epoch']} ({max_time['time']:.2f} seconds)"
-)

if verbose:
print("\nEpoch Details:")
@@ -283,17 +276,9 @@
if eval_data:
total_evals = len(eval_data)
avg_eval_time = sum(e["time"] for e in eval_data) / total_evals
-min_eval = min(eval_data, key=lambda x: x["time"])
-max_eval = max(eval_data, key=lambda x: x["time"])

print(f"Total evaluations: {total_evals}")
print(f"Average evaluation time: {avg_eval_time:.2f} seconds")
-print(
-f"Fastest evaluation: Step {min_eval['step']} ({min_eval['time']:.2f} seconds)"
-)
-print(
-f"Slowest evaluation: Step {max_eval['step']} ({max_eval['time']:.2f} seconds)"
-)

if verbose:
print("\nEvaluation Details:")
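Since the script builds its CLI with argparse, the accepted options can be listed directly; no flags are assumed here beyond the auto-generated help:

python3 analyze_training_time.py --help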
@@ -9,11 +9,49 @@ cleanup() {
}


-ACCOUNT=$(aws sts get-caller-identity --query Account --output text)
-REGION=$(aws configure get region)
-REGION=${REGION:-us-east-1}
+die() {
+local msg=$1
+local code=${2-1} # default exit status 1
+msg "$msg"
+exit "$code"
+}
+
+parse_params() {
+# default values of variables set from params
+ACCOUNT=$(aws sts get-caller-identity --query Account --output text || true)
+REGION=$(aws configure get region || true)
+REGION=${REGION:-"us-east-1"}
+
+while :; do
+case "${1-}" in
+-h | --help) usage ;;
+-x | --verbose) set -x ;;
+-a | --account)
+ACCOUNT="${2-}"
+shift
+;;
+-r | --region)
+REGION="${2-}"
+shift
+;;
+-?*) die "Unknown option: $1" ;;
+*) break ;;
+esac
+shift
+done
+
+# check required params and arguments
+[[ -z "${ACCOUNT-}" ]] && die "Missing required parameter: -a/--account <aws-account-id>"
+[[ -z "${REGION-}" ]] && die "Missing required parameter: -r/--region <aws-region>"
+
+return 0
+}
+
+parse_params "$@"

IMAGE=papers100m-processor

# Download ripunzip to copy to image
curl -L -O https://github.com/google/ripunzip/releases/download/v2.0.0/ripunzip_2.0.0-1_amd64.deb

# Auth to AWS public ECR gallery
@@ -22,7 +60,6 @@ aws ecr-public get-login-password --region $REGION | docker login --username AWS
# Build and tag image
docker build -f Dockerfile.processing -t $IMAGE .


# Create repository if it doesn't exist
echo "Getting or creating container repository: $IMAGE"
if ! aws ecr describe-repositories --repository-names $IMAGE --region ${REGION} > /dev/null 2>&1; then
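A hypothetical invocation of this build-and-push script, using the -a/--account and -r/--region options its parse_params defines (the script filename and argument values below are placeholders, since the file header is not rendered here):

# filename and values are placeholders
bash build_and_push.sh --account 123456789012 --region us-west-2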
@@ -1,3 +1,20 @@
"""
Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License").
You may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
Convert papers100M data and prepare for input to GConstruct
"""
import argparse
import gzip
import json
@@ -10,13 +27,14 @@
import pyarrow as pa
import pyarrow.dataset as ds
import pyarrow.parquet as pq
-import pyarrow.fs as fs
+from pyarrow import fs
from tqdm_loggable.auto import tqdm

# pylint: disable=logging-fstring-interpolation


def parse_args():
"""Parse conversion arguments."""
parser = argparse.ArgumentParser(
description="Convert raw OGB papers-100M data to GConstruct format"
)
129 changes: 129 additions & 0 deletions examples/sagemaker-pipelines-graphbolt/deploy_arxiv_pipeline.sh
@@ -0,0 +1,129 @@
#!/usr/bin/env bash
set -euo pipefail

SCRIPT_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" &>/dev/null && pwd -P)

msg() {
echo >&2 -e "${1-}"
}

die() {
local msg=$1
local code=${2-1} # default exit status 1
msg "$msg"
exit "$code"
}

# Print usage and exit; parse_params below calls this for -h/--help
usage() {
msg "Usage: $(basename "$0") -b <s3-bucket> -r <execution-role-arn> -g <true|false> [-a <aws-account-id>] [-n <pipeline-name>] [-x]"
exit 0
}

parse_params() {
# default values of variables set from params
ACCOUNT=$(aws sts get-caller-identity --query Account --output text || true)
REGION=$(aws configure get region || true)
REGION=${REGION:-"us-east-1"}
PIPELINE_NAME=""


while :; do
case "${1-}" in
-h | --help) usage ;;
-x | --verbose) set -x ;;
-r | --role)
ROLE="${2-}"
shift
;;
-a | --account)
ACCOUNT="${2-}"
shift
;;
-b | --bucket)
BUCKET_NAME="${2-}"
shift
;;
-n | --pipeline-name)
PIPELINE_NAME="${2-}"
shift
;;
-g | --use-graphbolt)
USE_GRAPHBOLT="${2-}"
shift
;;
-?*) die "Unknown option: $1" ;;
*) break ;;
esac
shift
done

# check required params and arguments
[[ -z "${ACCOUNT-}" ]] && die "Missing required parameter: -a/--account <aws-account-id>"
[[ -z "${BUCKET-}" ]] && die "Missing required parameter: -b/--bucket <s3-bucket>"
[[ -z "${ROLE-}" ]] && die "Missing required parameter: -r/--role <execution-role-arn>"
[[ -z "${USE_GRAPHBOLT-}" ]] && die "Missing required parameter: -g/--use-graphbolt <true|false>"

return 0
}

cleanup() {
trap - SIGINT SIGTERM ERR EXIT
# script cleanup here
}

parse_params "$@"

DATASET_S3_PATH="s3://${BUCKET_NAME}/ogb-arxiv-input"
OUTPUT_PATH="s3://${BUCKET_NAME}/pipelines-output"
GRAPH_NAME="ogbn-arxiv"
INSTANCE_COUNT="2"
REGION="us-east-1"
NUM_TRAINERS=4

PARTITION_OUTPUT_JSON="$GRAPH_NAME.json"
PARTITION_ALGORITHM="metis"
GCONSTRUCT_INSTANCE="ml.r5.4xlarge"
GCONSTRUCT_CONFIG="gconstruct_config_arxiv.json"

TRAIN_CPU_INSTANCE="ml.m5.4xlarge"
TRAIN_YAML_S3="s3://$BUCKET_NAME/yaml/arxiv_nc_train.yaml"
INFERENCE_YAML_S3="s3://$BUCKET_NAME/yaml/arxiv_nc_inference.yaml"

TASK_TYPE="node_classification"
INFERENCE_MODEL_SNAPSHOT="epoch-9"

JOBS_TO_RUN="gconstruct train inference"
GSF_CPU_IMAGE_URI=${ACCOUNT}.dkr.ecr.$REGION.amazonaws.com/graphstorm:sagemaker-cpu
GSF_GPU_IMAGE_URI=${ACCOUNT}.dkr.ecr.$REGION.amazonaws.com/graphstorm:sagemaker-gpu
VOLUME_SIZE=50

if [[ -z "${PIPELINE_NAME-}" ]]; then
if [[ $USE_GRAPHBOLT == "true" ]]; then
PIPELINE_NAME="ogbn-arxiv-gs-graphbolt-pipeline"
else
PIPELINE_NAME="ogbn-arxiv-gs-pipeline"
fi
fi

python3 $SCRIPT_DIR/../../sagemaker/pipeline/create_sm_pipeline.py \
--cpu-instance-type ${TRAIN_CPU_INSTANCE} \
--graph-construction-args "--num-processes 8" \
--graph-construction-instance-type ${GCONSTRUCT_INSTANCE} \
--graph-construction-config-filename ${GCONSTRUCT_CONFIG} \
--graph-name ${GRAPH_NAME} \
--graphstorm-pytorch-cpu-image-url "${GSF_CPU_IMAGE_URI}" \
--graphstorm-pytorch-gpu-image-url "${GSF_GPU_IMAGE_URI}" \
--inference-model-snapshot "${INFERENCE_MODEL_SNAPSHOT}" \
--inference-yaml-s3 ${INFERENCE_YAML_S3} \
--input-data-s3 ${DATASET_S3_PATH} \
--instance-count ${INSTANCE_COUNT} \
--jobs-to-run ${JOBS_TO_RUN} \
--num-trainers ${NUM_TRAINERS} \
--output-prefix-s3 "${OUTPUT_PATH}" \
--pipeline-name "${PIPELINE_NAME}" \
--partition-output-json ${PARTITION_OUTPUT_JSON} \
--partition-algorithm ${PARTITION_ALGORITHM} \
--region ${REGION} \
--role "${ROLE}" \
--train-on-cpu \
--train-inference-task ${TASK_TYPE} \
--train-yaml-s3 "${TRAIN_YAML_S3}" \
--save-embeddings \
--save-predictions \
--volume-size-gb ${VOLUME_SIZE} \
--use-graphbolt "${USE_GRAPHBOLT}"
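
For reference, a sample invocation using the options defined in this script's parse_params; the account ID, bucket, and role ARN below are placeholders:

bash deploy_arxiv_pipeline.sh \
--account 123456789012 \
--bucket my-graphstorm-bucket \
--role arn:aws:iam::123456789012:role/SageMakerExecutionRole \
--use-graphbolt true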