Add README with detailed walkthrough, scripts for pipeline deployment.
thvasilo committed Jan 10, 2025
1 parent b64d7ec commit 899479f
Showing 8 changed files with 871 additions and 29 deletions.
9 changes: 6 additions & 3 deletions .github/workflow_scripts/lint_check.sh
@@ -1,9 +1,11 @@
-# Move to parent directory
-cd ../../
-
+#!/usr/bin/env bash
+set -ex
+
+# Move to repo root
+cd ../../

pip install pylint==2.17.5

pylint --rcfile=./tests/lint/pylintrc ./python/graphstorm/*.py
pylint --rcfile=./tests/lint/pylintrc ./python/graphstorm/data/*.py
pylint --rcfile=./tests/lint/pylintrc ./python/graphstorm/distributed/
@@ -21,3 +23,4 @@ pylint --rcfile=./tests/lint/pylintrc ./python/graphstorm/utils.py
pylint --rcfile=./tests/lint/pylintrc ./tools/convert_feat_to_wholegraph.py

pylint --rcfile=./tests/lint/pylintrc ./python/graphstorm/sagemaker/
+pylint --rcfile=./tests/lint/pylintrc ./examples/sagemaker-pipelines-graphbolt/ --recursive y
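
A note on running the updated script locally: it starts by changing to the repo root, so it is meant to be launched from its own directory. A minimal sketch, assuming the repository layout shown above:

cd .github/workflow_scripts
bash lint_check.sh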
10 changes: 5 additions & 5 deletions examples/sagemaker-pipelines-graphbolt/Dockerfile.processing
@@ -4,7 +4,7 @@ FROM public.ecr.aws/ubuntu/ubuntu:22.04
ENV DEBIAN_FRONTEND=noninteractive

# Install Python and other dependencies
-RUN apt update && apt install -y \
+RUN apt-get update && apt-get install -y \
axel \
curl \
python3 \
@@ -13,9 +13,9 @@ RUN apt update && apt install -y \
unzip \
&& rm -rf /var/lib/apt/lists/*


# Copy and install ripunzip
COPY ripunzip_2.0.0-1_amd64.deb ripunzip_2.0.0-1_amd64.deb
-RUN apt install -y ./ripunzip_2.0.0-1_amd64.deb
+RUN apt-get install -y ./ripunzip_2.0.0-1_amd64.deb

RUN python3 -m pip install --no-cache-dir --upgrade pip==24.3.1 && \
python3 -m pip install --no-cache-dir \
@@ -25,14 +25,14 @@ RUN python3 -m pip install --no-cache-dir --upgrade pip==24.3.1 && \
tqdm==4.67.1 \
tqdm-loggable==0.2

# Install aws cli
RUN curl "https://awscli.amazonaws.com/awscli-exe-linux-x86_64.zip" -o "awscliv2.zip" \
&& unzip awscliv2.zip \
&& ./aws/install


# Copy processing scripts
COPY process_papers100M.sh /opt/ml/code/
-COPY convert_ogb_papers100M_to_gconstruct.py /opt/ml/code/
+COPY convert_ogb_papers100m_to_gconstruct.py /opt/ml/code/

WORKDIR /opt/ml/code/

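To build this image locally, the following mirrors what the build-and-push script further down does; the ripunzip .deb must be present in the build context before the COPY instruction can find it:

# fetch the ripunzip package the Dockerfile copies in
curl -L -O https://github.com/google/ripunzip/releases/download/v2.0.0/ripunzip_2.0.0-1_amd64.deb
# build with the same tag the build script uses (IMAGE=papers100m-processor)
docker build -f Dockerfile.processing -t papers100m-processor .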
531 changes: 531 additions & 0 deletions examples/sagemaker-pipelines-graphbolt/README.md

Large diffs are not rendered by default.

17 changes: 1 addition & 16 deletions examples/sagemaker-pipelines-graphbolt/analyze_training_time.py
@@ -28,6 +28,7 @@


def parse_args():
"""Parse log analysis args."""
parser = argparse.ArgumentParser(
description="Analyze training epoch and eval time."
)
@@ -259,17 +260,9 @@ def print_training_summary(
if epochs_data:
total_epochs = len(epochs_data)
avg_time = sum(e["time"] for e in epochs_data) / total_epochs
-min_time = min(epochs_data, key=lambda x: x["time"])
-max_time = max(epochs_data, key=lambda x: x["time"])

print(f"Total epochs completed: {total_epochs}")
print(f"Average epoch time: {avg_time:.2f} seconds")
-print(
-f"Fastest epoch: Epoch {min_time['epoch']} ({min_time['time']:.2f} seconds)"
-)
-print(
-f"Slowest epoch: Epoch {max_time['epoch']} ({max_time['time']:.2f} seconds)"
-)

if verbose:
print("\nEpoch Details:")
@@ -283,17 +276,9 @@
if eval_data:
total_evals = len(eval_data)
avg_eval_time = sum(e["time"] for e in eval_data) / total_evals
-min_eval = min(eval_data, key=lambda x: x["time"])
-max_eval = max(eval_data, key=lambda x: x["time"])

print(f"Total evaluations: {total_evals}")
print(f"Average evaluation time: {avg_eval_time:.2f} seconds")
-print(
-f"Fastest evaluation: Step {min_eval['step']} ({min_eval['time']:.2f} seconds)"
-)
-print(
-f"Slowest evaluation: Step {max_eval['step']} ({max_eval['time']:.2f} seconds)"
-)

if verbose:
print("\nEvaluation Details:")
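Since the script builds its CLI with argparse, the accepted options can be listed directly; no flags are assumed here beyond the auto-generated help:

python3 analyze_training_time.py --help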
@@ -9,11 +9,49 @@ cleanup() {
}


-ACCOUNT=$(aws sts get-caller-identity --query Account --output text)
-REGION=$(aws configure get region)
-REGION=${REGION:-us-east-1}
+die() {
+local msg=$1
+local code=${2-1} # default exit status 1
+msg "$msg"
+exit "$code"
+}
+
+parse_params() {
+# default values of variables set from params
+ACCOUNT=$(aws sts get-caller-identity --query Account --output text || true)
+REGION=$(aws configure get region || true)
+REGION=${REGION:-"us-east-1"}
+
+while :; do
+case "${1-}" in
+-h | --help) usage ;;
+-x | --verbose) set -x ;;
+-a | --account)
+ACCOUNT="${2-}"
+shift
+;;
+-r | --region)
+REGION="${2-}"
+shift
+;;
+-?*) die "Unknown option: $1" ;;
+*) break ;;
+esac
+shift
+done
+
+# check required params and arguments
+[[ -z "${ACCOUNT-}" ]] && die "Missing required parameter: -a/--account <aws-account-id>"
+[[ -z "${REGION-}" ]] && die "Missing required parameter: -r/--region <aws-region>"
+
+return 0
+}
+
+parse_params "$@"

IMAGE=papers100m-processor

# Download ripunzip to copy to image
curl -L -O https://github.com/google/ripunzip/releases/download/v2.0.0/ripunzip_2.0.0-1_amd64.deb

# Auth to AWS public ECR gallery
@@ -22,7 +60,6 @@ aws ecr-public get-login-password --region $REGION | docker login --username AWS
# Build and tag image
docker build -f Dockerfile.processing -t $IMAGE .


# Create repository if it doesn't exist
echo "Getting or creating container repository: $IMAGE"
if ! aws ecr describe-repositories --repository-names $IMAGE --region ${REGION} > /dev/null 2>&1; then
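A hypothetical invocation of this build-and-push script, using the -a/--account and -r/--region options its parse_params defines (the script filename and argument values below are placeholders, since the file header is not rendered here):

# filename and values are placeholders
bash build_and_push.sh --account 123456789012 --region us-west-2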
@@ -1,3 +1,20 @@
"""
Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License").
You may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
Convert papers100M data and prepare for input to GConstruct
"""
import argparse
import gzip
import json
@@ -10,13 +27,14 @@
import pyarrow as pa
import pyarrow.dataset as ds
import pyarrow.parquet as pq
-import pyarrow.fs as fs
+from pyarrow import fs
from tqdm_loggable.auto import tqdm

# pylint: disable=logging-fstring-interpolation


def parse_args():
"""Parse conversion arguments."""
parser = argparse.ArgumentParser(
description="Convert raw OGB papers-100M data to GConstruct format"
)
129 changes: 129 additions & 0 deletions examples/sagemaker-pipelines-graphbolt/deploy_arxiv_pipeline.sh
@@ -0,0 +1,129 @@
#!/usr/bin/env bash
set -euo pipefail

SCRIPT_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" &>/dev/null && pwd -P)

msg() {
echo >&2 -e "${1-}"
}

die() {
local msg=$1
local code=${2-1} # default exit status 1
msg "$msg"
exit "$code"
}

# Print usage and exit; parse_params below calls this for -h/--help
usage() {
msg "Usage: $(basename "$0") -b <s3-bucket> -r <execution-role-arn> -g <true|false> [-a <aws-account-id>] [-n <pipeline-name>] [-x]"
exit 0
}

parse_params() {
# default values of variables set from params
ACCOUNT=$(aws sts get-caller-identity --query Account --output text || true)
REGION=$(aws configure get region || true)
REGION=${REGION:-"us-east-1"}
PIPELINE_NAME=""


while :; do
case "${1-}" in
-h | --help) usage ;;
-x | --verbose) set -x ;;
-r | --role)
ROLE="${2-}"
shift
;;
-a | --account)
ACCOUNT="${2-}"
shift
;;
-b | --bucket)
BUCKET_NAME="${2-}"
shift
;;
-n | --pipeline-name)
PIPELINE_NAME="${2-}"
shift
;;
-g | --use-graphbolt)
USE_GRAPHBOLT="${2-}"
shift
;;
-?*) die "Unknown option: $1" ;;
*) break ;;
esac
shift
done

# check required params and arguments
[[ -z "${ACCOUNT-}" ]] && die "Missing required parameter: -a/--account <aws-account-id>"
[[ -z "${BUCKET-}" ]] && die "Missing required parameter: -b/--bucket <s3-bucket>"
[[ -z "${ROLE-}" ]] && die "Missing required parameter: -r/--role <execution-role-arn>"
[[ -z "${USE_GRAPHBOLT-}" ]] && die "Missing required parameter: -g/--use-graphbolt <true|false>"

return 0
}

cleanup() {
trap - SIGINT SIGTERM ERR EXIT
# script cleanup here
}

parse_params "$@"

DATASET_S3_PATH="s3://${BUCKET_NAME}/ogb-arxiv-input"
OUTPUT_PATH="s3://${BUCKET_NAME}/pipelines-output"
GRAPH_NAME="ogbn-arxiv"
INSTANCE_COUNT="2"
REGION="us-east-1"
NUM_TRAINERS=4

PARTITION_OUTPUT_JSON="$GRAPH_NAME.json"
PARTITION_ALGORITHM="metis"
GCONSTRUCT_INSTANCE="ml.r5.4xlarge"
GCONSTRUCT_CONFIG="gconstruct_config_arxiv.json"

TRAIN_CPU_INSTANCE="ml.m5.4xlarge"
TRAIN_YAML_S3="s3://$BUCKET_NAME/yaml/arxiv_nc_train.yaml"
INFERENCE_YAML_S3="s3://$BUCKET_NAME/yaml/arxiv_nc_inference.yaml"

TASK_TYPE="node_classification"
INFERENCE_MODEL_SNAPSHOT="epoch-9"

JOBS_TO_RUN="gconstruct train inference"
GSF_CPU_IMAGE_URI=${ACCOUNT}.dkr.ecr.$REGION.amazonaws.com/graphstorm:sagemaker-cpu
GSF_GPU_IMAGE_URI=${ACCOUNT}.dkr.ecr.$REGION.amazonaws.com/graphstorm:sagemaker-gpu
VOLUME_SIZE=50

if [[ -z "${PIPELINE_NAME-}" ]]; then
if [[ $USE_GRAPHBOLT == "true" ]]; then
PIPELINE_NAME="ogbn-arxiv-gs-graphbolt-pipeline"
else
PIPELINE_NAME="ogbn-arxiv-gs-pipeline"
fi
fi

python3 $SCRIPT_DIR/../../sagemaker/pipeline/create_sm_pipeline.py \
--cpu-instance-type ${TRAIN_CPU_INSTANCE} \
--graph-construction-args "--num-processes 8" \
--graph-construction-instance-type ${GCONSTRUCT_INSTANCE} \
--graph-construction-config-filename ${GCONSTRUCT_CONFIG} \
--graph-name ${GRAPH_NAME} \
--graphstorm-pytorch-cpu-image-url "${GSF_CPU_IMAGE_URI}" \
--graphstorm-pytorch-gpu-image-url "${GSF_GPU_IMAGE_URI}" \
--inference-model-snapshot "${INFERENCE_MODEL_SNAPSHOT}" \
--inference-yaml-s3 ${INFERENCE_YAML_S3} \
--input-data-s3 ${DATASET_S3_PATH} \
--instance-count ${INSTANCE_COUNT} \
--jobs-to-run ${JOBS_TO_RUN} \
--num-trainers ${NUM_TRAINERS} \
--output-prefix-s3 "${OUTPUT_PATH}" \
--pipeline-name "${PIPELINE_NAME}" \
--partition-output-json ${PARTITION_OUTPUT_JSON} \
--partition-algorithm ${PARTITION_ALGORITHM} \
--region ${REGION} \
--role "${ROLE}" \
--train-on-cpu \
--train-inference-task ${TASK_TYPE} \
--train-yaml-s3 "${TRAIN_YAML_S3}" \
--save-embeddings \
--save-predictions \
--volume-size-gb ${VOLUME_SIZE} \
--use-graphbolt "${USE_GRAPHBOLT}"
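
For reference, a sample invocation using the options defined in this script's parse_params; the account ID, bucket, and role ARN below are placeholders:

bash deploy_arxiv_pipeline.sh \
--account 123456789012 \
--bucket my-graphstorm-bucket \
--role arn:aws:iam::123456789012:role/SageMakerExecutionRole \
--use-graphbolt true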