Sentiment: Add working test pipeline

- Creates a SageMaker pipeline with a processing step only
- Builds and pushes a Docker image to ECR, used by the pipeline's processing step

nico-corthorn committed Nov 30, 2024
1 parent c6d6392 commit 889e6c8

Showing 13 changed files with 591 additions and 6 deletions.
4 changes: 2 additions & 2 deletions .github/workflows/main.yml
@@ -43,10 +43,10 @@ jobs:
        env:
          AWS_SAM_STACK_NAME: ${{ secrets.AWS_SAM_STACK_NAME }}
        run: |
-          make sambuild
+          make build
      - name: SAM deploy
        if: success()
        env:
          AWS_SAM_STACK_NAME: ${{ secrets.AWS_SAM_STACK_NAME }}
        run: |
-          make samdeploy
+          make deploy
3 changes: 3 additions & 0 deletions .gitignore
@@ -147,3 +147,6 @@ db/viewer.ipynb

# WRDS solutions
esgtools/wrds
+
+# Package duplicates
+esgtools/sentiment/inference_pipeline/esgtools
13 changes: 10 additions & 3 deletions Makefile
@@ -19,7 +19,7 @@ test:

pre_pr: format lint test

-sambuild:
+build:
	@echo "Cleaning previous build..."
	rm -rf .aws-sam/build

@@ -41,11 +41,11 @@ sambuild:
	@echo "Size of entire .aws-sam directory:"
	du -sh .aws-sam

-samdeploy-local:
+deploy-local:
	@echo "Deploying from local samconfig file..."
	sam deploy --config-file samconfig.toml

-samdeploy:
+deploy:
	@echo "Deploying..."
	sam deploy \
		--stack-name sam-app \
@@ -56,3 +56,10 @@ samdeploy:
		--no-confirm-changeset \
		--no-fail-on-empty-changeset \
		--disable-rollback false
+
+push-sentiment-container:
+	chmod +x ./esgtools/sentiment/inference_pipeline/build_and_push.sh
+	./esgtools/sentiment/inference_pipeline/build_and_push.sh
+
+deploy-sentiment-pipeline:
+	python ./esgtools/sentiment/inference_pipeline/deploy_pipeline.py
5 changes: 4 additions & 1 deletion dev_requirements.txt
@@ -8,4 +8,7 @@ pytest==7.4.3
pytest-cov==4.1.0

# Deployment
-aws-sam-cli==1.127.0
+aws-sam-cli==1.127.0
+
+# SageMaker
+sagemaker==2.197.1
44 changes: 44 additions & 0 deletions esgtools/sentiment/inference_pipeline/Dockerfile
@@ -0,0 +1,44 @@
FROM --platform=linux/amd64 nvidia/cuda:11.8.0-base-ubuntu22.04

# Set working directory
WORKDIR /opt/ml/code

# Install Python and basic dependencies
RUN apt-get update && apt-get install -y \
    python3-pip \
    python3-dev \
    && rm -rf /var/lib/apt/lists/*

# Create necessary directories
RUN mkdir -p /opt/ml/processing/input/code && \
    mkdir -p /opt/ml/processing/output && \
    chmod -R 777 /opt/ml/processing

# Copy files individually to maintain structure
COPY esgtools ./esgtools
COPY setup.py .
COPY lambda_requirements.txt .
COPY preprocessing.py .
COPY requirements.txt .

# Install Python packages from requirements.txt
RUN pip3 install -r requirements.txt

# Install the package
RUN pip3 install -e .

# Make both python3 and python available
RUN ln -sf /usr/bin/python3 /usr/bin/python

# Make everything accessible
RUN chmod -R 755 /opt/ml/code

# Optional: Print directory contents for debugging
RUN echo "Contents of /opt/ml/code:" && \
    ls -la /opt/ml/code && \
    echo "Python path:" && \
    python3 -c "import sys; print('\n'.join(sys.path))"

ENV PYTHONPATH=/opt/ml/code

CMD ["python3", "/opt/ml/code/preprocessing.py"]
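
The image's CMD runs /opt/ml/code/preprocessing.py, which build_and_push.sh copies into the build context but which is not shown in this excerpt. For orientation only, a minimal sketch of a processing entrypoint that could run in this container, assuming the standard SageMaker processing input/output paths; everything in it is an assumption, not the committed script:

# Hypothetical sketch -- not the preprocessing.py committed here.
# Assumes the standard SageMaker processing container layout.
import json
from pathlib import Path

INPUT_DIR = Path("/opt/ml/processing/input")
OUTPUT_DIR = Path("/opt/ml/processing/output")


def main():
    OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

    # Placeholder work: list the input files the job received so a pipeline
    # run can be verified end to end before real inference logic is added.
    inputs = [str(p) for p in INPUT_DIR.rglob("*") if p.is_file()]
    (OUTPUT_DIR / "manifest.json").write_text(json.dumps({"inputs": inputs}, indent=2))
    print(f"Wrote manifest with {len(inputs)} input files to {OUTPUT_DIR}")


if __name__ == "__main__":
    main()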
50 changes: 50 additions & 0 deletions esgtools/sentiment/inference_pipeline/build_and_push.sh
@@ -0,0 +1,50 @@
#!/bin/bash
set -e

# Get AWS account ID and region using AWS CLI (uses existing credentials)
AWS_ACCOUNT_ID=$(aws sts get-caller-identity --query Account --output text)
AWS_REGION=$(aws configure get region)

# Repository name
REPOSITORY_NAME=sentiment-inference

# Set up directory paths
REPO_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )/../../../" && pwd )"
PIPELINE_DIR="${REPO_DIR}/esgtools/sentiment/inference_pipeline"

# Create ECR repository if it doesn't exist
aws ecr describe-repositories --repository-names ${REPOSITORY_NAME} || \
    aws ecr create-repository --repository-name ${REPOSITORY_NAME}

# Login to ECR
aws ecr get-login-password | docker login --username AWS --password-stdin "${AWS_ACCOUNT_ID}.dkr.ecr.${AWS_REGION}.amazonaws.com"

# Create a temporary build directory
BUILD_DIR=$(mktemp -d)
echo "Created temporary build directory: ${BUILD_DIR}"

# Copy required files to build directory
echo "Copying files to build directory..."
cp -r "${REPO_DIR}/esgtools" "${BUILD_DIR}/esgtools"
cp "${REPO_DIR}/setup.py" "${BUILD_DIR}/setup.py"
cp "${REPO_DIR}/lambda_requirements.txt" "${BUILD_DIR}/lambda_requirements.txt"
cp "${PIPELINE_DIR}/Dockerfile" "${BUILD_DIR}/Dockerfile"
cp "${PIPELINE_DIR}/requirements.txt" "${BUILD_DIR}/requirements.txt"
cp "${PIPELINE_DIR}/preprocessing.py" "${BUILD_DIR}/preprocessing.py"

# Debug: List contents of build directory
echo "Contents of build directory:"
ls -la "${BUILD_DIR}"

# Build and tag the docker image
echo "Building docker container..."
docker build --platform linux/amd64 -t ${REPOSITORY_NAME} "${BUILD_DIR}"
docker tag "${REPOSITORY_NAME}:latest" "${AWS_ACCOUNT_ID}.dkr.ecr.${AWS_REGION}.amazonaws.com/${REPOSITORY_NAME}:latest"

# Push the image
echo "Pushing docker container to ECR..."
docker push "${AWS_ACCOUNT_ID}.dkr.ecr.${AWS_REGION}.amazonaws.com/${REPOSITORY_NAME}:latest"

# Clean up
echo "Cleaning up temporary files..."
rm -rf "${BUILD_DIR}"
23 changes: 23 additions & 0 deletions esgtools/sentiment/inference_pipeline/deploy_pipeline.py
@@ -0,0 +1,23 @@
import boto3
import sagemaker
from sagemaker_pipeline import create_pipeline


def deploy():
    """Deploy the SageMaker pipeline."""
    session = sagemaker.Session()

    # Get role ARN using boto3 (uses existing credentials)
    iam = boto3.client("iam")
    role_arn = iam.get_role(RoleName="SageMaker-DataScientist")["Role"]["Arn"]

    # Create and start the pipeline
    pipeline = create_pipeline(role_arn=role_arn)
    pipeline.upsert(role_arn=role_arn)
    execution = pipeline.start()

    print(f"Pipeline started. Execution ID: {execution.arn}")


if __name__ == "__main__":
    deploy()
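
deploy_pipeline.py imports create_pipeline from sagemaker_pipeline, one of the 13 changed files that is not shown in this excerpt. A minimal sketch of the shape such a factory could take, assuming the sentiment-inference ECR repository pushed by build_and_push.sh; the pipeline name, step name, and instance settings below are assumptions, not the committed code:

# Hypothetical sketch of sagemaker_pipeline.create_pipeline -- not the committed module.
import boto3
from sagemaker.processing import Processor
from sagemaker.workflow.pipeline import Pipeline
from sagemaker.workflow.steps import ProcessingStep


def create_pipeline(role_arn, image_uri=None, pipeline_name="sentiment-inference-pipeline"):
    """Build a pipeline whose single step runs the sentiment-inference container."""
    if image_uri is None:
        # Reassemble the ECR URI that build_and_push.sh pushes (repository name assumed).
        account = boto3.client("sts").get_caller_identity()["Account"]
        region = boto3.Session().region_name
        image_uri = f"{account}.dkr.ecr.{region}.amazonaws.com/sentiment-inference:latest"

    processor = Processor(
        role=role_arn,
        image_uri=image_uri,
        instance_count=1,
        instance_type="ml.m5.xlarge",  # assumed instance type
    )

    # The container's CMD does the work, so no inputs/outputs are wired up yet.
    step = ProcessingStep(name="SentimentPreprocessing", processor=processor)

    return Pipeline(name=pipeline_name, steps=[step])

With something of this shape in place, pipeline.upsert(...) and pipeline.start() in deploy() above register and launch the pipeline.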