From 889e6c81c4b14281e7efed734a4f7cffd20461f7 Mon Sep 17 00:00:00 2001
From: Nicolas Corthorn <nicolas.corthorn@gmail.com>
Date: Fri, 29 Nov 2024 18:06:41 -0800
Subject: [PATCH] Sentiment: Add working test pipeline - Creates a SageMaker
 pipeline with a processing step only - Builds and pushes to ECR the Docker
 image used by that step

---
 .github/workflows/main.yml                    |   4 +-
 .gitignore                                    |   3 +
 Makefile                                      |  13 +-
 dev_requirements.txt                          |   5 +-
 .../sentiment/inference_pipeline/Dockerfile   |  44 ++++
 .../inference_pipeline/build_and_push.sh      |  50 ++++
 .../inference_pipeline/deploy_pipeline.py     |  23 ++
 .../sentiment/inference_pipeline/inference.py | 217 ++++++++++++++++++
 .../inference_pipeline/preprocessing.py       |  97 ++++++++
 .../inference_pipeline/requirements.txt       |   8 +
 .../inference_pipeline/sagemaker_pipeline.py  |  99 ++++++++
 .../inference_pipeline/test_container.sh      |  33 +++
 setup.py                                      |   1 +
 13 files changed, 591 insertions(+), 6 deletions(-)
 create mode 100644 esgtools/sentiment/inference_pipeline/Dockerfile
 create mode 100755 esgtools/sentiment/inference_pipeline/build_and_push.sh
 create mode 100644 esgtools/sentiment/inference_pipeline/deploy_pipeline.py
 create mode 100644 esgtools/sentiment/inference_pipeline/inference.py
 create mode 100644 esgtools/sentiment/inference_pipeline/preprocessing.py
 create mode 100644 esgtools/sentiment/inference_pipeline/requirements.txt
 create mode 100644 esgtools/sentiment/inference_pipeline/sagemaker_pipeline.py
 create mode 100755 esgtools/sentiment/inference_pipeline/test_container.sh

diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml
index bfcbd2c..6344cf2 100644
--- a/.github/workflows/main.yml
+++ b/.github/workflows/main.yml
@@ -43,10 +43,10 @@ jobs:
       env:
         AWS_SAM_STACK_NAME: ${{ secrets.AWS_SAM_STACK_NAME }}
       run: |
-        make sambuild
+        make build
     - name: SAM deploy
       if: success()
       env:
         AWS_SAM_STACK_NAME: ${{ secrets.AWS_SAM_STACK_NAME }}
       run: |
-        make samdeploy
+        make deploy
diff --git a/.gitignore b/.gitignore
index 7180bf0..5ba2d1e 100644
--- a/.gitignore
+++ b/.gitignore
@@ -147,3 +147,6 @@ db/viewer.ipynb
 
 # WRDS solutions
 esgtools/wrds
+
+# Package duplicates
+esgtools/sentiment/inference_pipeline/esgtools
diff --git a/Makefile b/Makefile
index 64845ed..57b068c 100644
--- a/Makefile
+++ b/Makefile
@@ -19,7 +19,7 @@ test:
 
 pre_pr: format lint test
 
-sambuild:
+build:
 	@echo "Cleaning previous build..."
 	rm -rf .aws-sam/build
 
@@ -41,11 +41,11 @@ sambuild:
 	@echo "Size of entire .aws-sam directory:"
 	du -sh .aws-sam
 
-samdeploy-local:
+deploy-local:
 	@echo "Deploying from local samconfig file..."
 	sam deploy --config-file samconfig.toml
 
-samdeploy:
+deploy:
 	@echo "Deploying..."
 	sam deploy \
 		--stack-name sam-app \
@@ -56,3 +56,10 @@ samdeploy:
 		--no-confirm-changeset \
 		--no-fail-on-empty-changeset \
 		--disable-rollback false
+
+.PHONY: build deploy-local deploy push-sentiment-container deploy-sentiment-pipeline
+push-sentiment-container:  # build the sentiment image and push it to ECR
+	bash ./esgtools/sentiment/inference_pipeline/build_and_push.sh
+
+deploy-sentiment-pipeline:  # upsert and start the SageMaker pipeline
+	python ./esgtools/sentiment/inference_pipeline/deploy_pipeline.py
\ No newline at end of file
diff --git a/dev_requirements.txt b/dev_requirements.txt
index fcf346c..11f36ef 100644
--- a/dev_requirements.txt
+++ b/dev_requirements.txt
@@ -8,4 +8,7 @@ pytest==7.4.3
 pytest-cov==4.1.0
 
 # Deployment
-aws-sam-cli==1.127.0
\ No newline at end of file
+aws-sam-cli==1.127.0
+
+# SageMaker
+sagemaker==2.197.1
\ No newline at end of file
diff --git a/esgtools/sentiment/inference_pipeline/Dockerfile b/esgtools/sentiment/inference_pipeline/Dockerfile
new file mode 100644
index 0000000..5096952
--- /dev/null
+++ b/esgtools/sentiment/inference_pipeline/Dockerfile
@@ -0,0 +1,44 @@
+FROM --platform=linux/amd64 nvidia/cuda:11.8.0-base-ubuntu22.04
+
+# Set working directory
+WORKDIR /opt/ml/code
+
+# Install Python and basic dependencies
+RUN apt-get update && apt-get install -y \
+    python3-pip \
+    python3-dev \
+    && rm -rf /var/lib/apt/lists/*
+
+# Create the processing input/output directories and open them to every user
+RUN mkdir -p /opt/ml/processing/input/code && \
+    mkdir -p /opt/ml/processing/output && \
+    chmod -R 777 /opt/ml/processing
+
+# Copy files individually to maintain structure
+COPY esgtools ./esgtools
+COPY setup.py .
+COPY lambda_requirements.txt .
+COPY preprocessing.py .
+COPY requirements.txt .
+
+# Install Python packages from requirements.txt (no pip cache, so downloads are not kept in the layer)
+RUN pip3 install --no-cache-dir -r requirements.txt
+
+# Install the esgtools package in editable mode (setup.py reads lambda_requirements.txt)
+RUN pip3 install --no-cache-dir -e .
+
+# Make both python3 and python available
+RUN ln -sf /usr/bin/python3 /usr/bin/python
+
+# Make everything accessible
+RUN chmod -R 755 /opt/ml/code
+
+# Optional: Print directory contents for debugging
+RUN echo "Contents of /opt/ml/code:" && \
+    ls -la /opt/ml/code && \
+    echo "Python path:" && \
+    python3 -c "import sys; print('\n'.join(sys.path))"
+
+ENV PYTHONPATH=/opt/ml/code
+
+CMD ["python3", "/opt/ml/code/preprocessing.py"]
\ No newline at end of file
diff --git a/esgtools/sentiment/inference_pipeline/build_and_push.sh b/esgtools/sentiment/inference_pipeline/build_and_push.sh
new file mode 100755
index 0000000..4014fc6
--- /dev/null
+++ b/esgtools/sentiment/inference_pipeline/build_and_push.sh
@@ -0,0 +1,50 @@
+#!/bin/bash
+# Stage the files the Dockerfile needs, build the sentiment-inference image and push it to ECR.
+set -euo pipefail
+
+# Get AWS account ID and region using AWS CLI (uses existing credentials)
+AWS_ACCOUNT_ID=$(aws sts get-caller-identity --query Account --output text)
+AWS_REGION=$(aws configure get region)
+
+REPOSITORY_NAME=sentiment-inference  # ECR repository name, also used as the local image name
+
+# Set up directory paths
+REPO_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )/../../../" && pwd )"
+PIPELINE_DIR="${REPO_DIR}/esgtools/sentiment/inference_pipeline"
+
+# Create ECR repository if it doesn't exist
+aws ecr describe-repositories --repository-names "${REPOSITORY_NAME}" || \
+    aws ecr create-repository --repository-name "${REPOSITORY_NAME}"
+
+# Login to ECR (pipefail makes a failed get-login-password abort the script)
+aws ecr get-login-password | docker login --username AWS --password-stdin "${AWS_ACCOUNT_ID}.dkr.ecr.${AWS_REGION}.amazonaws.com"
+
+# Create a temporary build directory; the EXIT trap removes it even if a later step fails
+BUILD_DIR=$(mktemp -d)
+trap 'rm -rf "${BUILD_DIR}"' EXIT
+echo "Created temporary build directory: ${BUILD_DIR}"
+
+# Copy required files to build directory
+echo "Copying files to build directory..."
+cp -r "${REPO_DIR}/esgtools" "${BUILD_DIR}/esgtools"
+cp "${REPO_DIR}/setup.py" "${BUILD_DIR}/setup.py"
+cp "${REPO_DIR}/lambda_requirements.txt" "${BUILD_DIR}/lambda_requirements.txt"
+cp "${PIPELINE_DIR}/Dockerfile" "${BUILD_DIR}/Dockerfile"
+cp "${PIPELINE_DIR}/requirements.txt" "${BUILD_DIR}/requirements.txt"
+cp "${PIPELINE_DIR}/preprocessing.py" "${BUILD_DIR}/preprocessing.py"
+
+# Debug: List contents of build directory
+echo "Contents of build directory:"
+ls -la "${BUILD_DIR}"
+
+# Build and tag the docker image
+echo "Building docker container..."
+docker build --platform linux/amd64 -t "${REPOSITORY_NAME}" "${BUILD_DIR}"
+docker tag "${REPOSITORY_NAME}:latest" "${AWS_ACCOUNT_ID}.dkr.ecr.${AWS_REGION}.amazonaws.com/${REPOSITORY_NAME}:latest"
+
+# Push the image
+echo "Pushing docker container to ECR..."
+docker push "${AWS_ACCOUNT_ID}.dkr.ecr.${AWS_REGION}.amazonaws.com/${REPOSITORY_NAME}:latest"
+
+# The EXIT trap registered after mktemp deletes the build directory when the script ends
+echo "Cleaning up temporary files..."
\ No newline at end of file
diff --git a/esgtools/sentiment/inference_pipeline/deploy_pipeline.py b/esgtools/sentiment/inference_pipeline/deploy_pipeline.py
new file mode 100644
index 0000000..b1b824b
--- /dev/null
+++ b/esgtools/sentiment/inference_pipeline/deploy_pipeline.py
@@ -0,0 +1,23 @@
+import boto3
+import sagemaker
+from sagemaker_pipeline import create_pipeline
+
+
+def deploy():
+    """Upsert the SageMaker pipeline definition and start one execution.
+
+    The execution role ARN is looked up in IAM under the fixed name "SageMaker-DataScientist".
+    """
+    iam = boto3.client("iam")
+    role_arn = iam.get_role(RoleName="SageMaker-DataScientist")["Role"]["Arn"]
+
+    # Create and start the pipeline (create_pipeline builds its own SageMaker session)
+    pipeline = create_pipeline(role_arn=role_arn)
+    pipeline.upsert(role_arn=role_arn)
+    execution = pipeline.start()
+
+    print(f"Pipeline started. Execution ID: {execution.arn}")
+
+
+if __name__ == "__main__":
+    deploy()
diff --git a/esgtools/sentiment/inference_pipeline/inference.py b/esgtools/sentiment/inference_pipeline/inference.py
new file mode 100644
index 0000000..2081506
--- /dev/null
+++ b/esgtools/sentiment/inference_pipeline/inference.py
@@ -0,0 +1,217 @@
+import json
+import os
+
+import pandas as pd
+import torch
+from tqdm import tqdm
+from transformers import AutoModelForCausalLM, AutoTokenizer
+
+from esgtools.domain_models.io import convert_dict_to_sql_params
+from esgtools.utils import aws, sql_manager, utils
+
+
+def remove_non_letters_except_spaces(input_string):  # keeps only ASCII letters and whitespace
+    import re  # function-scope import: this module never imports `re` at file level
+    return re.sub(r"[^a-zA-Z\s]", "", input_string)
+
+
+def create_prompt(headline, snippet):
+    """Build the few-shot prompt that asks for a one-word positive/neutral/negative label for one article."""
+    return f"""
+You are a financial analyst tasked with analyzing news about a specific company. For each news headline and snippet, your job is to determine whether the news is positive, neutral, or negative for the company's future and its stock price in particular. Respond only with one of these three words: "positive", "neutral", or "negative".
+
+Here is the criteria for each label:
+- positive: the news is likely to have a positive impact on the stock price
+- neutral: the news is likely to have little to no impact on the stock price
+- negative: the news is likely to have a negative impact on the stock price
+
+Provide no explanations, code, or additional information—just the single word answer.
+
+Here are some examples:
+
+News Input:
+
+Credit Suisse Profit Rose 36% in Quarter
+The figures beat estimates because costs were lower than expected at the investment bank and revenue was higher.
+
+Answer:
+positive
+
+News Input:
+
+Apple Confirms November Event
+Apple has confirmed it will hold a product launch event on November 1st, but provided no details about what will be announced.
+
+Answer:
+neutral
+
+News Input:
+
+Ford May Produce Its Own Reality TV Show
+Ford is pitching a reality show where aspiring car designers would compete to design the next hot Ford vehicle.
+
+Answer:
+neutral
+
+News Input:
+
+Merck Admits a Data Error on Vioxx
+Merck said that it erred when it reported in early 2005 that a crucial statistical test showed that Vioxx caused heart problems only after 18 months of continuous use.
+
+Answer:
+negative
+
+News Input:
+
+Profit Falls as Sales Rise at Verizon
+Verizon said its profit dipped as it absorbed the costs of integrating MCI and building a fiber optic network designed to deliver television to homes.
+
+Answer:
+negative
+
+
+Now, analyze this new input:
+
+News Input:
+{headline}
+{snippet}
+
+Answer:
+"""
+
+
+def run_sentiment_analysis(nyt_df, model, tokenizer):
+    """
+    Label every row as positive/neutral/negative, retrying replies that are not one of those words.
+
+    Args:
+        nyt_df (pd.DataFrame): Input DataFrame containing 'headline' and 'snippet' columns
+        model: Causal LM whose ``generate`` method produces the completion
+        tokenizer: Encodes the prompt and decodes the reply; its EOS id is also used as the pad id
+
+    Returns:
+        pd.DataFrame: One row per article with id, headline, snippet, output, sentiment, retries
+    """
+    device = "cuda" if torch.cuda.is_available() else "cpu"
+    print(f"Using device: {device}")
+
+    # Valid sentiment labels
+    VALID_SENTIMENTS = {"positive", "neutral", "negative"}
+    MAX_RETRIES = 3
+
+    results = []
+
+    # Process each article with progress bar
+    for idx, row in tqdm(
+        nyt_df.iterrows(), total=len(nyt_df), desc="Processing articles"
+    ):
+        sentiment = None
+        retries = 0
+
+        while sentiment not in VALID_SENTIMENTS and retries < MAX_RETRIES:
+            prompt = create_prompt(row["headline"], row["snippet"])
+
+            # Tokenize the input prompt
+            inputs = tokenizer(prompt, return_tensors="pt").to(device)
+
+            # Sample a short completion: max_new_tokens=4 caps the reply at four new tokens
+            with torch.no_grad():
+                output = model.generate(
+                    **inputs,
+                    max_new_tokens=4,
+                    do_sample=True,
+                    # temperature=0.3,
+                    num_return_sequences=1,
+                    pad_token_id=tokenizer.eos_token_id,
+                    eos_token_id=tokenizer.eos_token_id,
+                )
+
+            # Decode only the newly generated tokens (the prompt tokens are sliced off)
+            generated_tokens = output[0][inputs["input_ids"].shape[1] :]
+            original_response = (
+                tokenizer.decode(generated_tokens, skip_special_tokens=True)
+                .strip()
+                .lower()
+            )
+
+            # Clean up the response
+            # Replace line break
+            response = original_response.replace("\n", " ")
+            # Remove non-letter characters
+            response = remove_non_letters_except_spaces(response)
+            # Remove a leading "answer" (the colon form cannot match: punctuation was stripped above)
+            prefixes_to_remove = ["answer:", "answer"]
+            for prefix in prefixes_to_remove:
+                if response.startswith(prefix):
+                    response = response[len(prefix) :].strip()
+
+            # Extract the first word as sentiment
+            sentiment = response.split()[0] if response else None
+
+            # Validate sentiment
+            if sentiment not in VALID_SENTIMENTS:
+                retries += 1
+                print(
+                    f"\nInvalid response '{response}' for article {idx}. Retry {retries}/{MAX_RETRIES}"
+                )
+
+        # If still invalid after retries, default to 'neutral'
+        if sentiment not in VALID_SENTIMENTS:
+            print(
+                f"\nWarning: Could not get valid sentiment for article {idx} after {MAX_RETRIES} retries. Default to 'neutral'"
+            )
+            sentiment = "neutral"
+
+        results.append(
+            {
+                "id": idx,
+                "headline": row["headline"],
+                "snippet": row["snippet"],
+                "output": original_response,
+                "sentiment": sentiment,
+                "retries": retries,
+            }
+        )
+
+        # Clear the CUDA cache whenever the row label is a multiple of 100
+        if idx % 100 == 0 and device == "cuda":
+            torch.cuda.empty_cache()
+
+    return pd.DataFrame(results)
+
+
+def model_fn(model_dir):
+    """Load Llama-3.1-8B-Instruct and its tokenizer from the Hugging Face Hub; ``model_dir`` is unused."""
+    repo_id = "meta-llama/Llama-3.1-8B-Instruct"
+    hf_token = os.environ.get("HF_TOKEN")  # Will be passed as env variable
+    # `token` is the current keyword; `use_auth_token` is deprecated in transformers
+    tokenizer = AutoTokenizer.from_pretrained(repo_id, token=hf_token)
+    model = AutoModelForCausalLM.from_pretrained(
+        repo_id,
+        torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
+        device_map="auto",
+        token=hf_token,
+    )
+    return model, tokenizer
+
+
+def input_fn(input_data, content_type):
+    """Decode a JSON request body into a DataFrame; any other content type is rejected."""
+    if content_type != "application/json":
+        raise ValueError(f"Unsupported content type: {content_type}")
+    # JSON is the only payload format this handler accepts
+    records = json.loads(input_data)
+    return pd.DataFrame(records)
+
+
+def predict_fn(input_data, model_and_tokenizer):
+    """Unpack the (model, tokenizer) pair and run sentiment analysis over the input frame."""
+    llm, tok = model_and_tokenizer
+    return run_sentiment_analysis(input_data, llm, tok)
+
+
+def output_fn(prediction, accept):
+    """Serialize the prediction frame as a JSON array of row records."""
+    if accept != "application/json":
+        raise ValueError(f"Unsupported accept type: {accept}")
+    return json.dumps(prediction.to_dict(orient="records"))
diff --git a/esgtools/sentiment/inference_pipeline/preprocessing.py b/esgtools/sentiment/inference_pipeline/preprocessing.py
new file mode 100644
index 0000000..14155b4
--- /dev/null
+++ b/esgtools/sentiment/inference_pipeline/preprocessing.py
@@ -0,0 +1,97 @@
+import os
+import sys
+import json
+import argparse
+import logging
+import traceback
+from ast import literal_eval
+
+import pandas as pd
+from esgtools.utils import aws, sql_manager
+from esgtools.domain_models.io import convert_dict_to_sql_params
+
+# Create output directory if it doesn't exist (runs at import time, before any function is called)
+os.makedirs('/opt/ml/processing/output', exist_ok=True)
+
+# Configure logging to write to stdout
+logger = logging.getLogger(__name__)
+logger.setLevel(logging.INFO)
+
+# Clear any existing handlers so the stdout handler below is the only one on this logger
+logger.handlers = []
+
+# Create a stdout handler at INFO, the same level as the logger itself
+console_handler = logging.StreamHandler(sys.stdout)
+console_handler.setLevel(logging.INFO)
+console_formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
+console_handler.setFormatter(console_formatter)
+logger.addHandler(console_handler)
+
+def parse_args():  # CLI: only --region, defaulting to the AWS_DEFAULT_REGION environment variable
+    try:
+        cli = argparse.ArgumentParser()
+        cli.add_argument('--region', type=str, default=os.environ.get('AWS_DEFAULT_REGION'))
+        parsed = cli.parse_args()
+        logger.info(f"Args parsed successfully: {vars(parsed)}")
+    except Exception as exc:
+        logger.error(f"Error parsing args: {str(exc)}")
+        raise
+    return parsed
+
+def fetch_and_prepare_data(region):  # NOTE(review): `region` is never read in this body
+    try:
+        logger.info("Starting fetch_and_prepare_data")
+        logger.info(f"Python version: {sys.version}")
+        logger.info(f"Current directory: {os.getcwd()}")
+        logger.info(f"Directory contents: {os.listdir('.')}")
+        logger.info(f"PYTHONPATH: {os.environ.get('PYTHONPATH', 'Not set')}")
+        
+        # Get database credentials from Secrets Manager (the secret string is parsed with literal_eval)
+        logger.info("Fetching database credentials")
+        sql_params = convert_dict_to_sql_params(literal_eval(aws.get_secret("prod/awsportfolio/key")))
+        logger.info("Successfully retrieved and converted database credentials")
+        
+        # Connect to database and fetch data
+        logger.info("Connecting to database")
+        sql = sql_manager.ManagerSQL(sql_params)
+        
+        year_month = "200605"  # hard-coded month; the only value interpolated into the query
+        
+        logger.info(f"Fetching data for year_month: {year_month}")
+        query = f"""
+        SELECT headline, snippet 
+        FROM nyt_archive 
+        WHERE year_month = '{year_month}'
+        """
+        nyt_data = sql.select_query(query)
+        nyt_data = nyt_data.head()  # NOTE(review): head() keeps only the leading rows (pandas default: 5)
+        logger.info(f"Retrieved {len(nyt_data)} records from database")
+        
+        inference_data = nyt_data[['headline', 'snippet']]
+        
+        # Save to output location as JSONL (one JSON object per line)
+        output_path = os.path.join('/opt/ml/processing/output', "data.jsonl")
+        records_written = 0
+        
+        with open(output_path, 'w') as f:
+            for _, row in inference_data.iterrows():
+                f.write(json.dumps(row.to_dict()) + '\n')
+                records_written += 1
+                
+        logger.info(f"Successfully wrote {records_written} records to {output_path}")
+        
+    except Exception as e:
+        logger.error(f"Error in fetch_and_prepare_data: {str(e)}")
+        logger.error(f"Full traceback: {traceback.format_exc()}")
+        raise
+
+if __name__ == "__main__":
+    try:
+        logger.info("Script started")
+        args = parse_args()
+        logger.info(f"Starting preprocessing script in region: {args.region}")
+        fetch_and_prepare_data(args.region)
+        logger.info("Preprocessing completed successfully")
+    except Exception as e:
+        logger.error(f"Fatal error: {str(e)}")
+        sys.exit(1)
\ No newline at end of file
diff --git a/esgtools/sentiment/inference_pipeline/requirements.txt b/esgtools/sentiment/inference_pipeline/requirements.txt
new file mode 100644
index 0000000..09ff767
--- /dev/null
+++ b/esgtools/sentiment/inference_pipeline/requirements.txt
@@ -0,0 +1,8 @@
+transformers
+torch
+accelerate>=0.26.0
+s3fs
+boto3
+psycopg2-binary
+pandas
+tqdm
\ No newline at end of file
diff --git a/esgtools/sentiment/inference_pipeline/sagemaker_pipeline.py b/esgtools/sentiment/inference_pipeline/sagemaker_pipeline.py
new file mode 100644
index 0000000..fe558cd
--- /dev/null
+++ b/esgtools/sentiment/inference_pipeline/sagemaker_pipeline.py
@@ -0,0 +1,99 @@
+import os
+
+import boto3
+import sagemaker
+from sagemaker.inputs import TransformInput
+from sagemaker.processing import (ProcessingInput, ProcessingOutput,
+                                  ScriptProcessor)
+from sagemaker.transformer import Transformer
+from sagemaker.workflow.pipeline import Pipeline
+from sagemaker.workflow.steps import ProcessingStep, TransformStep
+
+
+def create_pipeline(
+    role_arn,
+    pipeline_name="SentimentAnalysisPipeline",
+    processing_instance_type="ml.m5.xlarge",
+    transform_instance_type="ml.g4dn.xlarge",
+    transform_instance_count=1,
+):
+    """Create a SageMaker pipeline with a single preprocessing step (the batch transform step is commented out)."""
+
+    session = sagemaker.Session()
+
+    # Get the absolute path to preprocessing.py
+    current_dir = os.path.dirname(os.path.abspath(__file__))
+    preprocessing_script = os.path.join(current_dir, "preprocessing.py")
+
+    # Define S3 paths (transform_output_path is only referenced by the commented-out transform step)
+    processing_output_path = f"s3://{session.default_bucket()}/sentiment/data"
+    transform_output_path = f"s3://{session.default_bucket()}/sentiment/output"
+
+    # Get the AWS account ID and region
+    account_id = boto3.client("sts").get_caller_identity().get("Account")
+    region = session.boto_region_name
+
+    # Use the custom sentiment-inference image from this account's ECR registry
+    processing_image = (
+        f"{account_id}.dkr.ecr.{region}.amazonaws.com/sentiment-inference:latest"
+    )
+
+    # Data Processing Step
+    processor = ScriptProcessor(
+        command=["python3"],
+        image_uri=processing_image,
+        role=role_arn,
+        instance_count=1,
+        instance_type=processing_instance_type,
+        base_job_name="sentiment-preprocess",
+        env={
+            "AWS_DEFAULT_REGION": region,
+            "PYTHONPATH": "/opt/ml/code",
+            "PYTHONUNBUFFERED": "1",
+        },
+        volume_size_in_gb=30,
+        max_runtime_in_seconds=1200,
+        sagemaker_session=session,
+    )
+
+    # Configure processing step: /opt/ml/processing/output is uploaded to processing_output_path
+    processing_step = ProcessingStep(
+        name="PreprocessData",
+        processor=processor,
+        outputs=[
+            ProcessingOutput(
+                output_name="data",
+                source="/opt/ml/processing/output",
+                destination=processing_output_path,
+            )
+        ],
+        code=preprocessing_script,
+        job_arguments=["--region", region],
+    )
+
+    # Batch Transform Step (disabled; the transform_* parameters are unused until it is restored)
+    # transformer = Transformer(
+    #     model_name="sentiment-analysis-model",
+    #     instance_count=transform_instance_count,
+    #     instance_type=transform_instance_type,
+    #     output_path=transform_output_path,
+    #     sagemaker_session=session,
+    # )
+
+    # transform_step = TransformStep(
+    #     name="SentimentAnalysis",
+    #     transformer=transformer,
+    #     inputs=TransformInput(
+    #         data=processing_output_path + "/data.jsonl",
+    #         content_type="application/jsonlines",
+    #     ),
+    # )
+
+    # Create and return the pipeline
+    pipeline = Pipeline(
+        name=pipeline_name,
+        steps=[processing_step],  # transform_step
+        sagemaker_session=session,
+    )
+
+    return pipeline
diff --git a/esgtools/sentiment/inference_pipeline/test_container.sh b/esgtools/sentiment/inference_pipeline/test_container.sh
new file mode 100755
index 0000000..c02b2e2
--- /dev/null
+++ b/esgtools/sentiment/inference_pipeline/test_container.sh
@@ -0,0 +1,33 @@
+#!/bin/bash
+set -e
+
+# Directory setup
+REPO_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )/../../../" && pwd )"
+PIPELINE_DIR="${REPO_DIR}/esgtools/sentiment/inference_pipeline"
+REGION="$(aws configure get region)"
+
+# Create temporary test directories; the EXIT trap removes them even if the container run fails
+TEST_DIR="/tmp/sagemaker-test"
+trap 'rm -rf "${TEST_DIR}"' EXIT
+mkdir -p "${TEST_DIR}/input/code"
+mkdir -p "${TEST_DIR}/output"
+
+# Copy files to test directory
+cp "${PIPELINE_DIR}/preprocessing.py" "${TEST_DIR}/input/code/"
+
+echo "Creating test environment..."
+echo "Test directory contents:"
+ls -la "${TEST_DIR}"
+
+# Run the container locally; AWS_DEFAULT_REGION mirrors the variable the pipeline sets for the job
+docker run --rm \
+  -v "${TEST_DIR}/input:/opt/ml/processing/input" \
+  -v "${TEST_DIR}/output:/opt/ml/processing/output" \
+  -v "${PIPELINE_DIR}/esgtools:/opt/ml/code/esgtools" \
+  -e AWS_REGION="${REGION}" \
+  -e AWS_DEFAULT_REGION="${REGION}" \
+  -e AWS_ACCESS_KEY_ID="$(aws configure get aws_access_key_id)" \
+  -e AWS_SECRET_ACCESS_KEY="$(aws configure get aws_secret_access_key)" \
+  -e AWS_SESSION_TOKEN="$(aws configure get aws_session_token)" \
+  sentiment-inference \
+  python3 /opt/ml/processing/input/code/preprocessing.py --region "${REGION}"
\ No newline at end of file
diff --git a/setup.py b/setup.py
index 847b0b6..aac8c12 100644
--- a/setup.py
+++ b/setup.py
@@ -12,6 +12,7 @@ def read_requirements(filename):
 
 if __name__ == "__main__":
     setup(
+        name="esgtools",
         packages=find_packages(include=['esgtools', 'esgtools.*']),
         install_requires=read_requirements("lambda_requirements.txt")
     )
\ No newline at end of file