testing branches #1

Open · wants to merge 47 commits into base: master

47 commits
ace390c
Implement stateful inference session timeout (#3263)
namannandan Jul 23, 2024
e5db414
Leave response and sendError when request is canceled (#3267)
slashvar Jul 31, 2024
3233f44
remove compile note for hpu (#3271)
RafLit Aug 1, 2024
24e2492
squeezenet torch.compile example (#3277)
wdvr Aug 2, 2024
3f40180
Use Case: Enhancing LLM Serving with Torch Compiled RAG on AWS Gravit…
agunapal Aug 2, 2024
04f1e6a
doc update of the blog (#3280)
agunapal Aug 5, 2024
b24c72d
Add some hints for java devs (#3282)
mreso Aug 7, 2024
30eb13d
add kserve gpu tests (#3283)
rohithkrn Aug 12, 2024
ef196c0
Configurable startup time (#3262)
Isalia20 Aug 12, 2024
aa9bb73
Add REPO_URL in Dockerfile to allow docker builds from contributor re…
mreso Aug 20, 2024
690509c
upgrade to PyTorch 2.4 (#3289)
agunapal Aug 21, 2024
8df7f3d
add TorchServe with Intel® Extension for PyTorch* guidance (#3285)
jingxu10 Aug 21, 2024
179cc4c
Replace git:// with https:// in repo url (#3293)
mreso Aug 21, 2024
0901d4b
Fix docker ci repo_url (#3294)
mreso Aug 21, 2024
87e9c35
Fix/docker repo url3 (#3297)
mreso Aug 22, 2024
391ee4c
Remove debug step (#3298)
mreso Aug 22, 2024
db1a003
Feature add openai api for vllm integration (#3287)
mreso Aug 23, 2024
a2ba1c7
Update quickstart llm docker in serve/readme; added ts.llm_launcher e…
mreso Aug 24, 2024
640b406
typo fixes in HF Transformers example (#3307)
EFord36 Sep 9, 2024
87c9823
Adding Graviton Regression test CI (#3273)
udaij12 Sep 10, 2024
d6ea6e7
Fix wild card in extra files (#3304)
mreso Sep 11, 2024
048aa53
Set vllm multiproc method to spawn (#3310)
mreso Sep 13, 2024
15952d0
Example to demonstrate building a custom endpoint plugin (#3306)
namannandan Sep 13, 2024
d0d38ad
Benchmark fix (#3316)
udaij12 Sep 14, 2024
c698fe0
docs: update WaveGlow links (#3317)
emmanuel-ferdman Sep 16, 2024
06d2d57
Fix typo: "a asynchronous" -> "an asynchronous" (#3314)
tadayosi Sep 16, 2024
646862e
Update TS version to 0.12.0 (#3318)
agunapal Sep 16, 2024
ba8c268
adding graviton docker image release (#3313)
udaij12 Sep 17, 2024
2dfbff7
Fixing kserve nightly for arm64 (#3319)
udaij12 Sep 17, 2024
d5e10de
TRT LLM Integration with LORA (#3305)
agunapal Sep 17, 2024
c6dde82
Bump vllm from 0.5.0 to 0.5.5 in /examples/large_models/vllm (#3321)
dependabot[bot] Sep 17, 2024
e212294
Use startup time in async worker thread instead of worker timeout (#3…
mreso Sep 18, 2024
c585bc7
Docker aarch (#3323)
udaij12 Sep 20, 2024
7161c6f
Clear up neuron cache (#3326)
chen3933 Sep 20, 2024
37a533b
Fix dockerfile fore renamed forks (#3327)
mreso Sep 20, 2024
d993070
Fix typo: vesion -> version, succsesfully -> successfully (#3322)
tadayosi Sep 21, 2024
6881ec5
Load .tar.gz on `load_models=all` (#3329)
m10an Sep 23, 2024
c161926
fix for snapshot variables missing/null (#3328)
udaij12 Sep 23, 2024
6bdb1ba
Rename vllm dockerfile (#3330)
mreso Sep 24, 2024
9d10087
set model_snapshot_path to None to prevent unbound local error (#3336)
johnathanchiu Oct 3, 2024
e8879c1
Create export path if not present (#3331)
mreso Oct 3, 2024
f4fbcbe
Bump onnx from 1.16.0 to 1.17.0 in /requirements (#3354)
dependabot[bot] Oct 23, 2024
55c2f6b
Adding Multi-Image generation usecase app (#3356)
ravi9 Nov 23, 2024
3182443
Update README.md (#3368)
ravi9 Nov 25, 2024
0985386
update prompt template (#3372)
ravi9 Dec 5, 2024
9bcbd22
740 add generic support for different gpu hardware (#3371)
jakki-amd Dec 19, 2024
1a3b18b
Add Apple system metrics support (#3377)
jakki-amd Dec 20, 2024
4 changes: 2 additions & 2 deletions .github/workflows/benchmark_nightly.yml
@@ -55,7 +55,7 @@ jobs:
           NEURON_RT_NUM_CORES: 1
         run: python benchmarks/auto_benchmark.py --input benchmarks/benchmark_config_neuronx.yaml --skip false
       - name: Save benchmark artifacts
-        uses: actions/upload-artifact@v2
+        uses: actions/upload-artifact@v4
         with:
           name: nightly ${{ matrix.hardware }} artifact
           path: /tmp/ts_benchmark
@@ -72,7 +72,7 @@ jobs:
       - name: Update benchmark artifacts for auto validation
         run: python benchmarks/utils/update_artifacts.py --output /tmp/ts_artifacts/${{ matrix.hardware }}_benchmark_validation
       - name: Upload the updated benchmark artifacts for auto validation
-        uses: actions/upload-artifact@v2
+        uses: actions/upload-artifact@v4
         with:
           name: ${{ matrix.hardware }}_benchmark_validation
           path: /tmp/ts_artifacts
48 changes: 48 additions & 0 deletions .github/workflows/ci_graviton_cpu.yml
@@ -0,0 +1,48 @@
+name: CI CPU Graviton
+
+on:
+  workflow_dispatch:
+  push:
+    branches:
+      - master
+  pull_request:
+    branches:
+      - master
+  merge_group:
+
+
+concurrency:
+  group: ci-cpu-${{ github.workflow }}-${{ github.ref == 'refs/heads/master' && github.run_number || github.ref }}
+  cancel-in-progress: true
+
+jobs:
+  ci-cpu:
+    runs-on: [self-hosted, graviton-test]
+    steps:
+      - name: Setup Python
+        uses: actions/setup-python@v5
+        with:
+          python-version: '3.10'
+          architecture: arm64
+      - name: Setup Java 17
+        uses: actions/setup-java@v3
+        with:
+          distribution: 'zulu'
+          java-version: '17'
+      - name: Checkout TorchServe
+        uses: actions/checkout@v3
+        with:
+          submodules: recursive
+      - name: Install dependencies
+        run: |
+          python ts_scripts/install_dependencies.py --environment=dev
+      - name: Torchserve Sanity
+        uses: nick-fields/retry@v3
+        env:
+          TS_MAC_ARM64_CPU_ONLY: 'True'
+        with:
+          timeout_minutes: 60
+          max_attempts: 3
+          retry_on: error
+          command: |
+            python torchserve_sanity.py
6 changes: 5 additions & 1 deletion .github/workflows/docker-ci.yaml
@@ -39,7 +39,11 @@ jobs:
         working-directory: docker
         run: |
           IMAGE_TAG=test-image-${{ matrix.python-version }}
-          ./build_image.sh -py "${{ matrix.python-version }}" -t "${IMAGE_TAG}" -b ${{ steps.branch-name.outputs.GITHUB_BRANCH }} -s
+          REPO_URL="${{ github.event.pull_request.head.repo.clone_url }}"
+          if [[ -z "${REPO_URL}" ]]; then
+            REPO_URL="https://github.com/pytorch/serve.git"
+          fi
+          ./build_image.sh -py "${{ matrix.python-version }}" -t "${IMAGE_TAG}" -b "${{ steps.branch-name.outputs.GITHUB_BRANCH }}" -repo ${REPO_URL} -s
           echo "IMAGE_TAG=${IMAGE_TAG}" >> $GITHUB_OUTPUT
 
       - name: Container Healthcheck
7 changes: 5 additions & 2 deletions .github/workflows/docker-nightly-build.yml
@@ -1,10 +1,11 @@
 name: Push Docker Nightly
 
 on:
-  # run every day at 1:15pm
+  # Run every day at 1:15pm
   schedule:
     - cron: "15 13 * * *"
+  workflow_dispatch:
 
 jobs:
   nightly:
     runs-on: [self-hosted, ci-gpu]
@@ -32,12 +33,14 @@ jobs:
       - name: Push Docker Nightly
         run: |
           cd docker
+          sudo apt-get update
+          docker buildx use multibuilder
           python docker_nightly.py --cleanup
       - name: Push KServe Docker Nightly
         run: |
           cd kubernetes/kserve
+          docker buildx use multibuilder
           python docker_nightly.py --cleanup
-
       - name: Open issue on failure
         if: ${{ failure() && github.event_name == 'schedule' }}
         uses: dacbd/create-issue-action@v1
2 changes: 1 addition & 1 deletion .github/workflows/kserve_cpu_tests.yml
@@ -42,4 +42,4 @@ jobs:
           ref: v0.12.1
           path: kserve
       - name: Validate torchserve-kfs and Open Inference Protocol
-        run: ./kubernetes/kserve/tests/scripts/test_mnist.sh
+        run: ./kubernetes/kserve/tests/scripts/test_mnist.sh cpu
45 changes: 45 additions & 0 deletions .github/workflows/kserve_gpu_tests.yml
@@ -0,0 +1,45 @@
+name: KServe GPU Nightly Tests
+
+on:
+  workflow_dispatch:
+  # runs everyday at 5:15am
+  schedule:
+    - cron: '15 5 * * *'
+
+jobs:
+  kserve-gpu-tests:
+    runs-on: [self-hosted, regression-test-gpu]
+    steps:
+      - name: Clean up previous run
+        run: |
+          echo "Cleaning up previous run"
+          ls -la ./
+          sudo rm -rf ./* || true
+          sudo rm -rf ./.??* || true
+          ls -la ./
+      - name: Install minikube and kubectl
+        run: |
+          curl -LO https://storage.googleapis.com/minikube/releases/latest/minikube-linux-amd64
+          sudo install minikube-linux-amd64 /usr/local/bin/minikube
+          curl -LO "https://dl.k8s.io/release/$(curl -L -s https://dl.k8s.io/release/stable.txt)/bin/linux/amd64/kubectl"
+          sudo install -o root -g root -m 0755 kubectl /usr/local/bin/kubectl
+          echo "/usr/local/bin" >> $GITHUB_PATH
+      - name: Setup Python 3.9
+        uses: actions/setup-python@v5
+        with:
+          python-version: 3.9
+          architecture: x64
+      - name: Install grpcurl
+        run: |
+          sudo curl -sSL https://github.com/fullstorydev/grpcurl/releases/download/v1.8.0/grpcurl_1.8.0_linux_x86_64.tar.gz | sudo tar -xz -C /usr/local/bin grpcurl
+          sudo chmod +x /usr/local/bin/grpcurl
+      - name: Checkout TorchServe
+        uses: actions/checkout@v3
+      - name: Checkout kserve repo
+        uses: actions/checkout@v4
+        with:
+          repository: kserve/kserve
+          ref: v0.12.1
+          path: kserve
+      - name: Validate torchserve-kfs and Open Inference Protocol
+        run: ./kubernetes/kserve/tests/scripts/test_mnist.sh gpu
2 changes: 2 additions & 0 deletions .github/workflows/official_release_docker.yml
@@ -42,9 +42,11 @@ jobs:
         if: github.event.inputs.upload_docker == 'yes'
         run: |
           cd docker
+          docker buildx use multibuilder
           python build_upload_release.py --cleanup
       - name: Build & Upload pytorch/torchserve-kfs Docker images
         if: github.event.inputs.upload_kfs == 'yes'
         run: |
           cd kubernetes/kserve
+          docker buildx use multibuilder
           python build_upload_release.py --cleanup
41 changes: 41 additions & 0 deletions .github/workflows/regression_tests_graviton_cpu.yml
@@ -0,0 +1,41 @@
+name: Run Regression Tests on CPU for Graviton
+
+on:
+  push:
+    branches:
+      - master
+  pull_request:
+    branches:
+      - master
+  merge_group:
+
+concurrency:
+  group: ci-cpu-${{ github.workflow }}-${{ github.ref == 'refs/heads/master' && github.run_number || github.ref }}
+  cancel-in-progress: true
+
+jobs:
+  regression-cpu:
+    runs-on: [self-hosted, graviton-test]
+    steps:
+      - name: Setup Python
+        uses: actions/setup-python@v5
+        with:
+          python-version: '3.10'
+          architecture: arm64
+      - name: Setup Java 17
+        uses: actions/setup-java@v3
+        with:
+          distribution: 'zulu'
+          java-version: '17'
+      - name: Checkout TorchServe
+        uses: actions/checkout@v3
+        with:
+          submodules: recursive
+      - name: Install dependencies
+        run: |
+          python ts_scripts/install_dependencies.py --environment=dev
+      - name: Torchserve Regression Tests
+        env:
+          TS_MAC_ARM64_CPU_ONLY: 'True'
+        run: |
+          python test/regression_tests.py
7 changes: 7 additions & 0 deletions .gitignore
@@ -30,6 +30,7 @@ test/model_store/
 test/ts_console.log
 test/config.properties
 
+model-store-local/
 
 .vscode
 .scratch/
@@ -45,3 +46,9 @@ instances.yaml.backup
 # cpp
 cpp/_build
 cpp/third-party
+
+# projects
+.tool-versions
+**/*/.classpath
+**/*/.settings
+**/*/.project
43 changes: 18 additions & 25 deletions CONTRIBUTING.md
@@ -11,18 +11,7 @@ Your contributions will fall into two categories:
 - Search for your issue here: https://github.com/pytorch/serve/issues (look for the "good first issue" tag if you're a first time contributor)
 - Pick an issue and comment on the task that you want to work on this feature.
 - To ensure your changes doesn't break any of the existing features run the sanity suite as follows from serve directory:
-    - Install dependencies (if not already installed)
-      For CPU
-
-        ```bash
-        python ts_scripts/install_dependencies.py --environment=dev
-        ```
-
-      For GPU
-        ```bash
-        python ts_scripts/install_dependencies.py --environment=dev --cuda=cu121
-        ```
-        > Supported cuda versions as cu121, cu118, cu117, cu116, cu113, cu111, cu102, cu101, cu92
+    - [Install dependencies](#Install-TorchServe-for-development) (if not already installed)
     - Install `pre-commit` to your Git flow:
     ```bash
     pre-commit install
@@ -60,26 +49,30 @@ pytest -k test/pytest/test_mnist_template.py
 
 If you plan to develop with TorchServe and change some source code, you must install it from source code.
 
-Ensure that you have `python3` installed, and the user has access to the site-packages or `~/.local/bin` is added to the `PATH` environment variable.
+1. Clone the repository, including third-party modules, with `git clone --recurse-submodules --remote-submodules [email protected]:pytorch/serve.git`
+2. Ensure that you have `python3` installed, and the user has access to the site-packages or `~/.local/bin` is added to the `PATH` environment variable.
+3. Run the following script from the top of the source directory. NOTE: This script force re-installs `torchserve`, `torch-model-archiver` and `torch-workflow-archiver` if existing installations are found
 
-Run the following script from the top of the source directory.
+#### For Debian Based Systems/MacOS
 
-NOTE: This script force re-installs `torchserve`, `torch-model-archiver` and `torch-workflow-archiver` if existing installations are found
+```
+python ./ts_scripts/install_dependencies.py --environment=dev
+python ./ts_scripts/install_from_src.py --environment=dev
+```
+##### Installing Dependencies for Accelerator Support
+Use the optional `--rocm` or `--cuda` flag with `install_dependencies.py` for installing accelerator specific dependencies.
 
-#### For Debian Based Systems/ MacOS
-
-```
-python ./ts_scripts/install_dependencies.py --environment=dev
-python ./ts_scripts/install_from_src.py --environment=dev
-```
+Possible values are
+- rocm: `rocm61`, `rocm60`
+- cuda: `cu111`, `cu102`, `cu101`, `cu92`
 
-Use `--cuda` flag with `install_dependencies.py` for installing cuda version specific dependencies. Possible values are `cu111`, `cu102`, `cu101`, `cu92`
+For example `python ./ts_scripts/install_dependencies.py --environment=dev --rocm=rocm61`
 
-#### For Windows
+#### For Windows
 
-Refer to the documentation [here](docs/torchserve_on_win_native.md).
+Refer to the documentation [here](docs/torchserve_on_win_native.md).
 
-For information about the model archiver, see [detailed documentation](model-archiver/README.md).
+For information about the model archiver, see [detailed documentation](model-archiver/README.md).
 
 ### What to Contribute?
 
38 changes: 33 additions & 5 deletions README.md
@@ -22,7 +22,10 @@ curl http://127.0.0.1:8080/predictions/bert -T input.txt
 
 ```bash
 # Install dependencies
-# cuda is optional
 python ./ts_scripts/install_dependencies.py
+
+# Include dependencies for accelerator support with the relevant optional flags
+python ./ts_scripts/install_dependencies.py --rocm=rocm61
+python ./ts_scripts/install_dependencies.py --cuda=cu121
 
 # Latest release
@@ -36,7 +39,10 @@ pip install torchserve-nightly torch-model-archiver-nightly torch-workflow-archi
 
 ```bash
 # Install dependencies
-# cuda is optional
 python ./ts_scripts/install_dependencies.py
+
+# Include depeendencies for accelerator support with the relevant optional flags
+python ./ts_scripts/install_dependencies.py --rocm=rocm61
+python ./ts_scripts/install_dependencies.py --cuda=cu121
 
 # Latest release
@@ -62,13 +68,35 @@ Refer to [torchserve docker](docker/README.md) for details.
 
 ### 🤖 Quick Start LLM Deployment
 
+#### VLLM Engine
 ```bash
+# Make sure to install torchserve with pip or conda as described above and login with `huggingface-cli login`
+python -m ts.llm_launcher --model_id meta-llama/Llama-3.2-3B-Instruct --disable_token_auth
 
+# Try it out
+curl -X POST -d '{"model":"meta-llama/Llama-3.2-3B-Instruct", "prompt":"Hello, my name is", "max_tokens": 200}' --header "Content-Type: application/json" "http://localhost:8080/predictions/model/1.0/v1/completions"
 ```
+
+#### TRT-LLM Engine
+```bash
+# Make sure to install torchserve with python venv as described above and login with `huggingface-cli login`
+# pip install -U --use-deprecated=legacy-resolver -r requirements/trt_llm.txt
+python -m ts.llm_launcher --model_id meta-llama/Meta-Llama-3.1-8B-Instruct --engine trt_llm --disable_token_auth
+
+# Try it out
+curl -X POST -d '{"prompt":"count from 1 to 9 in french ", "max_tokens": 100}' --header "Content-Type: application/json" "http://localhost:8080/predictions/model"
+```
 
 ### 🚢 Quick Start LLM Deployment with Docker
 
 ```bash
+#export token=<HUGGINGFACE_HUB_TOKEN>
-docker build --pull . -f docker/Dockerfile.llm -t ts/llm
+docker build --pull . -f docker/Dockerfile.vllm -t ts/vllm
 
-docker run --rm -ti --shm-size 10g --gpus all -e HUGGING_FACE_HUB_TOKEN=$token -p 8080:8080 -v data:/data ts/llm --model_id meta-llama/Meta-Llama-3-8B-Instruct --disable_token_auth
+docker run --rm -ti --shm-size 10g --gpus all -e HUGGING_FACE_HUB_TOKEN=$token -p 8080:8080 -v data:/data ts/vllm --model_id meta-llama/Meta-Llama-3-8B-Instruct --disable_token_auth
 
-curl -X POST -d '{"prompt":"Hello, my name is", "max_new_tokens": 50}' --header "Content-Type: application/json" "http://localhost:8080/predictions/model"
+# Try it out
+curl -X POST -d '{"model":"meta-llama/Meta-Llama-3-8B-Instruct", "prompt":"Hello, my name is", "max_tokens": 200}' --header "Content-Type: application/json" "http://localhost:8080/predictions/model/1.0/v1/completions"
 ```
 
 Refer to [LLM deployment](docs/llm_deployment.md) for details and other methods.
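The completions route added in the README changes above is plain HTTP + JSON, so it can be called from any client, not just curl. Below is a minimal Python sketch using `requests`, with the endpoint path and payload copied from the curl examples; the locally running `ts.llm_launcher` server and the model name are assumptions.

```python
# Minimal sketch of the OpenAI-style completions call shown in the README diff.
# Assumes TorchServe was started locally via:
#   python -m ts.llm_launcher --model_id meta-llama/Llama-3.2-3B-Instruct --disable_token_auth
import requests

url = "http://localhost:8080/predictions/model/1.0/v1/completions"
payload = {
    "model": "meta-llama/Llama-3.2-3B-Instruct",  # the model_id passed to ts.llm_launcher
    "prompt": "Hello, my name is",
    "max_tokens": 200,
}

# The route speaks plain HTTP + JSON, so a single POST is enough.
response = requests.post(url, json=payload, timeout=120)
response.raise_for_status()
print(response.json())
```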
26 changes: 26 additions & 0 deletions benchmarks/utils/system_under_test.py
@@ -113,6 +113,7 @@ def start(self):
         execute("torchserve --stop", wait=True)
         click.secho("*Setting up model store...", fg="green")
         self._prepare_local_dependency()
+        self._clear_neuron_cache_if_exists()
         click.secho("*Starting local Torchserve instance...", fg="green")
 
         ts_cmd = (
@@ -141,6 +142,31 @@ def start(self):
             if "Model server started" in str(line).strip():
                 break
 
+    def _clear_neuron_cache_if_exists(self):
+        cache_dir = "/var/tmp/neuron-compile-cache/"
+
+        # Check if the directory exists
+        if os.path.exists(cache_dir) and os.path.isdir(cache_dir):
+            click.secho(
+                f"Directory {cache_dir} exists. Clearing contents...", fg="green"
+            )
+
+            # Remove the directory contents
+            for filename in os.listdir(cache_dir):
+                file_path = os.path.join(cache_dir, filename)
+                try:
+                    if os.path.isfile(file_path) or os.path.islink(file_path):
+                        os.unlink(file_path)
+                    elif os.path.isdir(file_path):
+                        shutil.rmtree(file_path)
+                except Exception as e:
+                    click.secho(f"Failed to delete {file_path}. Reason: {e}", fg="red")
+            click.secho(f"Cache cleared: {cache_dir}", fg="green")
+        else:
+            click.secho(
+                f"Directory {cache_dir} does not exist. No action taken.", fg="green"
+            )
+
     def stop(self):
         click.secho("*Terminating Torchserve instance...", fg="green")
         execute("torchserve --stop", wait=True)
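The `_clear_neuron_cache_if_exists` helper added above empties the Neuron compile cache directory without deleting the directory itself. The same pattern, extracted as a standalone sketch with its required imports (the default path mirrors the diff; the function name and the use of `print` instead of `click.secho` are illustrative):

```python
# Sketch of the directory-emptying pattern used by _clear_neuron_cache_if_exists:
# clears the contents of a directory while keeping the directory itself in place.
import os
import shutil


def clear_dir_contents(cache_dir: str = "/var/tmp/neuron-compile-cache/") -> None:
    if not (os.path.exists(cache_dir) and os.path.isdir(cache_dir)):
        print(f"Directory {cache_dir} does not exist. No action taken.")
        return
    for filename in os.listdir(cache_dir):
        file_path = os.path.join(cache_dir, filename)
        try:
            # Files and symlinks are unlinked; subdirectories are removed recursively.
            if os.path.isfile(file_path) or os.path.islink(file_path):
                os.unlink(file_path)
            elif os.path.isdir(file_path):
                shutil.rmtree(file_path)
        except Exception as e:
            print(f"Failed to delete {file_path}. Reason: {e}")
    print(f"Cache cleared: {cache_dir}")
```

Removing only the contents, rather than calling `shutil.rmtree` on the cache directory itself, avoids breaking processes that expect the directory to exist.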