Skip to content

Commit

Permalink
Create nightly job for torch.compile benchmarks (#2835)
Browse files Browse the repository at this point in the history
* Create nightly job for torch.compile benchmarks

* Run with torch nightly

* Add 300 second timeout in ab command

* Retrigger tests

* Add support for nightly PyTorch in auto_benchmark.py

* Retrigger tests

* Retrigger tests

* Retrigger tests

* Retrigger tests

* Retrigger tests

* Retrigger tests

* Retrigger tests

* Retrigger tests

* Retrigger tests

* Retrigger tests

* Remove auto-validation of benchmark results from workflow

* Retrigger tests

* Revert changes in auto_benchmark.py and remove push trigger in workflow

---------

Co-authored-by: Ubuntu <[email protected]>
  • Loading branch information
sachanub and Ubuntu authored Dec 9, 2023
1 parent b368468 commit 356704a
Show file tree
Hide file tree
Showing 7 changed files with 237 additions and 8 deletions.
43 changes: 43 additions & 0 deletions .github/workflows/benchmark_torch_compile_nightly.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
# Nightly benchmark of torch.compile models on a self-hosted GPU runner.
# NOTE(review): leading indentation was lost in extraction; restored to
# standard GitHub Actions structure — confirm against the repo file.
name: Benchmark torch.compile models nightly

on:
  # run every day at 9:15pm (UTC)
  schedule:
    - cron: '15 21 * * *'

jobs:
  nightly:
    strategy:
      fail-fast: false
    runs-on: [self-hosted, gpu]
    # 22 hours — the full model sweep is long-running
    timeout-minutes: 1320
    steps:
      - name: Clean up previous run
        run: |
          echo "Cleaning up previous run"
          cd $RUNNER_WORKSPACE
          pwd
          cd ..
          pwd
          rm -rf _tool
      - name: Setup Python 3.8
        uses: actions/setup-python@v4
        with:
          python-version: 3.8
          architecture: x64
      - name: Setup Java 17
        uses: actions/setup-java@v3
        with:
          distribution: 'zulu'
          java-version: '17'
      - name: Checkout TorchServe
        uses: actions/checkout@v3
        with:
          submodules: recursive
      - name: Install dependencies
        run: |
          sudo apt-get update -y
          sudo apt-get install -y apache2-utils
          pip install -r benchmarks/requirements-ab.txt
      # --nightly True installs the nightly torch build before benchmarking
      - name: Benchmark gpu nightly
        run: python benchmarks/auto_benchmark.py --input benchmarks/benchmark_config_torch_compile_gpu.yaml --skip false --nightly True
24 changes: 18 additions & 6 deletions benchmarks/auto_benchmark.py
Original file line number Diff line number Diff line change
Expand Up @@ -122,14 +122,16 @@ def load_benchmark_config(bm_config_path, skip_ts_install, skip_upload):
return benchmark_config.bm_config


def benchmark_env_setup(bm_config, skip_ts_install):
install_torchserve(skip_ts_install, bm_config["hardware"], bm_config["version"])
def benchmark_env_setup(bm_config, skip_ts_install, nightly):
install_torchserve(
skip_ts_install, bm_config["hardware"], bm_config["version"], nightly
)
setup_benchmark_path(bm_config["model_config_path"])
build_model_json_config(bm_config["models"])
enable_launcher_with_logical_core(bm_config["hardware"])


def install_torchserve(skip_ts_install, hw, ts_version):
def install_torchserve(skip_ts_install, hw, ts_version, nightly):
if skip_ts_install:
return

Expand All @@ -154,6 +156,8 @@ def install_torchserve(skip_ts_install, hw, ts_version):
cmd = "python ts_scripts/install_dependencies.py --environment dev --neuronx"
else:
cmd = "python ts_scripts/install_dependencies.py --environment dev"
if nightly:
cmd += " --nightly_torch"
execute(cmd, wait=True)
print("successfully install install_dependencies.py")

Expand Down Expand Up @@ -290,9 +294,12 @@ def main():
)
parser.add_argument(
"--skip_upload",
help="true: skip uploading commands . default: false",
help="true: skip uploading commands. default: false",
)
parser.add_argument(
"--nightly",
help="true: install nightly version of torch package. default: false",
)

arguments = parser.parse_args()
skip_ts_config = (
False
Expand All @@ -304,8 +311,13 @@ def main():
if arguments.skip_upload is not None and arguments.skip_upload.lower() == "true"
else False
)
nightly = (
True
if arguments.nightly is not None and arguments.nightly.lower() == "true"
else False
)
bm_config = load_benchmark_config(arguments.input, skip_ts_config, skip_upload)
benchmark_env_setup(bm_config, skip_ts_config)
benchmark_env_setup(bm_config, skip_ts_config, nightly)
run_benchmark(bm_config)
clean_up_benchmark_env(bm_config)
print("benchmark_serving.sh finished successfully.")
Expand Down
4 changes: 2 additions & 2 deletions benchmarks/benchmark-ab.py
Original file line number Diff line number Diff line change
Expand Up @@ -229,7 +229,7 @@ def warm_up():
click.secho("\n\nExecuting warm-up ...", fg="green")

ab_cmd = (
f"ab -c {execution_params['concurrency']} -n {execution_params['requests']/10} -k -p "
f"ab -c {execution_params['concurrency']} -s 300 -n {execution_params['requests']/10} -k -p "
f"{execution_params['tmp_dir']}/benchmark/input -T {execution_params['content_type']} "
f"{execution_params['inference_url']}/{execution_params['inference_model_url']} > "
f"{execution_params['result_file']}"
Expand All @@ -247,7 +247,7 @@ def run_benchmark():

click.secho("\n\nExecuting inference performance tests ...", fg="green")
ab_cmd = (
f"ab -c {execution_params['concurrency']} -n {execution_params['requests']} -k -p "
f"ab -c {execution_params['concurrency']} -s 300 -n {execution_params['requests']} -k -p "
f"{execution_params['tmp_dir']}/benchmark/input -T {execution_params['content_type']} "
f"{execution_params['inference_url']}/{execution_params['inference_model_url']} > "
f"{execution_params['result_file']}"
Expand Down
48 changes: 48 additions & 0 deletions benchmarks/benchmark_config_torch_compile_gpu.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
# Torchserve version to be installed. It can be one of the options
# - branch : "master"
# - nightly: "2022.3.16"
# - release: "0.5.3"
# Nightly build will be installed if "ts_version" is not specified
#ts_version:
#  branch: &ts_version "master"

# a list of model configure yaml files defined in benchmarks/models_config
# or a list of model configure yaml files with full path
models:
  - "bert_torch_compile_gpu.yaml"
  - "resnet50_torch_compile_gpu.yaml"
  - "vgg16_torch_compile_gpu.yaml"

# benchmark on "cpu" or "gpu".
# "cpu" is set if "hardware" is not specified
hardware: &hardware "gpu"

# load prometheus metrics report to remote storage or local different path if "metrics_cmd" is set.
# the command line to load prometheus metrics report to remote system.
# Here is an example of AWS cloudwatch command:
# Note:
# - keep the values order the same as the command definition.
# - set up the command before enabling `metrics_cmd`.
#   For example, aws client and AWS credentials need to be set up before trying this example.
metrics_cmd:
  - "cmd": "aws cloudwatch put-metric-data"
  - "--namespace": ["torchserve_benchmark_nightly_torch_compile_", *hardware]
  - "--region": "us-east-2"
  - "--metric-data": 'file:///tmp/benchmark/logs/stats_metrics.json'

# load report to remote storage or local different path if "report_cmd" is set.
# the command line to load report to remote storage.
# Here is an example of AWS cloudwatch command:
# Note:
# - keep the values order the same as the command.
# - set up the command before enabling `report_cmd`.
#   For example, aws client, AWS credentials and S3 bucket
#   need to be set up before trying this example.
# - "today()" is a keyword to apply the current date in the path
#   For example, the dest path in the following example is
#   s3://torchserve-model-serving/benchmark/2022-03-18/gpu
report_cmd:
  - "cmd": "aws s3 cp --recursive"
  - "source": '/tmp/ts_benchmark/'
  - "dest": ['s3://torchserve-benchmark/torch-compile-nightly', "today()", *hardware]

42 changes: 42 additions & 0 deletions benchmarks/models_config/bert_torch_compile_gpu.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
---
# Benchmark matrix for BERT: TorchScript baseline vs. torch.compile default
# mode, both sweeping batch sizes 1-16 on GPU.
bert:
  scripted_mode:
    benchmark_engine: "ab"
    url: https://torchserve.pytorch.org/mar_files/bert-scripted.mar
    workers:
      - 4
    batch_delay: 100
    batch_size:
      - 1
      - 2
      - 4
      - 8
      - 16
    input: "./examples/Huggingface_Transformers/Seq_classification_artifacts/sample_text_captum_input.txt"
    requests: 50000
    concurrency: 100
    backend_profiling: False
    exec_env: "local"
    processors:
      - "cpu"
      - "gpus": "all"
  torch_compile_default_mode:
    benchmark_engine: "ab"
    url: https://torchserve.pytorch.org/mar_files/bert-default.mar
    workers:
      - 4
    batch_delay: 100
    batch_size:
      - 1
      - 2
      - 4
      - 8
      - 16
    input: "./examples/Huggingface_Transformers/Seq_classification_artifacts/sample_text_captum_input.txt"
    requests: 50000
    concurrency: 100
    backend_profiling: False
    exec_env: "local"
    processors:
      - "cpu"
      - "gpus": "all"
42 changes: 42 additions & 0 deletions benchmarks/models_config/resnet50_torch_compile_gpu.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
---
# Benchmark matrix for ResNet-50: TorchScript baseline vs. torch.compile
# default mode, both sweeping batch sizes 1-16 on GPU.
resnet50:
  scripted_mode:
    benchmark_engine: "ab"
    url: https://torchserve.pytorch.org/mar_files/resnet-50-scripted.mar
    workers:
      - 4
    batch_delay: 100
    batch_size:
      - 1
      - 2
      - 4
      - 8
      - 16
    input: "./examples/image_classifier/kitten.jpg"
    requests: 10000
    concurrency: 100
    backend_profiling: False
    exec_env: "local"
    processors:
      - "cpu"
      - "gpus": "all"
  torch_compile_default_mode:
    benchmark_engine: "ab"
    url: https://torchserve.pytorch.org/mar_files/resnet-50-default.mar
    workers:
      - 4
    batch_delay: 100
    batch_size:
      - 1
      - 2
      - 4
      - 8
      - 16
    input: "./examples/image_classifier/kitten.jpg"
    requests: 10000
    concurrency: 100
    backend_profiling: False
    exec_env: "local"
    processors:
      - "cpu"
      - "gpus": "all"
42 changes: 42 additions & 0 deletions benchmarks/models_config/vgg16_torch_compile_gpu.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
---
# Benchmark matrix for VGG-16: TorchScript baseline vs. torch.compile
# default mode, both sweeping batch sizes 1-16 on GPU.
vgg16:
  scripted_mode:
    benchmark_engine: "ab"
    url: https://torchserve.pytorch.org/mar_files/vgg-16-scripted.mar
    workers:
      - 4
    batch_delay: 100
    batch_size:
      - 1
      - 2
      - 4
      - 8
      - 16
    input: "./examples/image_classifier/kitten.jpg"
    requests: 10000
    concurrency: 100
    backend_profiling: False
    exec_env: "local"
    processors:
      - "cpu"
      - "gpus": "all"
  torch_compile_default_mode:
    benchmark_engine: "ab"
    url: https://torchserve.pytorch.org/mar_files/vgg-16-default.mar
    workers:
      - 4
    batch_delay: 100
    batch_size:
      - 1
      - 2
      - 4
      - 8
      - 16
    input: "./examples/image_classifier/kitten.jpg"
    requests: 10000
    concurrency: 100
    backend_profiling: False
    exec_env: "local"
    processors:
      - "cpu"
      - "gpus": "all"

0 comments on commit 356704a

Please sign in to comment.