Benchmark profiles and multi-turn conversations (#1)
Hugoch authored Jan 28, 2025
1 parent 1b243ed commit 2355622
Showing 15 changed files with 725 additions and 199 deletions.
27 changes: 9 additions & 18 deletions .github/workflows/build.yaml
@@ -1,17 +1,9 @@
name: Build and push docker image to internal registry
name: Build and push docker image to registry

on:
workflow_call:
push:
branches:
- 'main'
tags:
- 'v*'
pull_request:
branches:
- "main"
concurrency:
group: ${{ github.workflow }}-${{ github.ref }}
group: ${{ github.workflow }}-${{ github.ref }}-build
cancel-in-progress: true
jobs:
build-and-push:
@@ -31,7 +23,6 @@ jobs:
install: true
buildkitd-config: /tmp/buildkitd.toml
- name: Login to GitHub Container Registry
if: github.event_name != 'pull_request'
uses: docker/login-action@v3
with:
registry: ghcr.io
@@ -44,9 +35,9 @@ jobs:
uses: docker/metadata-action@v5
with:
images: |
registry.internal.huggingface.tech/api-inference/inference-benchmarker
ghcr.io/huggingface/inference-benchmarker
tags: |
type=raw,value=sha-${{ env.GITHUB_SHA_SHORT }}${{ env.LABEL }}
type=raw,value=sha-${{ env.GITHUB_SHA_SHORT }}
# If main, release or tag
- name: Extract metadata (tags, labels) for Docker
if: ${{ github.event_name != 'pull_request' }}
@@ -58,10 +49,10 @@ jobs:
images: |
ghcr.io/huggingface/inference-benchmarker
tags: |
type=semver,pattern={{version}}${{ env.LABEL }}
type=semver,pattern={{major}}.{{minor}}${{ env.LABEL }}
type=raw,value=latest${{ env.LABEL }},enable=${{ github.ref == format('refs/heads/{0}', github.event.repository.default_branch) }}
type=raw,value=sha-${{ env.GITHUB_SHA_SHORT }}${{ env.LABEL }}
type=semver,pattern={{version}}
type=semver,pattern={{major}}.{{minor}}
type=raw,value=latest,enable=${{ github.ref == format('refs/heads/{0}', github.event.repository.default_branch) }}
type=raw,value=sha-${{ env.GITHUB_SHA_SHORT }}
- name: Build and push Docker image
id: build-and-push
uses: docker/build-push-action@v4
@@ -72,7 +63,7 @@ jobs:
platforms: 'linux/amd64'
build-args: |
GIT_SHA=${{ env.GITHUB_SHA }}
DOCKER_LABEL=sha-${{ env.GITHUB_SHA_SHORT }}${{ env.LABEL }}
DOCKER_LABEL=sha-${{ env.GITHUB_SHA_SHORT }}
PLATFORM=${{ env.PLATFORM }}
tags: ${{ steps.meta.outputs.tags || steps.meta-pr.outputs.tags }}
labels: ${{ steps.meta.outputs.labels || steps.meta-pr.outputs.labels }}
32 changes: 32 additions & 0 deletions .github/workflows/build_ci.yaml
@@ -0,0 +1,32 @@
name: Build inference-benchmarker

on:
workflow_dispatch:
workflow_call:
push:
branches:
- 'main'
tags:
- 'v*'
pull_request:
branches:
- "main"

concurrency:
group: ${{ github.workflow }}-${{ github.ref }}
cancel-in-progress: true

jobs:
rust:
name: Rust checks
permissions:
pull-requests: write
contents: read
uses: ./.github/workflows/rust.yaml
build:
permissions:
packages: write
contents: read
name: Build and push docker image
uses: ./.github/workflows/build.yaml
needs: rust
14 changes: 3 additions & 11 deletions .github/workflows/rust.yml → .github/workflows/rust.yaml
@@ -1,16 +1,8 @@
name: Rust checks

on:
workflow_dispatch:
push:
branches:
- 'main'
tags:
- 'v*'
pull_request:
paths:
- 'src/**'
- .github/workflows/rust.yml
workflow_call:

name: Rust checks
permissions:
pull-requests: write
contents: read
@@ -1,8 +1,8 @@
name: Secret Leaks

on:
push:

name: Secret Leaks

jobs:
trufflehog:
runs-on:
3 changes: 2 additions & 1 deletion Cargo.toml
@@ -34,6 +34,7 @@ serde_with = "3.9.0"
sysinfo = "0.31.4"
mockito = "1.5.0"
tabled = "=0.14"
uuid = { version = "1.11.0", features = ["v4", "fast-rng"] }

[build-dependencies]
vergen-gitcl = { version = "1.0.1" }
vergen-gitcl = { version = "1.0.1" }
27 changes: 22 additions & 5 deletions README.md
@@ -18,6 +18,7 @@ It can be used to benchmark any text generation server that exposes an OpenAI-co
* Broad Compatibility: Benchmarks any text generation server with an OpenAPI-compliant chat API.
* Automatic Sweep Mode: Detects maximum throughput and sweeps in-between.
* Open-Loop Benchmarking: Uses constant arrival rates to simulate real-world workloads.
* Benchmark profiles: Presets for benchmarking different model use cases (e.g. chat, summarization, code completion...).
* High-Performance: Built with Rust 🦀 for high-performance benchmarking.
* JSON Output: Delivers performance results in a structured, easy-to-analyze format.

@@ -34,6 +35,7 @@ It can be used to benchmark any text generation server that exposes an OpenAI-co
* [1. Start an inference server](#1-start-an-inference-server)
* [2. Run a benchmark using Docker image](#2-run-a-benchmark-using-docker-image)
* [Configure your benchmark](#configure-your-benchmark)
* [Profiles](#profiles)
* [Benchmark mode](#benchmark-mode)
* [Dataset configuration](#dataset-configuration)
* [Prompt configuration](#prompt-configuration)
@@ -79,6 +81,7 @@ docker run --runtime nvidia --gpus all \
```shell
MODEL=meta-llama/Llama-3.1-8B-Instruct
HF_TOKEN=<your HF READ token>
# run a benchmark to evaluate the performance of the model for the chat use case
# we mount results to the current directory
$ docker run \
--rm \
@@ -89,18 +92,32 @@ $ docker run \
ghcr.io/huggingface/inference-benchmarker:latest \
inference-benchmarker \
--tokenizer-name "$MODEL" \
--max-vus 800 \
--url http://localhost:8080 \
--warmup 20s \
--num-rates 10 \
--prompt-options "num_tokens=200,max_tokens=220,min_tokens=180,variance=10" \
--decode-options "num_tokens=200,max_tokens=220,min_tokens=180,variance=10"
--profile chat
```

Results will be saved in JSON format in the current directory.

### Configure your benchmark

#### Profiles

Profiles are presets that benchmark different model use cases. Available profiles:
- `chat`
Simulates a multi-turn chat scenario in which the model answers successive user prompts.
The model is prompted with the whole conversation history at each turn, so prefix caching has a significant impact
on the performance of this benchmark.
- `code-generation`
Simulates code-completion scenarios. The model is given large code snippets and asked to complete them with a few
tokens (e.g. a function name, a few lines of code).
- `classification`
Simulates cases where the model is repeatedly fed large chunks of business data or documents and users
ask simple questions about the content (summarization, classification...).
These use cases benefit greatly from prefix caching and chunked prefill.
- `fixed-length`
The model is sent fixed-length prompts to avoid the impact of variable-length tokenization on the benchmark.
This is a technical benchmark that evaluates the raw throughput of the model.
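
To make the presets concrete, the sketch below shows roughly what a profile stands for: a named bundle of the same prompt/decode settings that can otherwise be passed manually via `--prompt-options` and `--decode-options`. The helper name and the numeric values are illustrative assumptions only; the actual presets live in the `profiles` module added by this commit and may differ.

```rust
// Illustrative sketch only: a hypothetical mapping from a profile name to the
// equivalent CLI option strings. The real presets are defined in the profiles
// module; the values below are made up for the example.
fn profile_to_options(profile: &str) -> Option<(&'static str, &'static str)> {
    match profile {
        // (prompt options, decode options): balanced prompt and completion lengths
        "chat" => Some((
            "num_tokens=200,max_tokens=220,min_tokens=180,variance=10",
            "num_tokens=200,max_tokens=220,min_tokens=180,variance=10",
        )),
        // long prompts, short completions (assumed shape, not the real preset)
        "code-generation" => Some((
            "num_tokens=4000,max_tokens=4096,min_tokens=3000,variance=200",
            "num_tokens=16,max_tokens=32,min_tokens=8,variance=4",
        )),
        _ => None,
    }
}
```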

#### Benchmark mode

In the default mode, the tool runs a `sweep` benchmark. It first runs a throughput test to find the maximum throughput, then
3 changes: 2 additions & 1 deletion src/app.rs
@@ -328,7 +328,8 @@ impl Widget for &App {
Some(_) => "Manual".to_string(),
};
let config_text = Text::from(vec![Line::from(vec![
format!("Benchmark: {kind} | Max VUs: {max_vus} | Duration: {duration} sec | Rates: {rates} | Warmup: {warmup} sec",
format!("Profile: {profile} | Benchmark: {kind} | Max VUs: {max_vus} | Duration: {duration} sec | Rates: {rates} | Warmup: {warmup} sec",
profile = self.benchmark_config.profile.clone().unwrap_or("N/A".to_string()),
kind = self.benchmark_config.benchmark_kind,
max_vus = self.benchmark_config.max_vus,
duration = self.benchmark_config.duration.as_secs_f64(),
2 changes: 2 additions & 0 deletions src/benchmark.rs
@@ -71,6 +71,7 @@ pub struct BenchmarkConfig {
pub prompt_options: Option<TokenizeOptions>,
pub decode_options: Option<TokenizeOptions>,
pub tokenizer: String,
pub profile: Option<String>,
#[serde(rename = "meta")]
pub extra_metadata: Option<HashMap<String, String>>,
}
@@ -439,6 +440,7 @@ mod tests {
prompt_options: None,
decode_options: None,
tokenizer: "gpt2".to_string(),
profile: None,
extra_metadata: None,
},
backend,
20 changes: 19 additions & 1 deletion src/lib.rs
@@ -8,6 +8,7 @@ use std::sync::Arc;
pub use crate::app::run_console;
pub use crate::benchmark::{BenchmarkConfig, BenchmarkKind};
use crate::benchmark::{Event, MessageEvent};
pub use crate::profiles::apply_profile;
use crate::requests::OpenAITextGenerationBackend;
pub use crate::requests::TokenizeOptions;
use chrono::Local;
@@ -23,6 +24,7 @@ mod benchmark;
mod event;
mod executors;
mod flux;
mod profiles;
mod requests;
mod results;
mod scheduler;
@@ -32,6 +34,7 @@ mod writers;
pub struct RunConfiguration {
pub url: String,
pub tokenizer_name: String,
pub profile: Option<String>,
pub max_vus: u64,
pub duration: std::time::Duration,
pub rates: Option<Vec<f64>>,
@@ -48,10 +51,24 @@ pub struct RunConfiguration {
pub model_name: String,
}

pub async fn run(run_config: RunConfiguration, stop_sender: Sender<()>) -> anyhow::Result<()> {
pub async fn run(mut run_config: RunConfiguration, stop_sender: Sender<()>) -> anyhow::Result<()> {
info!("Starting benchmark");
// set process system limits
sysinfo::set_open_files_limit(0);
// apply profile if needed
run_config = match run_config.profile.clone() {
None => run_config,
Some(profile) => match apply_profile(profile.as_str(), run_config) {
Ok(config) => {
info!("Profile applied: {}", profile);
config
}
Err(e) => {
error!("Failed to apply profile: {:?}", e);
return Err(e);
}
},
};
// initialize tokenizer
let params = FromPretrainedParameters {
token: run_config.hf_token.clone(),
@@ -88,6 +105,7 @@ pub async fn run(run_config: RunConfiguration, stop_sender: Sender<()>) -> anyho
prompt_options: run_config.prompt_options.clone(),
decode_options: run_config.decode_options.clone(),
tokenizer: run_config.tokenizer_name.clone(),
profile: run_config.profile.clone(),
extra_metadata: run_config.extra_metadata.clone(),
};
config.validate()?;
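
The new `profiles` module itself is not part of the hunks shown above, so the following is a minimal sketch of what `apply_profile` could look like, assuming it only needs to match on the profile name and fill in the preset options on the `RunConfiguration` it receives. The signature is inferred from the call site in `run()`; the body is a placeholder, not the actual implementation.

```rust
// Sketch only: signature inferred from the call site in run(); the real
// implementation in the profiles module sets the actual presets.
pub fn apply_profile(profile: &str, config: RunConfiguration) -> anyhow::Result<RunConfiguration> {
    match profile {
        "chat" | "code-generation" | "classification" | "fixed-length" => {
            // A real preset would set config.prompt_options / config.decode_options
            // (and possibly other fields) before returning the updated config.
            Ok(config)
        }
        other => Err(anyhow::anyhow!("unknown profile: {other}")),
    }
}
```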