Benchmark profiles and multi-turn conversations (#1)
Hugoch authored Jan 28, 2025
1 parent 1b243ed commit 2355622
Showing 15 changed files with 725 additions and 199 deletions.
27 changes: 9 additions & 18 deletions .github/workflows/build.yaml
@@ -1,17 +1,9 @@
name: Build and push docker image to internal registry
name: Build and push docker image to registry

on:
workflow_call:
push:
branches:
- 'main'
tags:
- 'v*'
pull_request:
branches:
- "main"
concurrency:
group: ${{ github.workflow }}-${{ github.ref }}
group: ${{ github.workflow }}-${{ github.ref }}-build
cancel-in-progress: true
jobs:
build-and-push:
@@ -31,7 +23,6 @@ jobs:
install: true
buildkitd-config: /tmp/buildkitd.toml
- name: Login to GitHub Container Registry
if: github.event_name != 'pull_request'
uses: docker/login-action@v3
with:
registry: ghcr.io
@@ -44,9 +35,9 @@ jobs:
uses: docker/metadata-action@v5
with:
images: |
registry.internal.huggingface.tech/api-inference/inference-benchmarker
ghcr.io/huggingface/inference-benchmarker
tags: |
type=raw,value=sha-${{ env.GITHUB_SHA_SHORT }}${{ env.LABEL }}
type=raw,value=sha-${{ env.GITHUB_SHA_SHORT }}
# If main, release or tag
- name: Extract metadata (tags, labels) for Docker
if: ${{ github.event_name != 'pull_request' }}
@@ -58,10 +49,10 @@ jobs:
images: |
ghcr.io/huggingface/inference-benchmarker
tags: |
type=semver,pattern={{version}}${{ env.LABEL }}
type=semver,pattern={{major}}.{{minor}}${{ env.LABEL }}
type=raw,value=latest${{ env.LABEL }},enable=${{ github.ref == format('refs/heads/{0}', github.event.repository.default_branch) }}
type=raw,value=sha-${{ env.GITHUB_SHA_SHORT }}${{ env.LABEL }}
type=semver,pattern={{version}}
type=semver,pattern={{major}}.{{minor}}
type=raw,value=latest,enable=${{ github.ref == format('refs/heads/{0}', github.event.repository.default_branch) }}
type=raw,value=sha-${{ env.GITHUB_SHA_SHORT }}
- name: Build and push Docker image
id: build-and-push
uses: docker/build-push-action@v4
@@ -72,7 +63,7 @@ jobs:
platforms: 'linux/amd64'
build-args: |
GIT_SHA=${{ env.GITHUB_SHA }}
DOCKER_LABEL=sha-${{ env.GITHUB_SHA_SHORT }}${{ env.LABEL }}
DOCKER_LABEL=sha-${{ env.GITHUB_SHA_SHORT }}
PLATFORM=${{ env.PLATFORM }}
tags: ${{ steps.meta.outputs.tags || steps.meta-pr.outputs.tags }}
labels: ${{ steps.meta.outputs.labels || steps.meta-pr.outputs.labels }}
32 changes: 32 additions & 0 deletions .github/workflows/build_ci.yaml
@@ -0,0 +1,32 @@
name: Build inference-benchmarker

on:
workflow_dispatch:
workflow_call:
push:
branches:
- 'main'
tags:
- 'v*'
pull_request:
branches:
- "main"

concurrency:
group: ${{ github.workflow }}-${{ github.ref }}
cancel-in-progress: true

jobs:
rust:
name: Rust checks
permissions:
pull-requests: write
contents: read
uses: ./.github/workflows/rust.yaml
build:
permissions:
packages: write
contents: read
name: Build and push docker image
uses: ./.github/workflows/build.yaml
needs: rust
14 changes: 3 additions & 11 deletions .github/workflows/rust.yml → .github/workflows/rust.yaml
@@ -1,16 +1,8 @@
name: Rust checks

on:
workflow_dispatch:
push:
branches:
- 'main'
tags:
- 'v*'
pull_request:
paths:
- 'src/**'
- .github/workflows/rust.yml
workflow_call:

name: Rust checks
permissions:
pull-requests: write
contents: read
@@ -1,8 +1,8 @@
name: Secret Leaks

on:
push:

name: Secret Leaks

jobs:
trufflehog:
runs-on:
3 changes: 2 additions & 1 deletion Cargo.toml
@@ -34,6 +34,7 @@ serde_with = "3.9.0"
sysinfo = "0.31.4"
mockito = "1.5.0"
tabled = "=0.14"
uuid = { version = "1.11.0", features = ["v4", "fast-rng"] }

[build-dependencies]
vergen-gitcl = { version = "1.0.1" }
vergen-gitcl = { version = "1.0.1" }
27 changes: 22 additions & 5 deletions README.md
@@ -18,6 +18,7 @@ It can be used to benchmark any text generation server that exposes an OpenAI-co
* Broad Compatibility: Benchmarks any text generation server with an OpenAPI-compliant chat API.
* Automatic Sweep Mode: Detects maximum throughput and sweeps in-between.
* Open-Loop Benchmarking: Uses constant arrival rates to simulate real-world workloads.
* Benchmark profiles: Presets for benchmarking different model use cases (e.g. chat, summarization, code completion...).
* High-Performance: Built with Rust 🦀 for high-performance benchmarking.
* JSON Output: Delivers performance results in a structured, easy-to-analyze format.

@@ -34,6 +35,7 @@ It can be used to benchmark any text generation server that exposes an OpenAI-co
* [1. Start an inference server](#1-start-an-inference-server)
* [2. Run a benchmark using Docker image](#2-run-a-benchmark-using-docker-image)
* [Configure your benchmark](#configure-your-benchmark)
* [Profiles](#profiles)
* [Benchmark mode](#benchmark-mode)
* [Dataset configuration](#dataset-configuration)
* [Prompt configuration](#prompt-configuration)
@@ -79,6 +81,7 @@ docker run --runtime nvidia --gpus all \
```shell
MODEL=meta-llama/Llama-3.1-8B-Instruct
HF_TOKEN=<your HF READ token>
# run a benchmark to evaluate the performance of the model for the chat use case
# we mount results to the current directory
$ docker run \
--rm \
@@ -89,18 +92,32 @@ $ docker run \
ghcr.io/huggingface/inference-benchmarker:latest \
inference-benchmarker \
--tokenizer-name "$MODEL" \
--max-vus 800 \
--url http://localhost:8080 \
--warmup 20s \
--num-rates 10 \
--prompt-options "num_tokens=200,max_tokens=220,min_tokens=180,variance=10" \
--decode-options "num_tokens=200,max_tokens=220,min_tokens=180,variance=10"
--profile chat
```

Results will be saved in JSON format in the current directory.

### Configure your benchmark

#### Profiles

Profiles are presets that benchmark different model use cases. Available profiles:
- `chat`
Simulates a multi-turn chat scenario in which the model answers successive user prompts.
The model is prompted with the whole conversation history at each turn, so prefix caching has a significant impact
on the performance of this benchmark.
- `code-generation`
Simulates code-completion scenarios. The model is given large code snippets and asked to complete them with a few
tokens (e.g. a function name, a few lines of code).
- `classification`
Simulates cases where the model is repeatedly fed large chunks of business data or documents and users
ask simple questions about the content (summarization, classification...).
These use cases benefit greatly from prefix caching and chunked prefill.
- `fixed-length`
The model is sent fixed-length prompts to avoid the impact of variable-length tokenization on the benchmark.
This is a technical benchmark that evaluates the raw throughput of the model.
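
To make the presets concrete, the sketch below shows roughly what a profile stands for: a named bundle of the same prompt/decode settings that can otherwise be passed manually via `--prompt-options` and `--decode-options`. The helper name and the numeric values are illustrative assumptions only; the actual presets live in the `profiles` module added by this commit and may differ.

```rust
// Illustrative sketch only: a hypothetical mapping from a profile name to the
// equivalent CLI option strings. The real presets are defined in the profiles
// module; the values below are made up for the example.
fn profile_to_options(profile: &str) -> Option<(&'static str, &'static str)> {
    match profile {
        // (prompt options, decode options): balanced prompt and completion lengths
        "chat" => Some((
            "num_tokens=200,max_tokens=220,min_tokens=180,variance=10",
            "num_tokens=200,max_tokens=220,min_tokens=180,variance=10",
        )),
        // long prompts, short completions (assumed shape, not the real preset)
        "code-generation" => Some((
            "num_tokens=4000,max_tokens=4096,min_tokens=3000,variance=200",
            "num_tokens=16,max_tokens=32,min_tokens=8,variance=4",
        )),
        _ => None,
    }
}
```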

#### Benchmark mode

In the default mode, the tool runs a `sweep` benchmark. It first runs a throughput test to find the maximum throughput, then
3 changes: 2 additions & 1 deletion src/app.rs
@@ -328,7 +328,8 @@ impl Widget for &App {
Some(_) => "Manual".to_string(),
};
let config_text = Text::from(vec![Line::from(vec![
format!("Benchmark: {kind} | Max VUs: {max_vus} | Duration: {duration} sec | Rates: {rates} | Warmup: {warmup} sec",
format!("Profile: {profile} | Benchmark: {kind} | Max VUs: {max_vus} | Duration: {duration} sec | Rates: {rates} | Warmup: {warmup} sec",
profile = self.benchmark_config.profile.clone().unwrap_or("N/A".to_string()),
kind = self.benchmark_config.benchmark_kind,
max_vus = self.benchmark_config.max_vus,
duration = self.benchmark_config.duration.as_secs_f64(),
2 changes: 2 additions & 0 deletions src/benchmark.rs
@@ -71,6 +71,7 @@ pub struct BenchmarkConfig {
pub prompt_options: Option<TokenizeOptions>,
pub decode_options: Option<TokenizeOptions>,
pub tokenizer: String,
pub profile: Option<String>,
#[serde(rename = "meta")]
pub extra_metadata: Option<HashMap<String, String>>,
}
@@ -439,6 +440,7 @@ mod tests {
prompt_options: None,
decode_options: None,
tokenizer: "gpt2".to_string(),
profile: None,
extra_metadata: None,
},
backend,
20 changes: 19 additions & 1 deletion src/lib.rs
@@ -8,6 +8,7 @@ use std::sync::Arc;
pub use crate::app::run_console;
pub use crate::benchmark::{BenchmarkConfig, BenchmarkKind};
use crate::benchmark::{Event, MessageEvent};
pub use crate::profiles::apply_profile;
use crate::requests::OpenAITextGenerationBackend;
pub use crate::requests::TokenizeOptions;
use chrono::Local;
@@ -23,6 +24,7 @@ mod benchmark;
mod event;
mod executors;
mod flux;
mod profiles;
mod requests;
mod results;
mod scheduler;
@@ -32,6 +34,7 @@ mod writers;
pub struct RunConfiguration {
pub url: String,
pub tokenizer_name: String,
pub profile: Option<String>,
pub max_vus: u64,
pub duration: std::time::Duration,
pub rates: Option<Vec<f64>>,
@@ -48,10 +51,24 @@ pub struct RunConfiguration {
pub model_name: String,
}

pub async fn run(run_config: RunConfiguration, stop_sender: Sender<()>) -> anyhow::Result<()> {
pub async fn run(mut run_config: RunConfiguration, stop_sender: Sender<()>) -> anyhow::Result<()> {
info!("Starting benchmark");
// set process system limits
sysinfo::set_open_files_limit(0);
// apply profile if needed
run_config = match run_config.profile.clone() {
None => run_config,
Some(profile) => match apply_profile(profile.as_str(), run_config) {
Ok(config) => {
info!("Profile applied: {}", profile);
config
}
Err(e) => {
error!("Failed to apply profile: {:?}", e);
return Err(e);
}
},
};
// initialize tokenizer
let params = FromPretrainedParameters {
token: run_config.hf_token.clone(),
@@ -88,6 +105,7 @@ pub async fn run(run_config: RunConfiguration, stop_sender: Sender<()>) -> anyho
prompt_options: run_config.prompt_options.clone(),
decode_options: run_config.decode_options.clone(),
tokenizer: run_config.tokenizer_name.clone(),
profile: run_config.profile.clone(),
extra_metadata: run_config.extra_metadata.clone(),
};
config.validate()?;
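
The new `profiles` module itself is not part of the hunks shown above, so the following is a minimal sketch of what `apply_profile` could look like, assuming it only needs to match on the profile name and fill in the preset options on the `RunConfiguration` it receives. The signature is inferred from the call site in `run()`; the body is a placeholder, not the actual implementation.

```rust
// Sketch only: signature inferred from the call site in run(); the real
// implementation in the profiles module sets the actual presets.
pub fn apply_profile(profile: &str, config: RunConfiguration) -> anyhow::Result<RunConfiguration> {
    match profile {
        "chat" | "code-generation" | "classification" | "fixed-length" => {
            // A real preset would set config.prompt_options / config.decode_options
            // (and possibly other fields) before returning the updated config.
            Ok(config)
        }
        other => Err(anyhow::anyhow!("unknown profile: {other}")),
    }
}
```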