feat: Add results to console
Hugoch committed Oct 8, 2024
1 parent 6fe8735 commit 5519601
Showing 8 changed files with 142 additions and 28 deletions.
1 change: 1 addition & 0 deletions Cargo.toml
@@ -33,3 +33,4 @@ rayon = "1.10.0"
serde_with = "3.9.0"
sysinfo = "0.31.4"
mockito = "1.5.0"
tabled = "=0.14"
48 changes: 38 additions & 10 deletions README.md
@@ -1,17 +1,18 @@
# TGI Benchmark: A High-Performance Tool for Text Generation Model Benchmarking

Benchmarking inference servers for text generation models presents unique challenges.
The performance of these models can vary greatly depending on factors like input prompts,
decoding strategies, hardware specifications, and server configurations.

**TGI Benchmark** is designed to streamline this process by providing a comprehensive benchmarking tool
that evaluates the real-world performance of text generation models and servers.
With **TGI Benchmark**, you can easily test your model's throughput and efficiency under various workloads,
identify performance bottlenecks, and optimize your deployment for production environments.

It can be used to benchmark any text generation server that exposes an OpenAI-compliant API.

## Features

* Broad Compatibility: Benchmarks any text generation server with an OpenAI-compliant API.
* Automatic Sweep Mode: Detects maximum throughput and sweeps request rates in between.
* Open-Loop Benchmarking: Uses constant arrival rates to simulate real-world workloads.
@@ -40,13 +41,14 @@ It can be used to benchmark any text generation server that exposes an OpenAI-co
* [TODO](#todo)
<!-- TOC -->


## Get started

### Run a benchmark

#### 1. Start an inference server

**TGI**

```bash
MODEL=meta-llama/Llama-3.1-8B-Instruct
HF_TOKEN=<your HF READ token>
@@ -56,6 +58,7 @@ docker run --gpus all --shm-size 1g -p 8080:80 -e "HF_TOKEN=$HF_TOKEN" \
```

**vLLM**

```bash
MODEL=meta-llama/Llama-3.1-8B-Instruct
HF_TOKEN=<your HF READ token>
@@ -86,8 +89,8 @@ $ docker run \
--url http://localhost:8080 \
--warmup 20s \
--num-rates 10 \
--prompt-options "num_tokens=50,max_tokens=60,min_tokens=40,variance=10" \
--decode-options "num_tokens=50,max_tokens=60,min_tokens=40,variance=10"
--prompt-options "num_tokens=200,max_tokens=220,min_tokens=180,variance=10" \
--decode-options "num_tokens=200,max_tokens=220,min_tokens=180,variance=10"
```

Results will be saved in JSON format in the current directory.
@@ -105,6 +108,32 @@ Available modes:
- `rate`: runs a benchmark at a fixed request rate
- `throughput`: runs a benchmark at a fixed throughput (constant VUs)

Example running a benchmark at fixed request rates:

```shell
MODEL=meta-llama/Llama-3.1-8B-Instruct
HF_TOKEN=<your HF READ token>
$ docker run \
--rm \
-it \
--net host \
-v $(pwd):/opt/text-generation-inference-benchmark/results \
-e "HF_TOKEN=$HF_TOKEN" \
ghcr.io/huggingface/text-generation-inference-benchmark:latest \
text-generation-inference-benchmark \
--tokenizer-name "meta-llama/Llama-3.1-8B-Instruct" \
--max-vus 800 \
--duration 120s \
--url http://localhost:8080 \
--warmup 30s \
--benchmark-kind rate \
--rates 1.0 \
--rates 5.0 \
--rates 10.0 \
--prompt-options "num_tokens=200,max_tokens=220,min_tokens=180,variance=10" \
--decode-options "num_tokens=200,max_tokens=220,min_tokens=180,variance=10"
```

#### Dataset configuration

Prompts are sampled from a Hugging Face dataset file, using a [subset of ShareGPT
@@ -186,7 +215,7 @@ $ make build


* **Why do I get a high error rate when running the `throughput` benchmark?**

The throughput benchmark tries to saturate the server with a high request rate. The error rate is high because the server
is not able to handle the request rate or is rate limiting the requests.
In the case of TGI, this is controlled by the `--max-concurrent-requests` option.
@@ -199,7 +228,6 @@ $ make build
If your CUDA graphs are not evenly distributed, you may see a performance drop at some request rates, as the batch size may
fall into a bigger CUDA graph batch size, leading to a loss of compute due to excessive padding.


## TODO

- [X] Customizable token count and variance
2 changes: 1 addition & 1 deletion src/benchmark.rs
@@ -50,7 +50,7 @@ pub struct Benchmark {
backend: Box<dyn TextGenerationBackend + Send + Sync>,
requests: Arc<Mutex<dyn TextRequestGenerator + Send>>,
report: BenchmarkReport,
config: BenchmarkConfig,
pub(crate) config: BenchmarkConfig,
event_bus: mpsc::UnboundedSender<Event>,
stop_sender: broadcast::Sender<()>,
}
19 changes: 17 additions & 2 deletions src/lib.rs
@@ -1,5 +1,6 @@
use std::collections::HashMap;
use std::fs::File;
use std::io;
use std::io::Write;
use std::path::Path;
use std::sync::Arc;
@@ -10,6 +11,7 @@ use crate::benchmark::{Event, MessageEvent};
use crate::requests::OpenAITextGenerationBackend;
pub use crate::requests::TokenizeOptions;
use chrono::Local;
use crossterm::ExecutableCommand;
use log::{debug, error, info, Level, LevelFilter};
use tokenizers::{FromPretrainedParameters, Tokenizer};
use tokio::sync::broadcast::Sender;
@@ -26,6 +28,7 @@ mod results;
mod scheduler;
mod tokens;
mod writers;
mod table;

pub struct RunConfiguration {
pub url: String,
@@ -144,7 +147,7 @@ pub async fn run(run_config: RunConfiguration, stop_sender: Sender<()>) -> anyho
run_config.dataset_file,
run_config.hf_token.clone(),
)
.expect("Can't download dataset");
.expect("Can't download dataset");
let requests = requests::ConversationTextRequestGenerator::load(
filepath,
run_config.tokenizer_name.clone(),
@@ -189,6 +192,18 @@ pub async fn run(run_config: RunConfiguration, stop_sender: Sender<()>) -> anyho
// quit app if not interactive
let _ = stop_sender.send(());
}
ui_thread.await.unwrap();
ui_thread.await?;

// Revert terminal to original view
io::stdout().execute(ratatui::crossterm::terminal::LeaveAlternateScreen)?;
ratatui::crossterm::terminal::disable_raw_mode()?;
io::stdout().execute(ratatui::crossterm::cursor::Show)?;

let report = benchmark.get_report();
let writer = BenchmarkReportWriter::new(config.clone(), report)?;
writer.stdout().await;

Ok(())
}
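
The new lines above leave ratatui's alternate screen and restore the terminal before the report is printed, so the results table lands in the normal scrollback instead of being wiped together with the TUI. A minimal standalone sketch of that restore sequence, using `crossterm` directly rather than the `ratatui::crossterm` re-export (an illustration, not the crate's exact code):

```rust
use std::io;
use crossterm::{cursor, terminal, ExecutableCommand};

/// Return the terminal to its normal state after a TUI session so that
/// anything printed afterwards stays visible in the scrollback buffer.
fn restore_terminal() -> io::Result<()> {
    io::stdout().execute(terminal::LeaveAlternateScreen)?;
    terminal::disable_raw_mode()?;
    io::stdout().execute(cursor::Show)?;
    Ok(())
}
```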


13 changes: 7 additions & 6 deletions src/main.rs
@@ -1,6 +1,6 @@
use clap::error::ErrorKind::InvalidValue;
use clap::{Error, Parser};
use log::debug;
use log::{debug, error};
use reqwest::Url;
use std::collections::HashMap;
use std::time::Duration;
@@ -17,7 +17,7 @@ struct Args {
#[clap(default_value = "128", short, long, env)]
max_vus: u64,
/// The duration of each benchmark step
#[clap(default_value = "60s", short, long, env)]
#[clap(default_value = "120s", short, long, env)]
#[arg(value_parser = parse_duration)]
duration: Duration,
/// A list of rates of requests to send per second (only valid for the ConstantArrivalRate benchmark).
@@ -51,9 +51,9 @@ struct Args {
/// * max_tokens: maximum number of prompt tokens
/// * variance: variance in the number of prompt tokens
///
/// Example: num_tokens=50,max_tokens=60,min_tokens=40,variance=10
/// Example: num_tokens=200,max_tokens=210,min_tokens=190,variance=10
#[clap(
default_value = "num_tokens=50,max_tokens=60,min_tokens=40,variance=10",
default_value = "num_tokens=200,max_tokens=210,min_tokens=190,variance=10",
long,
env,
value_parser(parse_tokenizer_options)
@@ -67,9 +67,9 @@ struct Args {
/// * max_tokens: maximum number of generated tokens
/// * variance: variance in the number of generated tokens
///
/// Example: num_tokens=50,max_tokens=60,min_tokens=40,variance=10
/// Example: num_tokens=200,max_tokens=210,min_tokens=190,variance=10
#[clap(
default_value = "num_tokens=50,max_tokens=60,min_tokens=40,variance=10",
default_value = "num_tokens=200,max_tokens=210,min_tokens=190,variance=10",
long,
env,
value_parser(parse_tokenizer_options)
@@ -190,6 +190,7 @@ async fn main() {
match run(run_config, stop_sender_clone).await {
Ok(_) => {}
Err(e) => {
error!("Fatal: {:?}", e);
println!("Fatal: {:?}", e)
}
};
30 changes: 22 additions & 8 deletions src/requests.rs
@@ -1,3 +1,4 @@
use std::fmt::Display;
use async_trait::async_trait;
use futures_util::StreamExt;
use hf_hub::api::sync::ApiBuilder;
@@ -302,6 +303,19 @@ impl Default for TokenizeOptions {
}
}

impl Display for TokenizeOptions {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
write!(
f,
"num_tokens={num_tokens},min_tokens={min_tokens},max_tokens={max_tokens},variance={variance}",
num_tokens = self.num_tokens,
min_tokens = self.min_tokens,
max_tokens = self.max_tokens,
variance = self.variance
)
}
}
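
This `Display` implementation renders the prompt and decode options back into the same `key=value` string format the CLI accepts, which is what the new parameters table prints. A standalone sketch of that round trip (the field types here are assumptions, not copied from the crate):

```rust
use std::fmt;

// Hypothetical mirror of TokenizeOptions; field types are assumed.
struct TokenizeOptions {
    num_tokens: u64,
    min_tokens: u64,
    max_tokens: u64,
    variance: u64,
}

impl fmt::Display for TokenizeOptions {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        write!(
            f,
            "num_tokens={},min_tokens={},max_tokens={},variance={}",
            self.num_tokens, self.min_tokens, self.max_tokens, self.variance
        )
    }
}

fn main() {
    let opts = TokenizeOptions { num_tokens: 200, min_tokens: 180, max_tokens: 220, variance: 10 };
    // Same format as the values passed to --prompt-options / --decode-options.
    assert_eq!(
        opts.to_string(),
        "num_tokens=200,min_tokens=180,max_tokens=220,variance=10"
    );
}
```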

impl ConversationTextRequestGenerator {
pub fn load(
filepath: PathBuf,
@@ -335,7 +349,7 @@ impl ConversationTextRequestGenerator {
ProgressStyle::with_template(
"Tokenizing prompts [{elapsed_precise}] {bar:40.cyan/blue} {pos:>7}/{len:7} {msg}",
)
.unwrap(),
.unwrap(),
);
split(data, entry_splitter).for_each(|subrange| {
for entry in subrange {
@@ -668,7 +682,7 @@ mod tests {
"gpt2".to_string(),
tokenizer,
)
.unwrap();
.unwrap();
let request = TextGenerationRequest {
prompt: "Hello, world!".to_string(),
num_prompt_tokens: 2,
@@ -726,7 +740,7 @@ mod tests {
"gpt2".to_string(),
tokenizer,
)
.unwrap();
.unwrap();
let request = TextGenerationRequest {
prompt: "Hello, world!".to_string(),
num_prompt_tokens: 2,
@@ -778,7 +792,7 @@ mod tests {
assert!(
inter_token_latency_avg > expected_inter_token_latency_avg
&& inter_token_latency_avg
< expected_inter_token_latency_avg + inter_token_latency_overhead,
< expected_inter_token_latency_avg + inter_token_latency_overhead,
"inter_token_latency_avg: {:?} < {:?} < {:?}",
expected_inter_token_latency_avg,
inter_token_latency_avg,
@@ -810,7 +824,7 @@ mod tests {
"gpt2".to_string(),
tokenizer,
)
.unwrap();
.unwrap();
let request = TextGenerationRequest {
prompt: "Hello, world!".to_string(),
num_prompt_tokens: 2,
@@ -855,7 +869,7 @@ mod tests {
"gpt2".to_string(),
tokenizer,
)
.unwrap();
.unwrap();
let request = TextGenerationRequest {
prompt: "Hello, world!".to_string(),
num_prompt_tokens: 2,
@@ -900,7 +914,7 @@ mod tests {
"gpt2".to_string(),
tokenizer,
)
.unwrap();
.unwrap();
let request = TextGenerationRequest {
prompt: "Hello, world!".to_string(),
num_prompt_tokens: 2,
@@ -948,7 +962,7 @@ mod tests {
"gpt2".to_string(),
tokenizer,
)
.unwrap();
.unwrap();
let request = TextGenerationRequest {
prompt: "Hello, world!".to_string(),
num_prompt_tokens: 2,
45 changes: 45 additions & 0 deletions src/table.rs
@@ -0,0 +1,45 @@
use tabled::builder::Builder;
use crate::BenchmarkConfig;
use crate::results::BenchmarkReport;

pub fn parameters_table(benchmark: BenchmarkConfig) -> tabled::Table {
let mut builder = Builder::default();
let rates = benchmark.rates.map_or("N/A".to_string(), |e| format!("{:?}", e));
let prompt_options = benchmark.prompt_options.map_or("N/A".to_string(), |e| format!("{}", e));
let decode_options = benchmark.decode_options.map_or("N/A".to_string(), |e| format!("{}", e));
let extra_metadata = benchmark.extra_metadata.map_or("N/A".to_string(), |e| format!("{:?}", e));
builder.set_header(vec!["Parameter", "Value"]);
builder.push_record(vec!["Max VUs", benchmark.max_vus.to_string().as_str()]);
builder.push_record(vec!["Duration", benchmark.duration.as_secs().to_string().as_str()]);
builder.push_record(vec!["Warmup Duration", benchmark.warmup_duration.as_secs().to_string().as_str()]);
builder.push_record(vec!["Benchmark Kind", benchmark.benchmark_kind.to_string().as_str()]);
builder.push_record(vec!["Rates", rates.as_str()]);
builder.push_record(vec!["Num Rates", benchmark.num_rates.to_string().as_str()]);
builder.push_record(vec!["Prompt Options", prompt_options.as_str()]);
builder.push_record(vec!["Decode Options", decode_options.as_str()]);
builder.push_record(vec!["Tokenizer", benchmark.tokenizer.to_string().as_str()]);
builder.push_record(vec!["Extra Metadata", extra_metadata.as_str()]);
let mut table = builder.build();
table.with(tabled::settings::Style::sharp());
table
}

pub fn results_table(benchmark: BenchmarkReport) -> tabled::Table {
let mut builder = Builder::default();
builder.set_header(vec!["Benchmark", "QPS", "E2E Latency", "TTFT", "ITL", "Throughput", "Error Rate"]);
let results = benchmark.get_results();
for result in results {
let qps = format!("{:.2} req/s", result.successful_request_rate().unwrap());
let e2e = format!("{:.2} sec", result.e2e_latency_avg().unwrap().as_secs_f64());
let ttft = format!("{:.2} ms", result.time_to_first_token_avg().unwrap().as_micros() as f64 / 1000.0);
let itl = format!("{:.2} ms", result.inter_token_latency_avg().unwrap().as_micros() as f64 / 1000.0);
let throughput = format!("{:.2} tokens/sec", result.token_throughput_secs().unwrap());
// Compute the error rate in f64 so integer request counts don't truncate it to zero.
let error_rate = result.failed_requests() as f64 / result.total_requests() as f64 * 100.0;
let error_rate = format!("{:.2}%", error_rate);
builder.push_record(vec![result.id.as_str(), qps.as_str(), e2e.as_str(), ttft.as_str(), itl.as_str(), throughput.as_str(), error_rate.as_str()]);
}
let mut table = builder.build();
table.with(tabled::settings::Style::sharp());
table
}
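
Both helpers follow the same `tabled` builder pattern: push rows as string records, build the table, then apply a style. A minimal standalone sketch of that pattern (the rows here are invented for illustration):

```rust
use tabled::builder::Builder;
use tabled::settings::Style;

fn main() {
    // Build a small two-column table row by row, as parameters_table does.
    let mut builder = Builder::default();
    builder.set_header(vec!["Parameter", "Value"]);
    builder.push_record(vec!["Max VUs", "800"]);
    builder.push_record(vec!["Duration", "120"]);

    let mut table = builder.build();
    table.with(Style::sharp());
    // Table implements Display, so it can be printed directly to the console.
    println!("{table}");
}
```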