Commit

add benchmark
gautierdag committed Dec 13, 2023
1 parent f909b74 commit 96bc4b3
Showing 6 changed files with 1,069 additions and 662 deletions.
2 changes: 2 additions & 0 deletions .gitignore
@@ -1,4 +1,6 @@
out/
.nox/
.benchmarks/

# Generated by Cargo
# will have compiled files and executables
1,600 changes: 1,000 additions & 600 deletions benchmarks/data/c4.jsonl

Large diffs are not rendered by default.

118 changes: 60 additions & 58 deletions benchmarks/train.py
@@ -1,36 +1,46 @@
import dataclasses
import glob
import json
import pytest
import logging
import os
import sys
import itertools
import glob
import dataclasses
import time
from contextlib import contextmanager
from pathlib import Path

import tokenizers
from tokenizers import Regex, Tokenizer, decoders, pre_tokenizers
from tokenizers.models import BPE
from tokenizers.trainers import BpeTrainer
from tqdm import tqdm

import bpeasy

logging.basicConfig(stream=sys.stdout, level=logging.DEBUG)


@contextmanager
def suppress_stdout():
with open(os.devnull, "w") as devnull:
old_stdout = sys.stdout
sys.stdout = devnull
try:
yield
finally:
sys.stdout = old_stdout


@dataclasses.dataclass
class TrainBPETokenizerArgs:
datasets: str = "./benchmarks/data"
dataset: str = "./benchmarks/data"

num_characters: int = 1000
vocab_size: int = 1024
max_sentencepiece_length: int = 32
vocab_size: int = 32_000
max_sentencepiece_length: int = 64
normalization_rule_name: str = "gpt"

def __post_init__(self):
datasets = self.datasets.split(",")
for ckpt in datasets:
checkpoint_dir = Path(ckpt)
assert checkpoint_dir.is_dir(), checkpoint_dir
checkpoint_dir = Path(self.dataset)
assert checkpoint_dir.is_dir(), checkpoint_dir

assert self.normalization_rule_name in [
"gpt",
@@ -56,27 +66,23 @@ def get_content_key(path: str) -> str:


def jsonl_content_iterator(
file_path: str,
character_limit=2_000_000,
args: TrainBPETokenizerArgs,
):
"""
Iterates over a jsonl file and yields the content of each line
Tracks the number of characters yielded and stops when the limit is reached
This is ripe for optimisation if you want to mess with more fine-grained
character limits (eg. more Python than Java)
"""
logging.info(f"Creating iterator for {character_limit} characters in {file_path}")
file_path = args.dataset
chunk_num, character_count = 0, 0
chunks = glob.glob(f"{file_path}/*.jsonl")
logging.info(f"Found {len(chunks)} chunks")

while character_count < character_limit and chunk_num < len(chunks):
while chunk_num < len(chunks):
file_name = chunks[chunk_num]
content_key = get_content_key(file_name)
with open(file_name, "r", encoding="utf-8") as f:
for line in f:
if character_count >= character_limit: # stop after limit
break
try:
obj = json.loads(line)
text = obj[content_key]
@@ -88,28 +94,6 @@ def jsonl_content_iterator(
chunk_num += 1


def mix_jsonl_content_iterator(args: TrainBPETokenizerArgs):
datasets = []
num_datasets = len(args.datasets.split(","))
for dataset in args.datasets.split(","):
datasets.append((dataset, args.code_percentage / num_datasets))

# Create iterators
iterators = []
total_weight = sum([t[1] for t in datasets])
for file_path, percentage in datasets:
effective_limit = int((percentage / total_weight) * args.num_characters)
assert effective_limit > 0
it = jsonl_content_iterator(
file_path,
effective_limit,
)
iterators.append(it)

# Chain iterators together
return itertools.chain(*iterators)


def get_regex_from_normalization_rule_name(normalization_rule_name: str) -> str:
# GPT4 regex
if normalization_rule_name == "gpt":
@@ -127,20 +111,15 @@ def get_regex_from_normalization_rule_name(normalization_rule_name: str) -> str:
raise ValueError(f"Unknown normalization_rule_name {normalization_rule_name}")


@pytest.fixture(scope="session")
def args() -> str:
return TrainBPETokenizerArgs()


def test_train_huggingface(benchmark, args: TrainBPETokenizerArgs):
def train_huggingface(args: TrainBPETokenizerArgs):
# should be at least 0.14.0 to train with char limit
assert tokenizers.__version__ >= "0.14.0"
tokenizer = Tokenizer(BPE(byte_fallback=True))
trainer = BpeTrainer(
vocab_size=args.vocab_size,
show_progress=True,
special_tokens=[f"<0x{i:02X}>" for i in range(256)], # seed sm vocab
max_token_length=args.max_sentencepiece_length,
show_progress=False,
)
regex_expression = get_regex_from_normalization_rule_name(
args.normalization_rule_name
@@ -160,24 +139,47 @@ def test_train_huggingface(benchmark, args: TrainBPETokenizerArgs):
tokenizer.decoder = decoders.Sequence(
[decoders.ByteLevel(), decoders.ByteFallback()]
)
iterator = mix_jsonl_content_iterator(args)
iterator = jsonl_content_iterator(args)
# training the tokenizer
benchmark(
tokenizer.train_from_iterator,
iterator,
trainer,
)
with suppress_stdout():
tokenizer.train_from_iterator(iterator, trainer)


def test_train_bpeasy(benchmark, args: TrainBPETokenizerArgs):
def train_bpeasy(args: TrainBPETokenizerArgs):
# Use ByteLevel Decoder
iterator = mix_jsonl_content_iterator(args)
iterator = jsonl_content_iterator(args)
# training the tokenizer
regex = get_regex_from_normalization_rule_name(args.normalization_rule_name)
benchmark(
bpeasy.train_bpe,

bpeasy.train_bpe(
iterator,
regex,
args.max_sentencepiece_length,
args.vocab_size,
)


if __name__ == "__main__":
NUM_ITERATIONS = 100
args = TrainBPETokenizerArgs()

times_huggingface = []
times_bpeasy = []
for i in tqdm(range(NUM_ITERATIONS)):
time_now = time.time()
train_huggingface(args)
times_huggingface.append(time.time() - time_now)

time_now = time.time()
train_bpeasy(args)
times_bpeasy.append(time.time() - time_now)

avg_time_huggingface = sum(times_huggingface) / len(times_huggingface)
avg_time_bpeasy = sum(times_bpeasy) / len(times_bpeasy)
# standard deviation of the measured wall-clock times
std_dev_huggingface = (
sum((t - avg_time_huggingface) ** 2 for t in times_huggingface) / len(times_huggingface)
) ** 0.5
std_dev_bpeasy = (
sum((t - avg_time_bpeasy) ** 2 for t in times_bpeasy) / len(times_bpeasy)
) ** 0.5

print(f"huggingface {avg_time_huggingface} +/- {std_dev_huggingface}")
print(f"bpeasy {avg_time_bpeasy} +/- {std_dev_bpeasy}")
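
For reference, the bpeasy.train_bpe call that the script above times can be exercised on its own. The sketch below mirrors the positional arguments used in train_bpeasy (text iterator, pre-tokenization regex, max token length, vocab size); the tiny corpus, the simplified regex, and the sizes are illustrative placeholders rather than the values used in the benchmark.

    import bpeasy

    # Toy corpus standing in for the c4.jsonl data read by the benchmark.
    corpus = ["low lower lowest", "new newer newest", "wide wider widest"]

    # Same argument order as train_bpeasy() above: iterator, regex,
    # max token length, vocab size. The regex is a deliberately simple
    # stand-in for the GPT-style pattern used in the benchmark.
    vocab = bpeasy.train_bpe(
        iter(corpus),
        r"\S+|\s+",
        32,   # max_sentencepiece_length (placeholder)
        300,  # vocab_size (placeholder)
    )

Judging by the save_vocab_to_tiktoken signature in bpeasy/__init__.py below, the returned vocab is presumably a dict[bytes, int] mapping token bytes to ids.
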
1 change: 0 additions & 1 deletion bpeasy/__init__.py
@@ -6,7 +6,6 @@
]



def save_vocab_to_tiktoken(
vocab: dict[bytes, int],
out_path: str,
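
The save_vocab_to_tiktoken helper above takes a dict[bytes, int] vocabulary and an output path. A minimal usage sketch, assuming the parameters truncated from the diff (everything after out_path) have default values; the output file name is a hypothetical example:

    from bpeasy import save_vocab_to_tiktoken

    # Stand-in byte-level vocabulary; in practice this would be the
    # dict[bytes, int] produced by bpeasy.train_bpe.
    vocab = {bytes([i]): i for i in range(256)}

    # Only the two parameters visible in the diff are passed here.
    save_vocab_to_tiktoken(vocab, "vocab.tiktoken")
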
4 changes: 4 additions & 0 deletions pyproject.toml
@@ -12,5 +12,9 @@ classifiers = [
]
dynamic = ["version"]

[project.optional-dependencies]
dev = ["pytest"]
bench = ["tokenizers", "tqdm"]

[tool.maturin]
features = ["pyo3/extension-module"]
6 changes: 3 additions & 3 deletions src/lib.rs
@@ -143,10 +143,9 @@ fn pretokenize<'a>(text: &'a str, regex: &Regex) -> Vec<&'a str> {
}

fn pretokenize_strings(strings: Vec<&str>, pattern: &str) -> (Vec<Sentence>, Vec<u64>) {
let regex = Regex::new(pattern).expect("Invalid regex pattern");
let regex: Regex = Regex::new(pattern).expect("Invalid regex pattern");
let (tokens, counts): (Vec<&str>, Vec<u64>) = strings
.par_iter()
.filter(|text| !text.is_empty())
.flat_map(|&text| pretokenize(text, &regex))
.fold(
|| HashMap::new(),
Expand All @@ -166,7 +165,7 @@ fn pretokenize_strings(strings: Vec<&str>, pattern: &str) -> (Vec<Sentence>, Vec
)
.into_iter()
.unzip();

let sentences: Vec<Sentence> = tokens.into_iter().map(Sentence::from_str).collect();
(sentences, counts)
}
@@ -372,6 +371,7 @@ fn train_bpe(
.and_then(|py_string| py_string.to_str().ok())
})
})
.filter(|text| !text.is_empty())
.collect();

let (pretokenized_sentences, counts): (Vec<Sentence>, Vec<u64>) =
