Commit

add benchmark
gautierdag committed Dec 13, 2023
1 parent f909b74 commit 96bc4b3
Showing 6 changed files with 1,069 additions and 662 deletions.
2 changes: 2 additions & 0 deletions .gitignore
@@ -1,4 +1,6 @@
out/
.nox/
.benchmarks/

# Generated by Cargo
# will have compiled files and executables
1,600 changes: 1,000 additions & 600 deletions benchmarks/data/c4.jsonl

Large diffs are not rendered by default.

118 changes: 60 additions & 58 deletions benchmarks/train.py
@@ -1,36 +1,46 @@
import dataclasses
import glob
import json
import pytest
import logging
import os
import sys
import itertools
import glob
import dataclasses
import time
from contextlib import contextmanager
from pathlib import Path

import tokenizers
from tokenizers import Regex, Tokenizer, decoders, pre_tokenizers
from tokenizers.models import BPE
from tokenizers.trainers import BpeTrainer
from tqdm import tqdm

import bpeasy

logging.basicConfig(stream=sys.stdout, level=logging.DEBUG)


@contextmanager
def suppress_stdout():
with open(os.devnull, "w") as devnull:
old_stdout = sys.stdout
sys.stdout = devnull
try:
yield
finally:
sys.stdout = old_stdout


@dataclasses.dataclass
class TrainBPETokenizerArgs:
datasets: str = "./benchmarks/data"
dataset: str = "./benchmarks/data"

num_characters: int = 1000
vocab_size: int = 1024
max_sentencepiece_length: int = 32
vocab_size: int = 32_000
max_sentencepiece_length: int = 64
normalization_rule_name: str = "gpt"

def __post_init__(self):
datasets = self.datasets.split(",")
for ckpt in datasets:
checkpoint_dir = Path(ckpt)
assert checkpoint_dir.is_dir(), checkpoint_dir
checkpoint_dir = Path(self.dataset)
assert checkpoint_dir.is_dir(), checkpoint_dir

assert self.normalization_rule_name in [
"gpt",
@@ -56,27 +66,23 @@ def get_content_key(path: str) -> str:


def jsonl_content_iterator(
file_path: str,
character_limit=2_000_000,
args: TrainBPETokenizerArgs,
):
"""
Iterates over a jsonl file and yields the content of each line
Tracks the number of characters yielded and stops when the limit is reached
This is ripe for optimisation if you want to mess with more fine-grained
character limits (eg. more Python than Java)
"""
logging.info(f"Creating iterator for {character_limit} characters in {file_path}")
file_path = args.dataset
chunk_num, character_count = 0, 0
chunks = glob.glob(f"{file_path}/*.jsonl")
logging.info(f"Found {len(chunks)} chunks")

while character_count < character_limit and chunk_num < len(chunks):
while chunk_num < len(chunks):
file_name = chunks[chunk_num]
content_key = get_content_key(file_name)
with open(file_name, "r", encoding="utf-8") as f:
for line in f:
if character_count >= character_limit: # stop after limit
break
try:
obj = json.loads(line)
text = obj[content_key]
@@ -88,28 +94,6 @@ def jsonl_content_iterator(
chunk_num += 1


def mix_jsonl_content_iterator(args: TrainBPETokenizerArgs):
datasets = []
num_datasets = len(args.datasets.split(","))
for dataset in args.datasets.split(","):
datasets.append((dataset, args.code_percentage / num_datasets))

# Create iterators
iterators = []
total_weight = sum([t[1] for t in datasets])
for file_path, percentage in datasets:
effective_limit = int((percentage / total_weight) * args.num_characters)
assert effective_limit > 0
it = jsonl_content_iterator(
file_path,
effective_limit,
)
iterators.append(it)

# Chain iterators together
return itertools.chain(*iterators)


def get_regex_from_normalization_rule_name(normalization_rule_name: str) -> str:
# GPT4 regex
if normalization_rule_name == "gpt":
@@ -127,20 +111,15 @@ def get_regex_from_normalization_rule_name(normalization_rule_name: str) -> str:
raise ValueError(f"Unknown normalization_rule_name {normalization_rule_name}")


@pytest.fixture(scope="session")
def args() -> str:
return TrainBPETokenizerArgs()


def test_train_huggingface(benchmark, args: TrainBPETokenizerArgs):
def train_huggingface(args: TrainBPETokenizerArgs):
# should be at least 0.14.0 to train with char limit
assert tokenizers.__version__ >= "0.14.0"
tokenizer = Tokenizer(BPE(byte_fallback=True))
trainer = BpeTrainer(
vocab_size=args.vocab_size,
show_progress=True,
special_tokens=[f"<0x{i:02X}>" for i in range(256)], # seed sm vocab
max_token_length=args.max_sentencepiece_length,
show_progress=False,
)
regex_expression = get_regex_from_normalization_rule_name(
args.normalization_rule_name
@@ -160,24 +139,47 @@ def test_train_huggingface(benchmark, args: TrainBPETokenizerArgs):
tokenizer.decoder = decoders.Sequence(
[decoders.ByteLevel(), decoders.ByteFallback()]
)
iterator = mix_jsonl_content_iterator(args)
iterator = jsonl_content_iterator(args)
# training the tokenizer
benchmark(
tokenizer.train_from_iterator,
iterator,
trainer,
)
with suppress_stdout():
tokenizer.train_from_iterator(iterator, trainer)


def test_train_bpeasy(benchmark, args: TrainBPETokenizerArgs):
def train_bpeasy(args: TrainBPETokenizerArgs):
# Use ByteLevel Decoder
iterator = mix_jsonl_content_iterator(args)
iterator = jsonl_content_iterator(args)
# training the tokenizer
regex = get_regex_from_normalization_rule_name(args.normalization_rule_name)
benchmark(
bpeasy.train_bpe,

bpeasy.train_bpe(
iterator,
regex,
args.max_sentencepiece_length,
args.vocab_size,
)


if __name__ == "__main__":
NUM_ITERATIONS = 100
args = TrainBPETokenizerArgs()

times_huggingface = []
times_bpeasy = []
for i in tqdm(range(NUM_ITERATIONS)):
time_now = time.time()
train_huggingface(args)
times_huggingface.append(time.time() - time_now)

time_now = time.time()
train_bpeasy(args)
times_bpeasy.append(time.time() - time_now)

avg_time_huggingface = sum(times_huggingface) / len(times_huggingface)
avg_time_bpeasy = sum(times_bpeasy) / len(times_bpeasy)
# standard deviation of the measured wall-clock times
std_dev_huggingface = (
sum((t - avg_time_huggingface) ** 2 for t in times_huggingface) / len(times_huggingface)
) ** 0.5
std_dev_bpeasy = (
sum((t - avg_time_bpeasy) ** 2 for t in times_bpeasy) / len(times_bpeasy)
) ** 0.5

print(f"huggingface {avg_time_huggingface} +/- {std_dev_huggingface}")
print(f"bpeasy {avg_time_bpeasy} +/- {std_dev_bpeasy}")
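
For reference, the bpeasy.train_bpe call that the script above times can be exercised on its own. The sketch below mirrors the positional arguments used in train_bpeasy (text iterator, pre-tokenization regex, max token length, vocab size); the tiny corpus, the simplified regex, and the sizes are illustrative placeholders rather than the values used in the benchmark.

    import bpeasy

    # Toy corpus standing in for the c4.jsonl data read by the benchmark.
    corpus = ["low lower lowest", "new newer newest", "wide wider widest"]

    # Same argument order as train_bpeasy() above: iterator, regex,
    # max token length, vocab size. The regex is a deliberately simple
    # stand-in for the GPT-style pattern used in the benchmark.
    vocab = bpeasy.train_bpe(
        iter(corpus),
        r"\S+|\s+",
        32,   # max_sentencepiece_length (placeholder)
        300,  # vocab_size (placeholder)
    )

Judging by the save_vocab_to_tiktoken signature in bpeasy/__init__.py below, the returned vocab is presumably a dict[bytes, int] mapping token bytes to ids.
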
1 change: 0 additions & 1 deletion bpeasy/__init__.py
@@ -6,7 +6,6 @@
]



def save_vocab_to_tiktoken(
vocab: dict[bytes, int],
out_path: str,
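
The save_vocab_to_tiktoken helper above takes a dict[bytes, int] vocabulary and an output path. A minimal usage sketch, assuming the parameters truncated from the diff (everything after out_path) have default values; the output file name is a hypothetical example:

    from bpeasy import save_vocab_to_tiktoken

    # Stand-in byte-level vocabulary; in practice this would be the
    # dict[bytes, int] produced by bpeasy.train_bpe.
    vocab = {bytes([i]): i for i in range(256)}

    # Only the two parameters visible in the diff are passed here.
    save_vocab_to_tiktoken(vocab, "vocab.tiktoken")
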
4 changes: 4 additions & 0 deletions pyproject.toml
@@ -12,5 +12,9 @@ classifiers = [
]
dynamic = ["version"]

[project.optional-dependencies]
dev = ["pytest"]
bench = ["tokenizers", "tqdm"]

[tool.maturin]
features = ["pyo3/extension-module"]
6 changes: 3 additions & 3 deletions src/lib.rs
@@ -143,10 +143,9 @@ fn pretokenize<'a>(text: &'a str, regex: &Regex) -> Vec<&'a str> {
}

fn pretokenize_strings(strings: Vec<&str>, pattern: &str) -> (Vec<Sentence>, Vec<u64>) {
let regex = Regex::new(pattern).expect("Invalid regex pattern");
let regex: Regex = Regex::new(pattern).expect("Invalid regex pattern");
let (tokens, counts): (Vec<&str>, Vec<u64>) = strings
.par_iter()
.filter(|text| !text.is_empty())
.flat_map(|&text| pretokenize(text, &regex))
.fold(
|| HashMap::new(),
Expand All @@ -166,7 +165,7 @@ fn pretokenize_strings(strings: Vec<&str>, pattern: &str) -> (Vec<Sentence>, Vec
)
.into_iter()
.unzip();

let sentences: Vec<Sentence> = tokens.into_iter().map(Sentence::from_str).collect();
(sentences, counts)
}
@@ -372,6 +371,7 @@ fn train_bpe(
.and_then(|py_string| py_string.to_str().ok())
})
})
.filter(|text| !text.is_empty())
.collect();

let (pretokenized_sentences, counts): (Vec<Sentence>, Vec<u64>) =
