add additional info, tests and function to train from tokenizer class
gautierdag committed Dec 15, 2023
1 parent ae3974c commit a63e0ab
Showing 4 changed files with 137 additions and 1 deletion.
27 changes: 26 additions & 1 deletion bpeasy/tokenizer.py
@@ -1,16 +1,21 @@
import json
import base64
from typing import Iterator

import tiktoken

from .bpeasy import train_bpe
from .convert import convert_tiktoken_to_huggingface


_DEFAULT_REGEX_PATTERN = r"""[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+"""


class BPEasyTokenizer:
def __init__(
self,
vocab: dict[bytes, int],
regex_pattern: str = r"""[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+""",
regex_pattern: str = _DEFAULT_REGEX_PATTERN,
special_tokens: list[str] = [],
fill_to_nearest_multiple_of_eight=False,
name="bpeasy",
@@ -94,3 +99,23 @@ def export_to_huggingface_format(self, out_path: str) -> None:

def __len__(self) -> int:
return len(self.vocab)

@classmethod
def train(
cls,
iterator: Iterator[str],
vocab_size: int = 32_000,
max_token_length=128,
regex_pattern: str = _DEFAULT_REGEX_PATTERN,
special_tokens: list[str] = [],
fill_to_nearest_multiple_of_eight=False,
name="bpeasy",
) -> "BPEasyTokenizer":
bytes_vocab = train_bpe(iterator, regex_pattern, max_token_length, vocab_size)
return cls(
name=name,
vocab=bytes_vocab,
regex_pattern=regex_pattern,
special_tokens=special_tokens,
fill_to_nearest_multiple_of_eight=fill_to_nearest_multiple_of_eight,
)
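
The new train classmethod builds a tokenizer directly from an iterator of training text: it calls the Rust train_bpe binding and feeds the resulting byte-level vocab into the constructor. A minimal usage sketch (the corpus file, helper name, and special tokens below are illustrative assumptions, not part of this commit):

from bpeasy.tokenizer import BPEasyTokenizer

def iter_corpus(path: str):
    # hypothetical helper: yields one line of training text at a time
    with open(path, "r", encoding="utf-8") as f:
        for line in f:
            yield line

tokenizer = BPEasyTokenizer.train(
    iter_corpus("corpus.txt"),        # assumed training corpus
    vocab_size=32_000,
    max_token_length=128,
    special_tokens=["<s>", "</s>"],   # illustrative special tokens
    name="my-bpe",
)
tokenizer.save("my-bpe.json")         # save() is exercised in the tests below
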
18 changes: 18 additions & 0 deletions pyproject.toml
@@ -9,8 +9,26 @@ classifiers = [
"Programming Language :: Rust",
"Programming Language :: Python :: Implementation :: CPython",
"Programming Language :: Python :: Implementation :: PyPy",
"Programming Language :: Python :: 3.8",
"Programming Language :: Python :: 3.9",
"Programming Language :: Python :: 3.10",
"Programming Language :: Python :: 3.11",
"Programming Language :: Python :: 3.12",
]
dynamic = ["version"]
description = "Fast bare-bones BPE for modern tokenizer training"
authors = [{name = "Gautier Dagan", email = "<[email protected]>"}]
license = "MIT"
readme = "README.md"
homepage = "https://github.com/gautierdag/bpeasy"
repository = "https://github.com/gautierdag/bpeasy"
include = [
"LICENSE",
]
keywords = ["tokenizer", "tokenization", "bpe"]
dependencies = [
"tiktoken>=0.4.0",
]

[project.optional-dependencies]
dev = ["pytest", "pytest-cov", "black", "tokenizers", "tqdm"]
11 changes: 11 additions & 0 deletions tests/test_convert.py
@@ -0,0 +1,11 @@
from bpeasy.convert import bpe


def test_bpe_function():
mergeable_ranks = {b"ab": 0, b"bc": 1, b"cd": 2}
token = b"abcd"
result = bpe(mergeable_ranks, token)
assert result == [
b"ab",
b"cd",
], "The bpe function did not split the token correctly"
82 changes: 82 additions & 0 deletions tests/test_tokenizer.py
@@ -0,0 +1,82 @@
import base64
import json
from unittest import mock
from bpeasy.tokenizer import BPEasyTokenizer


def test_initialization():
vocab = {b"hello": 1, b"world": 2}
tokenizer = BPEasyTokenizer(vocab=vocab)
assert tokenizer.vocab == vocab
assert tokenizer.name == "bpeasy"
assert len(tokenizer.special_tokens) == 0
assert len(tokenizer) == 2


def test_encode_decode():
vocab = {b"hello": 1, b" world": 2}
tokenizer = BPEasyTokenizer(vocab=vocab)
encoded = tokenizer.encode("hello world", allowed_special="all")
assert encoded == [1, 2]
decoded = tokenizer.decode(encoded)
assert decoded == "hello world"


def test_save_and_load():
vocab = {b"hello": 1, b" world": 2}
tokenizer = BPEasyTokenizer(vocab=vocab)

# Test saving
with mock.patch("builtins.open", mock.mock_open()) as mock_file:
tokenizer.save("dummy_path.json")
mock_file.assert_called_once_with("dummy_path.json", "w")

# Prepare dummy file content for loading
dummy_file_content = json.dumps(
{
"name": "bpeasy",
"vocab": {
base64.b64encode(key).decode("utf-8"): value
for key, value in vocab.items()
},
"regex_pattern": tokenizer.regex_pattern,
"special_tokens": tokenizer.special_tokens,
}
)

# Test loading
with mock.patch(
"builtins.open", mock.mock_open(read_data=dummy_file_content)
) as mock_file:
loaded_tokenizer = BPEasyTokenizer.from_file("dummy_path.json")
assert loaded_tokenizer.vocab == vocab


@mock.patch("builtins.open", new_callable=mock.mock_open)
@mock.patch("json.dump")
def test_conversion_to_huggingface(mock_json_dump, mock_open):
vocab = {
b"h": 0,
b"e": 1,
b"l": 2,
b"o": 3,
b" ": 4,
b"w": 5,
b"r": 6,
b"d": 7,
b"he": 8,
b"ll": 9,
b"llo": 10,
b"hello": 11,
b"wo": 12,
b"wor": 13,
b"ld": 14,
b"world": 15,
b" world": 16,
}
tokenizer = BPEasyTokenizer(vocab=vocab)
tokenizer.export_to_huggingface_format("dummy_path.json")
mock_open.assert_called_once_with("dummy_path.json", "w", encoding="utf-8")
mock_json_dump.assert_called_once()
args, _ = mock_json_dump.call_args
assert args[0]["model"]["type"] == "BPE"
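
The assertion on args[0]["model"]["type"] confirms that the exported JSON uses the Hugging Face tokenizer.json layout with a BPE model section. A hedged usage sketch, assuming the exported file can be loaded with the tokenizers package listed in the dev extras (the file path is illustrative):

from tokenizers import Tokenizer

hf_tokenizer = Tokenizer.from_file("tokenizer.json")  # path is an assumption for illustration
ids = hf_tokenizer.encode("hello world").ids
text = hf_tokenizer.decode(ids)
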
