From a63e0aba548d7b1575d2a3c2f1940202cd44c3c9 Mon Sep 17 00:00:00 2001
From: Gautier Dagan
Date: Fri, 15 Dec 2023 14:30:02 +0000
Subject: [PATCH] add additional info, tests and function to train from tokenizer class

---
 bpeasy/tokenizer.py     | 27 +++++++++++++-
 pyproject.toml          | 18 +++++++++
 tests/test_convert.py   | 11 ++++++
 tests/test_tokenizer.py | 82 +++++++++++++++++++++++++++++++++++++++++
 4 files changed, 137 insertions(+), 1 deletion(-)
 create mode 100644 tests/test_convert.py
 create mode 100644 tests/test_tokenizer.py

diff --git a/bpeasy/tokenizer.py b/bpeasy/tokenizer.py
index afeeed9..cdbd780 100644
--- a/bpeasy/tokenizer.py
+++ b/bpeasy/tokenizer.py
@@ -1,16 +1,21 @@
 import json
 import base64
+from typing import Iterator
 
 import tiktoken
 
+from .bpeasy import train_bpe
 from .convert import convert_tiktoken_to_huggingface
 
+_DEFAULT_REGEX_PATTERN = r"""[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+"""
+
+
 class BPEasyTokenizer:
     def __init__(
         self,
         vocab: dict[bytes, int],
-        regex_pattern: str = r"""[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+""",
+        regex_pattern: str = _DEFAULT_REGEX_PATTERN,
         special_tokens: list[str] = [],
         fill_to_nearest_multiple_of_eight=False,
         name="bpeasy",
@@ -94,3 +99,23 @@ def export_to_huggingface_format(self, out_path: str) -> None:
 
     def __len__(self) -> int:
         return len(self.vocab)
+
+    @classmethod
+    def train(
+        cls,
+        iterator: Iterator[str],
+        vocab_size: int = 32_000,
+        max_token_length=128,
+        regex_pattern: str = _DEFAULT_REGEX_PATTERN,
+        special_tokens: list[str] = [],
+        fill_to_nearest_multiple_of_eight=False,
+        name="bpeasy",
+    ) -> "BPEasyTokenizer":
+        bytes_vocab = train_bpe(iterator, regex_pattern, max_token_length, vocab_size)
+        return cls(
+            name=name,
+            vocab=bytes_vocab,
+            regex_pattern=regex_pattern,
+            special_tokens=special_tokens,
+            fill_to_nearest_multiple_of_eight=fill_to_nearest_multiple_of_eight,
+        )
diff --git a/pyproject.toml b/pyproject.toml
index 7c8a55f..a18e861 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -9,8 +9,26 @@ classifiers = [
     "Programming Language :: Rust",
     "Programming Language :: Python :: Implementation :: CPython",
     "Programming Language :: Python :: Implementation :: PyPy",
+    "Programming Language :: Python :: 3.8",
+    "Programming Language :: Python :: 3.9",
+    "Programming Language :: Python :: 3.10",
+    "Programming Language :: Python :: 3.11",
+    "Programming Language :: Python :: 3.12",
 ]
 dynamic = ["version"]
+description = "Fast bare-bones BPE for modern tokenizer training"
+authors = [{author = "Gautier Dagan", email = ""}]
+license = "MIT"
+readme = "README.md"
+homepage = "https://github.com/gautierdag/bpeasy"
+repository = "https://github.com/gautierdag/bpeasy"
+include = [
+    "LICENSE",
+]
+keywords = ["tokenizer", "tokenization", "bpe"]
+dependencies = [
+    "tiktoken>=0.4.0",
+]
 
 [project.optional-dependencies]
 dev = ["pytest", "pytest-cov", "black", "tokenizers", "tqdm"]
diff --git a/tests/test_convert.py b/tests/test_convert.py
new file mode 100644
index 0000000..80b87df
--- /dev/null
+++ b/tests/test_convert.py
@@ -0,0 +1,11 @@
+from bpeasy.convert import bpe
+
+
+def test_bpe_function():
+    mergeable_ranks = {b"ab": 0, b"bc": 1, b"cd": 2}
+    token = b"abcd"
+    result = bpe(mergeable_ranks, token)
+    assert result == [
+        b"ab",
+        b"cd",
+    ], "The bpe function did not split the token correctly"
diff --git a/tests/test_tokenizer.py b/tests/test_tokenizer.py
new file mode 100644
index 0000000..f65707a
--- /dev/null
+++ b/tests/test_tokenizer.py
@@ -0,0 +1,82 @@
+import base64
+import json
+from unittest import mock
+from bpeasy.tokenizer import BPEasyTokenizer
+
+
+def test_initialization():
+    vocab = {b"hello": 1, b"world": 2}
+    tokenizer = BPEasyTokenizer(vocab=vocab)
+    assert tokenizer.vocab == vocab
+    assert tokenizer.name == "bpeasy"
+    assert len(tokenizer.special_tokens) == 0
+    assert len(tokenizer) == 2
+
+
+def test_encode_decode():
+    vocab = {b"hello": 1, b" world": 2}
+    tokenizer = BPEasyTokenizer(vocab=vocab)
+    encoded = tokenizer.encode("hello world", allowed_special="all")
+    assert encoded == [1, 2]
+    decoded = tokenizer.decode(encoded)
+    assert decoded == "hello world"
+
+
+def test_save_and_load():
+    vocab = {b"hello": 1, b" world": 2}
+    tokenizer = BPEasyTokenizer(vocab=vocab)
+
+    # Test saving
+    with mock.patch("builtins.open", mock.mock_open()) as mock_file:
+        tokenizer.save("dummy_path.json")
+        mock_file.assert_called_once_with("dummy_path.json", "w")
+
+    # Prepare dummy file content for loading
+    dummy_file_content = json.dumps(
+        {
+            "name": "bpeasy",
+            "vocab": {
+                base64.b64encode(key).decode("utf-8"): value
+                for key, value in vocab.items()
+            },
+            "regex_pattern": tokenizer.regex_pattern,
+            "special_tokens": tokenizer.special_tokens,
+        }
+    )
+
+    # Test loading
+    with mock.patch(
+        "builtins.open", mock.mock_open(read_data=dummy_file_content)
+    ) as mock_file:
+        loaded_tokenizer = BPEasyTokenizer.from_file("dummy_path.json")
+        assert loaded_tokenizer.vocab == vocab
+
+
+@mock.patch("builtins.open", new_callable=mock.mock_open)
+@mock.patch("json.dump")
+def test_conversion_to_huggingface(mock_json_dump, mock_open):
+    vocab = {
+        b"h": 0,
+        b"e": 1,
+        b"l": 2,
+        b"o": 3,
+        b" ": 4,
+        b"w": 5,
+        b"r": 6,
+        b"d": 7,
+        b"he": 8,
+        b"ll": 9,
+        b"llo": 10,
+        b"hello": 11,
+        b"wo": 12,
+        b"wor": 13,
+        b"ld": 14,
+        b"world": 15,
+        b" world": 16,
+    }
+    tokenizer = BPEasyTokenizer(vocab=vocab)
+    tokenizer.export_to_huggingface_format("dummy_path.json")
+    mock_open.assert_called_once_with("dummy_path.json", "w", encoding="utf-8")
+    mock_json_dump.assert_called_once()
+    args, _ = mock_json_dump.call_args
+    assert args[0]["model"]["type"] == "BPE"
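
Usage sketch (not part of the patch): the snippet below shows one way the new
BPEasyTokenizer.train classmethod added above could be called, assuming the
patched bpeasy package is installed with its Rust train_bpe extension built.
The corpus, parameters, and output path are illustrative placeholders.

    # Minimal sketch; corpus, vocab_size, and file name are hypothetical.
    from bpeasy.tokenizer import BPEasyTokenizer

    corpus = iter(
        [
            "hello world",
            "hello there, world",
            "worlds collide",
        ]
    )

    # Train a deliberately tiny vocabulary from the iterator of strings.
    tokenizer = BPEasyTokenizer.train(
        corpus,
        vocab_size=300,
        max_token_length=32,
        name="demo",
    )

    # Round-trip a string, mirroring test_encode_decode above.
    ids = tokenizer.encode("hello world", allowed_special="all")
    assert tokenizer.decode(ids) == "hello world"

    # Export in the Hugging Face tokenizers JSON layout, the path exercised
    # by test_conversion_to_huggingface above.
    tokenizer.export_to_huggingface_format("demo_tokenizer.json")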