From a63e0aba548d7b1575d2a3c2f1940202cd44c3c9 Mon Sep 17 00:00:00 2001
From: Gautier Dagan
Date: Fri, 15 Dec 2023 14:30:02 +0000
Subject: [PATCH] add additional info, tests and function to train from tokenizer class

---
 bpeasy/tokenizer.py     | 27 +++++++++++++-
 pyproject.toml          | 18 +++++++++
 tests/test_convert.py   | 11 ++++++
 tests/test_tokenizer.py | 82 +++++++++++++++++++++++++++++++++++++++++
 4 files changed, 137 insertions(+), 1 deletion(-)
 create mode 100644 tests/test_convert.py
 create mode 100644 tests/test_tokenizer.py

diff --git a/bpeasy/tokenizer.py b/bpeasy/tokenizer.py
index afeeed9..cdbd780 100644
--- a/bpeasy/tokenizer.py
+++ b/bpeasy/tokenizer.py
@@ -1,16 +1,21 @@
 import json
 import base64
+from typing import Iterator
 
 import tiktoken
 
+from .bpeasy import train_bpe
 from .convert import convert_tiktoken_to_huggingface
 
+_DEFAULT_REGEX_PATTERN = r"""[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+"""
+
+
 class BPEasyTokenizer:
     def __init__(
         self,
         vocab: dict[bytes, int],
-        regex_pattern: str = r"""[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+""",
+        regex_pattern: str = _DEFAULT_REGEX_PATTERN,
         special_tokens: list[str] = [],
         fill_to_nearest_multiple_of_eight=False,
         name="bpeasy",
@@ -94,3 +99,23 @@ def export_to_huggingface_format(self, out_path: str) -> None:
 
     def __len__(self) -> int:
         return len(self.vocab)
+
+    @classmethod
+    def train(
+        cls,
+        iterator: Iterator[str],
+        vocab_size: int = 32_000,
+        max_token_length=128,
+        regex_pattern: str = _DEFAULT_REGEX_PATTERN,
+        special_tokens: list[str] = [],
+        fill_to_nearest_multiple_of_eight=False,
+        name="bpeasy",
+    ) -> "BPEasyTokenizer":
+        bytes_vocab = train_bpe(iterator, regex_pattern, max_token_length, vocab_size)
+        return cls(
+            name=name,
+            vocab=bytes_vocab,
+            regex_pattern=regex_pattern,
+            special_tokens=special_tokens,
+            fill_to_nearest_multiple_of_eight=fill_to_nearest_multiple_of_eight,
+        )
diff --git a/pyproject.toml b/pyproject.toml
index 7c8a55f..a18e861 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -9,8 +9,26 @@ classifiers = [
     "Programming Language :: Rust",
     "Programming Language :: Python :: Implementation :: CPython",
     "Programming Language :: Python :: Implementation :: PyPy",
+    "Programming Language :: Python :: 3.8",
+    "Programming Language :: Python :: 3.9",
+    "Programming Language :: Python :: 3.10",
+    "Programming Language :: Python :: 3.11",
+    "Programming Language :: Python :: 3.12",
 ]
 dynamic = ["version"]
+description = "Fast bare-bones BPE for modern tokenizer training"
+authors = [{author = "Gautier Dagan", email = ""}]
+license = "MIT"
+readme = "README.md"
+homepage = "https://github.com/gautierdag/bpeasy"
+repository = "https://github.com/gautierdag/bpeasy"
+include = [
+    "LICENSE",
+]
+keywords = ["tokenizer", "tokenization", "bpe"]
+dependencies = [
+    "tiktoken>=0.4.0",
+]
 
 [project.optional-dependencies]
 dev = ["pytest", "pytest-cov", "black", "tokenizers", "tqdm"]
diff --git a/tests/test_convert.py b/tests/test_convert.py
new file mode 100644
index 0000000..80b87df
--- /dev/null
+++ b/tests/test_convert.py
@@ -0,0 +1,11 @@
+from bpeasy.convert import bpe
+
+
+def test_bpe_function():
+    mergeable_ranks = {b"ab": 0, b"bc": 1, b"cd": 2}
+    token = b"abcd"
+    result = bpe(mergeable_ranks, token)
+    assert result == [
+        b"ab",
+        b"cd",
+    ], "The bpe function did not split the token correctly"
diff --git a/tests/test_tokenizer.py b/tests/test_tokenizer.py
new file mode 100644
index 0000000..f65707a
--- /dev/null
+++ b/tests/test_tokenizer.py
@@ -0,0 +1,82 @@
+import base64
+import json
+from unittest import mock
+from bpeasy.tokenizer import BPEasyTokenizer
+
+
+def test_initialization():
+    vocab = {b"hello": 1, b"world": 2}
+    tokenizer = BPEasyTokenizer(vocab=vocab)
+    assert tokenizer.vocab == vocab
+    assert tokenizer.name == "bpeasy"
+    assert len(tokenizer.special_tokens) == 0
+    assert len(tokenizer) == 2
+
+
+def test_encode_decode():
+    vocab = {b"hello": 1, b" world": 2}
+    tokenizer = BPEasyTokenizer(vocab=vocab)
+    encoded = tokenizer.encode("hello world", allowed_special="all")
+    assert encoded == [1, 2]
+    decoded = tokenizer.decode(encoded)
+    assert decoded == "hello world"
+
+
+def test_save_and_load():
+    vocab = {b"hello": 1, b" world": 2}
+    tokenizer = BPEasyTokenizer(vocab=vocab)
+
+    # Test saving
+    with mock.patch("builtins.open", mock.mock_open()) as mock_file:
+        tokenizer.save("dummy_path.json")
+        mock_file.assert_called_once_with("dummy_path.json", "w")
+
+    # Prepare dummy file content for loading
+    dummy_file_content = json.dumps(
+        {
+            "name": "bpeasy",
+            "vocab": {
+                base64.b64encode(key).decode("utf-8"): value
+                for key, value in vocab.items()
+            },
+            "regex_pattern": tokenizer.regex_pattern,
+            "special_tokens": tokenizer.special_tokens,
+        }
+    )
+
+    # Test loading
+    with mock.patch(
+        "builtins.open", mock.mock_open(read_data=dummy_file_content)
+    ) as mock_file:
+        loaded_tokenizer = BPEasyTokenizer.from_file("dummy_path.json")
+        assert loaded_tokenizer.vocab == vocab
+
+
+@mock.patch("builtins.open", new_callable=mock.mock_open)
+@mock.patch("json.dump")
+def test_conversion_to_huggingface(mock_json_dump, mock_open):
+    vocab = {
+        b"h": 0,
+        b"e": 1,
+        b"l": 2,
+        b"o": 3,
+        b" ": 4,
+        b"w": 5,
+        b"r": 6,
+        b"d": 7,
+        b"he": 8,
+        b"ll": 9,
+        b"llo": 10,
+        b"hello": 11,
+        b"wo": 12,
+        b"wor": 13,
+        b"ld": 14,
+        b"world": 15,
+        b" world": 16,
+    }
+    tokenizer = BPEasyTokenizer(vocab=vocab)
+    tokenizer.export_to_huggingface_format("dummy_path.json")
+    mock_open.assert_called_once_with("dummy_path.json", "w", encoding="utf-8")
+    mock_json_dump.assert_called_once()
+    args, _ = mock_json_dump.call_args
+    assert args[0]["model"]["type"] == "BPE"
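
Usage sketch (not part of the patch): the snippet below shows one way the new
BPEasyTokenizer.train classmethod added above could be called, assuming the
patched bpeasy package is installed with its Rust train_bpe extension built.
The corpus, parameters, and output path are illustrative placeholders.

    # Minimal sketch; corpus, vocab_size, and file name are hypothetical.
    from bpeasy.tokenizer import BPEasyTokenizer

    corpus = iter(
        [
            "hello world",
            "hello there, world",
            "worlds collide",
        ]
    )

    # Train a deliberately tiny vocabulary from the iterator of strings.
    tokenizer = BPEasyTokenizer.train(
        corpus,
        vocab_size=300,
        max_token_length=32,
        name="demo",
    )

    # Round-trip a string, mirroring test_encode_decode above.
    ids = tokenizer.encode("hello world", allowed_special="all")
    assert tokenizer.decode(ids) == "hello world"

    # Export in the Hugging Face tokenizers JSON layout, the path exercised
    # by test_conversion_to_huggingface above.
    tokenizer.export_to_huggingface_format("demo_tokenizer.json")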