Commit
add additional info, tests and function to train from tokenizer class
1 parent ae3974c, commit a63e0ab
Showing 4 changed files with 137 additions and 1 deletion.
pyproject.toml
@@ -9,8 +9,26 @@ classifiers = [
"Programming Language :: Rust", | ||
"Programming Language :: Python :: Implementation :: CPython", | ||
"Programming Language :: Python :: Implementation :: PyPy", | ||
"Programming Language :: Python :: 3.8", | ||
"Programming Language :: Python :: 3.9", | ||
"Programming Language :: Python :: 3.10", | ||
"Programming Language :: Python :: 3.11", | ||
"Programming Language :: Python :: 3.12", | ||
] | ||
dynamic = ["version"] | ||
description = "Fast bare-bones BPE for modern tokenizer training" | ||
authors = [{name = "Gautier Dagan", email = "<[email protected]>"}]
license = "MIT" | ||
readme = "README.md" | ||
homepage = "https://github.com/gautierdag/bpeasy" | ||
repository = "https://github.com/gautierdag/bpeasy" | ||
include = [ | ||
"LICENSE", | ||
] | ||
keywords = ["tokenizer", "tokenization", "bpe"] | ||
dependencies = [ | ||
"tiktoken>=0.4.0", | ||
] | ||
|
||
[project.optional-dependencies] | ||
dev = ["pytest", "pytest-cov", "black", "tokenizers", "tqdm"] | ||
|
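The [project.optional-dependencies] table keeps the test and tooling stack out of the default install. Assuming a standard pip workflow (not spelled out in this commit), the extras would be pulled in with "pip install bpeasy[dev]", or "pip install -e .[dev]" from a checkout, alongside the runtime tiktoken dependency.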
@@ -0,0 +1,11 @@
from bpeasy.convert import bpe


def test_bpe_function():
    mergeable_ranks = {b"ab": 0, b"bc": 1, b"cd": 2}
    token = b"abcd"
    result = bpe(mergeable_ranks, token)
    assert result == [
        b"ab",
        b"cd",
    ], "The bpe function did not split the token correctly"
@@ -0,0 +1,82 @@
import base64
import json
from unittest import mock
from bpeasy.tokenizer import BPEasyTokenizer


def test_initialization():
    vocab = {b"hello": 1, b"world": 2}
    tokenizer = BPEasyTokenizer(vocab=vocab)
    assert tokenizer.vocab == vocab
    assert tokenizer.name == "bpeasy"
    assert len(tokenizer.special_tokens) == 0
    assert len(tokenizer) == 2


def test_encode_decode():
    vocab = {b"hello": 1, b" world": 2}
    tokenizer = BPEasyTokenizer(vocab=vocab)
    encoded = tokenizer.encode("hello world", allowed_special="all")
    assert encoded == [1, 2]
    decoded = tokenizer.decode(encoded)
    assert decoded == "hello world"
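A side note on the API asserted here: allowed_special="all" is tiktoken's flag for permitting special-token text, which suggests the wrapper delegates to a tiktoken Encoding. Purely as a comparison sketch (an assumption about the internals, not bpeasy's actual code), the same two-entry vocab behaves identically when handed to tiktoken directly:

import tiktoken

# The GPT-2 style split pattern is illustrative only; bpeasy's default
# regex_pattern may differ.
enc = tiktoken.Encoding(
    name="demo",
    pat_str=r"'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+",
    mergeable_ranks={b"hello": 1, b" world": 2},
    special_tokens={},
)
assert enc.encode("hello world") == [1, 2]
assert enc.decode([1, 2]) == "hello world"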
def test_save_and_load():
    vocab = {b"hello": 1, b" world": 2}
    tokenizer = BPEasyTokenizer(vocab=vocab)

    # Test saving
    with mock.patch("builtins.open", mock.mock_open()) as mock_file:
        tokenizer.save("dummy_path.json")
        mock_file.assert_called_once_with("dummy_path.json", "w")

    # Prepare dummy file content for loading
    dummy_file_content = json.dumps(
        {
            "name": "bpeasy",
            "vocab": {
                base64.b64encode(key).decode("utf-8"): value
                for key, value in vocab.items()
            },
            "regex_pattern": tokenizer.regex_pattern,
            "special_tokens": tokenizer.special_tokens,
        }
    )

    # Test loading
    with mock.patch(
        "builtins.open", mock.mock_open(read_data=dummy_file_content)
    ) as mock_file:
        loaded_tokenizer = BPEasyTokenizer.from_file("dummy_path.json")
        assert loaded_tokenizer.vocab == vocab
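The dummy file content above pins down the on-disk format: vocab keys are raw bytes, so they are base64-encoded to survive JSON, stored alongside the tokenizer's name, regex_pattern, and special_tokens. A minimal round-trip sketch of just the vocab handling (the real save and from_file also carry those other fields):

import base64
import json

def encode_vocab(vocab):
    # JSON object keys must be strings, so byte keys are base64-encoded.
    return {base64.b64encode(k).decode("utf-8"): v for k, v in vocab.items()}

def decode_vocab(raw):
    # Invert the encoding applied on save.
    return {base64.b64decode(k): v for k, v in raw.items()}

vocab = {b"hello": 1, b" world": 2}
assert decode_vocab(json.loads(json.dumps(encode_vocab(vocab)))) == vocab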
@mock.patch("builtins.open", new_callable=mock.mock_open)
@mock.patch("json.dump")
def test_conversion_to_huggingface(mock_json_dump, mock_open):
    vocab = {
        b"h": 0,
        b"e": 1,
        b"l": 2,
        b"o": 3,
        b" ": 4,
        b"w": 5,
        b"r": 6,
        b"d": 7,
        b"he": 8,
        b"ll": 9,
        b"llo": 10,
        b"hello": 11,
        b"wo": 12,
        b"wor": 13,
        b"ld": 14,
        b"world": 15,
        b" world": 16,
    }
    tokenizer = BPEasyTokenizer(vocab=vocab)
    tokenizer.export_to_huggingface_format("dummy_path.json")
    mock_open.assert_called_once_with("dummy_path.json", "w", encoding="utf-8")
    mock_json_dump.assert_called_once()
    args, _ = mock_json_dump.call_args
    assert args[0]["model"]["type"] == "BPE"
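The assertion on args[0]["model"]["type"] shows the export emits a tokenizer.json-style payload with a BPE model section. Assuming the file is a standard Hugging Face tokenizer.json (plausible given the tokenizers dev dependency added in pyproject.toml, though this commit does not show the export code itself), it could be loaded back like so:

from tokenizers import Tokenizer

# "dummy_path.json" is the path passed to export_to_huggingface_format above.
hf_tokenizer = Tokenizer.from_file("dummy_path.json")
print(hf_tokenizer.encode("hello world").ids)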