Commit

fix bug in lib and expand benchmarks
gautierdag committed Dec 14, 2023
1 parent 639b996 commit ec1a20a
Showing 7 changed files with 376 additions and 396 deletions.
2 changes: 1 addition & 1 deletion README.md
@@ -2,7 +2,7 @@

[![codecov](https://codecov.io/gh/gautierdag/bpeasy/branch/main/graph/badge.svg?token=NWHDJ22L8I)](https://codecov.io/gh/gautierdag/bpeasy)

-[![tests](https://github.com/gautierdag/bpeasy/actions/workflows/test.yml/badge.svg)](https://github.com/gautierdag/bpeasy/actions/workflows/test.yml)
+[![tests](https://github.com/gautierdag/bpeasy/actions/workflows/CI.yml/badge.svg)](https://github.com/gautierdag/bpeasy/actions/workflows/CI.yml)
[![image](https://img.shields.io/pypi/v/bpeasy.svg)](https://pypi.python.org/pypi/bpeasy)
[![image](https://img.shields.io/pypi/l/bpeasy.svg)](https://pypi.python.org/pypi/bpeasy)
[![image](https://img.shields.io/pypi/pyversions/bpeasy.svg)](https://pypi.python.org/pypi/bpeasy)
344 changes: 344 additions & 0 deletions benchmarks/notebook.ipynb
@@ -0,0 +1,344 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[('<0xFF>', 255), ('!', 256), ('\"', 257), ('#', 258), ('$', 259), ('%', 260), ('&', 261), (\"'\", 262), ('(', 263), (')', 264), ('*', 265), ('+', 266), (',', 267), ('-', 268), ('.', 269), ('/', 270), ('0', 271), ('1', 272), ('2', 273), ('3', 274), ('4', 275), ('5', 276), ('6', 277), ('7', 278), ('8', 279), ('9', 280), (':', 281), (';', 282), ('<', 283), ('=', 284), ('>', 285), ('?', 286), ('@', 287), ('A', 288), ('B', 289), ('C', 290), ('D', 291), ('E', 292), ('F', 293), ('G', 294), ('H', 295), ('I', 296), ('J', 297), ('K', 298), ('L', 299), ('M', 300), ('N', 301), ('O', 302), ('P', 303), ('Q', 304), ('R', 305), ('S', 306), ('T', 307), ('U', 308), ('V', 309), ('W', 310), ('X', 311), ('Y', 312), ('Z', 313), ('[', 314), ('\\\\', 315), (']', 316), ('_', 317), ('`', 318), ('a', 319), ('b', 320), ('c', 321), ('d', 322), ('e', 323), ('f', 324), ('g', 325), ('h', 326), ('i', 327), ('j', 328), ('k', 329), ('l', 330), ('m', 331), ('n', 332), ('o', 333), ('p', 334), ('q', 335), ('r', 336), ('s', 337), ('t', 338), ('u', 339), ('v', 340), ('w', 341), ('x', 342), ('y', 343), ('z', 344), ('|', 345), ('~', 346), ('¡', 347), ('¢', 348), ('£', 349), ('¤', 350), ('¥', 351), ('¦', 352), ('§', 353), ('¨', 354), ('©', 355), ('ª', 356), ('«', 357), ('¬', 358), ('®', 359), ('¯', 360), ('°', 361), ('±', 362), ('²', 363), ('³', 364), ('´', 365), ('µ', 366), ('¶', 367), ('·', 368), ('¸', 369), ('¹', 370), ('º', 371), ('»', 372), ('¼', 373), ('½', 374), ('¾', 375), ('¿', 376), ('Â', 377), ('Ã', 378), ('Ä', 379), ('Å', 380), ('É', 381), ('Ë', 382), ('Ì', 383), ('Í', 384), ('Î', 385), ('Ï', 386), ('Ð', 387), ('Ñ', 388), ('Ö', 389), ('×', 390), ('à', 391), ('â', 392), ('ê', 393), ('ë', 394), ('ì', 395), ('í', 396), ('ï', 397), ('ð', 398), ('ă', 399), ('Ĉ', 400), ('ĉ', 401), ('Ċ', 402), ('Ġ', 403), ('Ģ', 404), ('ģ', 405), ('Ĥ', 406), ('ĥ', 407), ('Ħ', 408), ('ħ', 409), ('Ĩ', 410), ('ĩ', 411), ('Ī', 412), ('ī', 413), ('Ĭ', 414), ('ĭ', 415), ('Į', 416), ('į', 417), ('ı', 418), ('IJ', 419), ('ij', 420), ('Ĵ', 421), ('ĵ', 422), ('Ķ', 423), ('ķ', 424), ('ĸ', 425), ('Ĺ', 426), ('ĺ', 427), ('Ļ', 428), ('ļ', 429), ('Ľ', 430), ('ľ', 431), ('Ŀ', 432), ('ŀ', 433), ('Ł', 434), ('ł', 435), ('Ń', 436), ('Ġt', 437), ('Ġa', 438), ('in', 439), ('he', 440), ('re', 441), ('on', 442), ('er', 443), ('Ġthe', 444), ('Ġs', 445), ('Ġw', 446), ('at', 447), ('Ġo', 448), ('Ġc', 449), ('nd', 450), ('it', 451), ('ou', 452), ('or', 453), ('es', 454), ('is', 455), ('Ġf', 456), ('en', 457), ('Ġp', 458), ('ing', 459), ('Ġb', 460), ('an', 461), ('al', 462), ('Ġto', 463), ('Ġm', 464), ('ed', 465), ('ar', 466), ('Ġin', 467), ('Ġand', 468), ('Ġof', 469), ('Ġd', 470), ('ic', 471), ('le', 472), ('om', 473), ('Ġh', 474), ('ion', 475), ('as', 476), ('Ġth', 477), ('ll', 478), ('.Ċ', 479), ('Ġy', 480), ('ent', 481), ('Ġl', 482), ('ve', 483), ('ro', 484), ('Ġe', 485), ('Ġre', 486), ('Ġn', 487), ('st', 488), ('Ġyou', 489), ('Ġg', 490), ('ct', 491), ('et', 492), ('âĢ', 493), ('ly', 494), ('Ġis', 495), ('ĠI', 496), ('Ġfor', 497), ('Ġbe', 498), ('id', 499)]\n"
]
}
],
"source": [
"import dataclasses\n",
"import glob\n",
"import json\n",
"import logging\n",
"import sys\n",
"import time\n",
"from pathlib import Path\n",
"\n",
"import tokenizers\n",
"from tokenizers import Regex, Tokenizer, decoders, pre_tokenizers\n",
"from tokenizers.models import BPE\n",
"from tokenizers.trainers import BpeTrainer\n",
"from tqdm import tqdm\n",
"\n",
"import bpeasy\n",
"from bpeasy.tokenizer import BPEasyTokenizer\n",
"\n",
"logging.basicConfig(stream=sys.stdout, level=logging.DEBUG)\n",
"\n",
"\n",
"@dataclasses.dataclass\n",
"class TrainBPETokenizerArgs:\n",
" dataset: str = \"./data\"\n",
" vocab_size: int = 500\n",
" max_sentencepiece_length: int = 64\n",
" regex_pattern: str = r\"\"\"(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+\"\"\"\n",
"\n",
" def __post_init__(self):\n",
" checkpoint_dir = Path(self.dataset)\n",
" assert checkpoint_dir.is_dir(), checkpoint_dir\n",
"\n",
"\n",
"def jsonl_content_iterator(\n",
" args: TrainBPETokenizerArgs,\n",
"):\n",
" \"\"\"\n",
" Iterates over a jsonl file and yields the content of each line\n",
" Tracks the number of characters yielded and stops when the limit is reached\n",
" This is ripe for optimisation if you want to mess with more fine-grained\n",
" character limits (eg. more Python than Java)\n",
" \"\"\"\n",
" file_path = args.dataset\n",
" chunk_num, character_count = 0, 0\n",
" chunks = glob.glob(f\"{file_path}/*.jsonl\")\n",
"\n",
" while chunk_num < len(chunks):\n",
" file_name = chunks[chunk_num]\n",
" with open(file_name, \"r\", encoding=\"utf-8\") as f:\n",
" for line in f:\n",
" obj = json.loads(line)\n",
" text = obj[\"text\"]\n",
" text_character_count = len(text)\n",
" character_count += text_character_count\n",
" yield text\n",
" chunk_num += 1\n",
"\n",
"\n",
"def train_huggingface(args: TrainBPETokenizerArgs):\n",
" # should be at least 0.14.0 to train with char limit\n",
" assert tokenizers.__version__ >= \"0.14.0\"\n",
" tokenizer = Tokenizer(BPE(byte_fallback=True))\n",
" trainer = BpeTrainer(\n",
" vocab_size=args.vocab_size,\n",
" special_tokens=[f\"<0x{i:02X}>\" for i in range(256)], # seed sm vocab\n",
" max_token_length=args.max_sentencepiece_length,\n",
" show_progress=False,\n",
" )\n",
" gpt_regex = Regex(args.regex_pattern)\n",
"\n",
" split_pre_tokenizer = pre_tokenizers.Split(\n",
" gpt_regex, behavior=\"isolated\", invert=False\n",
" )\n",
" byte_pre_tokenizer = pre_tokenizers.ByteLevel(\n",
" add_prefix_space=False, use_regex=False\n",
" )\n",
" tokenizer.pre_tokenizer = pre_tokenizers.Sequence(\n",
" [split_pre_tokenizer, byte_pre_tokenizer]\n",
" )\n",
" # Use ByteLevel Decoder\n",
" tokenizer.decoder = decoders.Sequence(\n",
" [decoders.ByteLevel(), decoders.ByteFallback()]\n",
" )\n",
" iterator = jsonl_content_iterator(args)\n",
" # training the tokenizer\n",
" tokenizer.train_from_iterator(iterator, trainer)\n",
"\n",
" return tokenizer\n",
"\n",
"\n",
"def train_bpeasy(args: TrainBPETokenizerArgs):\n",
" # Use ByteLevel Decoder\n",
" iterator = jsonl_content_iterator(args)\n",
" # training the tokenizer\n",
" vocab = bpeasy.train_bpe(\n",
" iterator,\n",
" args.regex_pattern,\n",
" args.max_sentencepiece_length,\n",
" args.vocab_size,\n",
" )\n",
"\n",
" return BPEasyTokenizer(\n",
" vocab,\n",
" args.regex_pattern,\n",
" special_tokens=[],\n",
" fill_to_nearest_multiple_of_eight=False,\n",
" )\n",
"\n",
"\n",
"def encode(tokenizer, args) -> float:\n",
" iterator = jsonl_content_iterator(args)\n",
" lengths = []\n",
" for text in iterator:\n",
" encoded = tokenizer.encode(text)\n",
" lengths.append(len(encoded))\n",
" return sum(lengths)\n",
"\n",
"\n",
"def get_mean_std_dev(times: list[float]) -> tuple[float, float]:\n",
" avg_time = sum(times) / len(times)\n",
" std_dev = sum([(t - avg_time) ** 2 for t in times])\n",
" return avg_time, std_dev\n",
"\n",
"\n",
"NUM_ITERATIONS = 1\n",
"args = TrainBPETokenizerArgs()\n",
"\n",
"times_train_huggingface = []\n",
"times_encode_huggingface = []\n",
"times_train_bpeasy = []\n",
"times_encode_bpeasy = []\n",
"lengths_huggingface = []\n",
"lengths_bpeasy = []\n",
"\n",
"\n",
"time_now = time.time()\n",
"hf_tokenizer = train_huggingface(args)\n",
"print(sorted(hf_tokenizer.get_vocab().items(), key=lambda x: x[1])[255:])\n",
"times_train_huggingface.append(time.time() - time_now)\n",
"\n",
"time_now = time.time()\n",
"lengths_huggingface.append(encode(hf_tokenizer, args))\n",
"times_encode_huggingface.append(time.time() - time_now)\n",
"\n",
"time_now = time.time()\n",
"bpeasy_tokenizer = train_bpeasy(args)\n",
"print(sorted(bpeasy_tokenizer.vocab.items(), key=lambda x: x[1])[255:])\n",
"times_train_bpeasy.append(time.time() - time_now)\n",
"\n",
"time_now = time.time()\n",
"lengths_bpeasy.append(encode(bpeasy_tokenizer, args))\n",
"times_encode_bpeasy.append(time.time() - time_now)\n",
"\n",
"m_hf, std_hf = get_mean_std_dev(times_train_huggingface)\n",
"m_bpeasy, std_bpeasy = get_mean_std_dev(times_train_bpeasy)\n",
"\n",
"print(f\"huggingface train time {m_hf} +/- {std_hf}\")\n",
"print(f\"bpeasy train time {m_bpeasy} +/- {std_bpeasy}\")\n",
"\n",
"m_hf, std_hf = get_mean_std_dev(times_encode_huggingface)\n",
"m_bpeasy, std_bpeasy = get_mean_std_dev(times_encode_bpeasy)\n",
"\n",
"print(f\"huggingface encode time {m_hf} +/- {std_hf}\")\n",
"print(f\"bpeasy encode time {m_bpeasy} +/- {std_bpeasy}\")\n",
"\n",
"m_hf, std_hf = get_mean_std_dev(lengths_huggingface)\n",
"m_bpeasy, std_bpeasy = get_mean_std_dev(lengths_bpeasy)\n",
"\n",
"print(f\"huggingface length {m_hf} +/- {std_hf}\")\n",
"print(f\"bpeasy length {m_bpeasy} +/- {std_bpeasy}\")"
]
},
{
"cell_type": "code",
"execution_count": 26,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Deployed from e27d7a207f. You are on web.3. UTC time is currently 23 Apr 2019 15:02:46 +00:00.\n",
"['D', 'e', 'p', 'l', 'o', 'y', 'ed', 'Ġf', 'r', 'om', 'Ġe', '2', '7', 'd', '7', 'a', '2', '0', '7', 'f', '.', 'Ġ', 'Y', 'ou', 'Ġa', 're', 'Ġ', 'on', 'Ġw', 'e', 'b', '.', '3', '.', 'Ġ', 'U', 'T', 'C', 'Ġt', 'i', 'm', 'e', 'Ġis', 'Ġc', 'u', 'r', 're', 'n', 't', 'ly', 'Ġ', '2', '3', 'Ġ', 'A', 'p', 'r', 'Ġ', '2', '0', '1', '9', 'Ġ', '1', '5', ':', '0', '2', ':', '4', '6', 'Ġ', '+', '0', '0', ':', '0', '0', '.']\n",
"['D', 'e', 'p', 'l', 'o', 'y', 'ed', ' ', 'f', 'r', 'o', 'm', ' ', 'e', '2', '7', 'd', '7', 'a', '2', '0', '7', 'f', '.', ' ', 'Y', 'o', 'u', ' are', ' on', ' ', 'w', 'e', 'b', '.', '3', '.', ' ', 'U', 'T', 'C', ' ', 't', 'i', 'm', 'e', ' is', ' ', 'c', 'u', 'r', 'r', 'ent', 'l', 'y', ' ', '2', '3', ' A', 'p', 'r', ' ', '2', '0', '1', '9', ' ', '1', '5', ':', '0', '2', ':', '4', '6', ' ', '+', '0', '0', ':', '0', '0', '.']\n"
]
}
],
"source": [
"iterator = jsonl_content_iterator(args)\n",
"\n",
"for text in iterator:\n",
" if (\n",
" len(hf_tokenizer.encode(text)) < len(bpeasy_tokenizer.encode(text))\n",
" and len(text) < 100\n",
" ):\n",
" print(text)\n",
" print(hf_tokenizer.encode(text).tokens)\n",
" print([bpeasy_tokenizer.decode([t]) for t in bpeasy_tokenizer.encode(text)])\n",
" break"
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['D',\n",
" 'e',\n",
" 'p',\n",
" 'l',\n",
" 'o',\n",
" 'y',\n",
" 'ed',\n",
" 'Ġf',\n",
" 'r',\n",
" 'om',\n",
" 'Ġe',\n",
" '2',\n",
" '7',\n",
" 'd',\n",
" '7',\n",
" 'a',\n",
" '2',\n",
" '0',\n",
" '7',\n",
" 'f',\n",
" '.',\n",
" 'Ġ',\n",
" 'Y',\n",
" 'ou',\n",
" 'Ġa',\n",
" 're',\n",
" 'Ġ',\n",
" 'on',\n",
" 'Ġw',\n",
" 'e',\n",
" 'b',\n",
" '.',\n",
" '3',\n",
" '.',\n",
" 'Ġ',\n",
" 'U',\n",
" 'T',\n",
" 'C',\n",
" 'Ġt',\n",
" 'i',\n",
" 'm',\n",
" 'e',\n",
" 'Ġis',\n",
" 'Ġc',\n",
" 'u',\n",
" 'r',\n",
" 're',\n",
" 'n',\n",
" 't',\n",
" 'ly',\n",
" 'Ġ',\n",
" '2',\n",
" '3',\n",
" 'Ġ',\n",
" 'A',\n",
" 'p',\n",
" 'r',\n",
" 'Ġ',\n",
" '2',\n",
" '0',\n",
" '1',\n",
" '9',\n",
" 'Ġ',\n",
" '1',\n",
" '5',\n",
" ':',\n",
" '0',\n",
" '2',\n",
" ':',\n",
" '4',\n",
" '6',\n",
" 'Ġ',\n",
" '+',\n",
" '0',\n",
" '0',\n",
" ':',\n",
" '0',\n",
" '0',\n",
" '.']"
]
},
"execution_count": 20,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"import \n",
"iterator = jsonl_content_iterator(args)\n",
"vocab = bpeasy.train_bpe(\n",
" iterator,\n",
" args.regex_pattern,\n",
" args.max_sentencepiece_length,\n",
" args.vocab_size,\n",
")"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "bpeasy-arm",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.0"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
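
For quick reference, the core bpeasy calls that the notebook benchmarks boil down to the sketch below. The toy corpus, vocab_size, and max token length here are illustrative placeholders; the `train_bpe` and `BPEasyTokenizer` call signatures mirror the notebook cells above.

```python
# Minimal sketch of the bpeasy flow exercised by the benchmark notebook.
# The corpus, vocab_size and max token length are placeholders; the regex
# is the GPT-style split pattern used in the notebook.
import bpeasy
from bpeasy.tokenizer import BPEasyTokenizer

regex_pattern = r"""(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+"""

texts = ["Deployed from e27d7a207f.", "You are on web.3."]  # toy stand-in for the .jsonl corpus

# train a byte-level BPE vocab: (iterator, regex, max_token_length, vocab_size)
vocab = bpeasy.train_bpe(iter(texts), regex_pattern, 64, 300)

tokenizer = BPEasyTokenizer(
    vocab,
    regex_pattern,
    special_tokens=[],
    fill_to_nearest_multiple_of_eight=False,
)

ids = tokenizer.encode("Deployed from e27d7a207f.")
print(ids)
print([tokenizer.decode([i]) for i in ids])  # per-token round trip, as in the notebook
```

In the notebook the same calls are fed a generator over the .jsonl files instead of an in-memory list, and the resulting ids are decoded one at a time to inspect the learned merges against the Hugging Face tokenizer's output.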
