Commit ec1a20a (1 parent: 639b996): fix bug in lib and expand benchmarks
Showing 7 changed files with 376 additions and 396 deletions.
@@ -0,0 +1,344 @@ | ||
{ | ||
"cells": [ | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 1, | ||
"metadata": {}, | ||
"outputs": [ | ||
{ | ||
"name": "stdout", | ||
"output_type": "stream", | ||
"text": [ | ||
"[('<0xFF>', 255), ('!', 256), ('\"', 257), ('#', 258), ('$', 259), ('%', 260), ('&', 261), (\"'\", 262), ('(', 263), (')', 264), ('*', 265), ('+', 266), (',', 267), ('-', 268), ('.', 269), ('/', 270), ('0', 271), ('1', 272), ('2', 273), ('3', 274), ('4', 275), ('5', 276), ('6', 277), ('7', 278), ('8', 279), ('9', 280), (':', 281), (';', 282), ('<', 283), ('=', 284), ('>', 285), ('?', 286), ('@', 287), ('A', 288), ('B', 289), ('C', 290), ('D', 291), ('E', 292), ('F', 293), ('G', 294), ('H', 295), ('I', 296), ('J', 297), ('K', 298), ('L', 299), ('M', 300), ('N', 301), ('O', 302), ('P', 303), ('Q', 304), ('R', 305), ('S', 306), ('T', 307), ('U', 308), ('V', 309), ('W', 310), ('X', 311), ('Y', 312), ('Z', 313), ('[', 314), ('\\\\', 315), (']', 316), ('_', 317), ('`', 318), ('a', 319), ('b', 320), ('c', 321), ('d', 322), ('e', 323), ('f', 324), ('g', 325), ('h', 326), ('i', 327), ('j', 328), ('k', 329), ('l', 330), ('m', 331), ('n', 332), ('o', 333), ('p', 334), ('q', 335), ('r', 336), ('s', 337), ('t', 338), ('u', 339), ('v', 340), ('w', 341), ('x', 342), ('y', 343), ('z', 344), ('|', 345), ('~', 346), ('¡', 347), ('¢', 348), ('£', 349), ('¤', 350), ('¥', 351), ('¦', 352), ('§', 353), ('¨', 354), ('©', 355), ('ª', 356), ('«', 357), ('¬', 358), ('®', 359), ('¯', 360), ('°', 361), ('±', 362), ('²', 363), ('³', 364), ('´', 365), ('µ', 366), ('¶', 367), ('·', 368), ('¸', 369), ('¹', 370), ('º', 371), ('»', 372), ('¼', 373), ('½', 374), ('¾', 375), ('¿', 376), ('Â', 377), ('Ã', 378), ('Ä', 379), ('Å', 380), ('É', 381), ('Ë', 382), ('Ì', 383), ('Í', 384), ('Î', 385), ('Ï', 386), ('Ð', 387), ('Ñ', 388), ('Ö', 389), ('×', 390), ('à', 391), ('â', 392), ('ê', 393), ('ë', 394), ('ì', 395), ('í', 396), ('ï', 397), ('ð', 398), ('ă', 399), ('Ĉ', 400), ('ĉ', 401), ('Ċ', 402), ('Ġ', 403), ('Ģ', 404), ('ģ', 405), ('Ĥ', 406), ('ĥ', 407), ('Ħ', 408), ('ħ', 409), ('Ĩ', 410), ('ĩ', 411), ('Ī', 412), ('ī', 413), ('Ĭ', 414), ('ĭ', 415), ('Į', 416), ('į', 417), ('ı', 418), ('IJ', 419), ('ij', 420), ('Ĵ', 421), ('ĵ', 422), ('Ķ', 423), ('ķ', 424), ('ĸ', 425), ('Ĺ', 426), ('ĺ', 427), ('Ļ', 428), ('ļ', 429), ('Ľ', 430), ('ľ', 431), ('Ŀ', 432), ('ŀ', 433), ('Ł', 434), ('ł', 435), ('Ń', 436), ('Ġt', 437), ('Ġa', 438), ('in', 439), ('he', 440), ('re', 441), ('on', 442), ('er', 443), ('Ġthe', 444), ('Ġs', 445), ('Ġw', 446), ('at', 447), ('Ġo', 448), ('Ġc', 449), ('nd', 450), ('it', 451), ('ou', 452), ('or', 453), ('es', 454), ('is', 455), ('Ġf', 456), ('en', 457), ('Ġp', 458), ('ing', 459), ('Ġb', 460), ('an', 461), ('al', 462), ('Ġto', 463), ('Ġm', 464), ('ed', 465), ('ar', 466), ('Ġin', 467), ('Ġand', 468), ('Ġof', 469), ('Ġd', 470), ('ic', 471), ('le', 472), ('om', 473), ('Ġh', 474), ('ion', 475), ('as', 476), ('Ġth', 477), ('ll', 478), ('.Ċ', 479), ('Ġy', 480), ('ent', 481), ('Ġl', 482), ('ve', 483), ('ro', 484), ('Ġe', 485), ('Ġre', 486), ('Ġn', 487), ('st', 488), ('Ġyou', 489), ('Ġg', 490), ('ct', 491), ('et', 492), ('âĢ', 493), ('ly', 494), ('Ġis', 495), ('ĠI', 496), ('Ġfor', 497), ('Ġbe', 498), ('id', 499)]\n" | ||
] | ||
} | ||
], | ||
"source": [ | ||
"import dataclasses\n", | ||
"import glob\n", | ||
"import json\n", | ||
"import logging\n", | ||
"import sys\n", | ||
"import time\n", | ||
"from pathlib import Path\n", | ||
"\n", | ||
"import tokenizers\n", | ||
"from tokenizers import Regex, Tokenizer, decoders, pre_tokenizers\n", | ||
"from tokenizers.models import BPE\n", | ||
"from tokenizers.trainers import BpeTrainer\n", | ||
"from tqdm import tqdm\n", | ||
"\n", | ||
"import bpeasy\n", | ||
"from bpeasy.tokenizer import BPEasyTokenizer\n", | ||
"\n", | ||
"logging.basicConfig(stream=sys.stdout, level=logging.DEBUG)\n", | ||
"\n", | ||
"\n", | ||
"@dataclasses.dataclass\n", | ||
"class TrainBPETokenizerArgs:\n", | ||
" dataset: str = \"./data\"\n", | ||
" vocab_size: int = 500\n", | ||
" max_sentencepiece_length: int = 64\n", | ||
" regex_pattern: str = r\"\"\"(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+\"\"\"\n", | ||
"\n", | ||
" def __post_init__(self):\n", | ||
" checkpoint_dir = Path(self.dataset)\n", | ||
" assert checkpoint_dir.is_dir(), checkpoint_dir\n", | ||
"\n", | ||
"\n", | ||
"def jsonl_content_iterator(\n", | ||
" args: TrainBPETokenizerArgs,\n", | ||
"):\n", | ||
" \"\"\"\n", | ||
" Iterates over a jsonl file and yields the content of each line\n", | ||
" Tracks the number of characters yielded and stops when the limit is reached\n", | ||
" This is ripe for optimisation if you want to mess with more fine-grained\n", | ||
" character limits (eg. more Python than Java)\n", | ||
" \"\"\"\n", | ||
" file_path = args.dataset\n", | ||
" chunk_num, character_count = 0, 0\n", | ||
" chunks = glob.glob(f\"{file_path}/*.jsonl\")\n", | ||
"\n", | ||
" while chunk_num < len(chunks):\n", | ||
" file_name = chunks[chunk_num]\n", | ||
" with open(file_name, \"r\", encoding=\"utf-8\") as f:\n", | ||
" for line in f:\n", | ||
" obj = json.loads(line)\n", | ||
" text = obj[\"text\"]\n", | ||
" text_character_count = len(text)\n", | ||
" character_count += text_character_count\n", | ||
" yield text\n", | ||
" chunk_num += 1\n", | ||
"\n", | ||
"\n", | ||
"def train_huggingface(args: TrainBPETokenizerArgs):\n", | ||
" # should be at least 0.14.0 to train with char limit\n", | ||
" assert tokenizers.__version__ >= \"0.14.0\"\n", | ||
" tokenizer = Tokenizer(BPE(byte_fallback=True))\n", | ||
" trainer = BpeTrainer(\n", | ||
" vocab_size=args.vocab_size,\n", | ||
" special_tokens=[f\"<0x{i:02X}>\" for i in range(256)], # seed sm vocab\n", | ||
" max_token_length=args.max_sentencepiece_length,\n", | ||
" show_progress=False,\n", | ||
" )\n", | ||
" gpt_regex = Regex(args.regex_pattern)\n", | ||
"\n", | ||
" split_pre_tokenizer = pre_tokenizers.Split(\n", | ||
" gpt_regex, behavior=\"isolated\", invert=False\n", | ||
" )\n", | ||
" byte_pre_tokenizer = pre_tokenizers.ByteLevel(\n", | ||
" add_prefix_space=False, use_regex=False\n", | ||
" )\n", | ||
" tokenizer.pre_tokenizer = pre_tokenizers.Sequence(\n", | ||
" [split_pre_tokenizer, byte_pre_tokenizer]\n", | ||
" )\n", | ||
" # Use ByteLevel Decoder\n", | ||
" tokenizer.decoder = decoders.Sequence(\n", | ||
" [decoders.ByteLevel(), decoders.ByteFallback()]\n", | ||
" )\n", | ||
" iterator = jsonl_content_iterator(args)\n", | ||
" # training the tokenizer\n", | ||
" tokenizer.train_from_iterator(iterator, trainer)\n", | ||
"\n", | ||
" return tokenizer\n", | ||
"\n", | ||
"\n", | ||
"def train_bpeasy(args: TrainBPETokenizerArgs):\n", | ||
" # Use ByteLevel Decoder\n", | ||
" iterator = jsonl_content_iterator(args)\n", | ||
" # training the tokenizer\n", | ||
" vocab = bpeasy.train_bpe(\n", | ||
" iterator,\n", | ||
" args.regex_pattern,\n", | ||
" args.max_sentencepiece_length,\n", | ||
" args.vocab_size,\n", | ||
" )\n", | ||
"\n", | ||
" return BPEasyTokenizer(\n", | ||
" vocab,\n", | ||
" args.regex_pattern,\n", | ||
" special_tokens=[],\n", | ||
" fill_to_nearest_multiple_of_eight=False,\n", | ||
" )\n", | ||
"\n", | ||
"\n", | ||
"def encode(tokenizer, args) -> float:\n", | ||
" iterator = jsonl_content_iterator(args)\n", | ||
" lengths = []\n", | ||
" for text in iterator:\n", | ||
" encoded = tokenizer.encode(text)\n", | ||
" lengths.append(len(encoded))\n", | ||
" return sum(lengths)\n", | ||
"\n", | ||
"\n", | ||
"def get_mean_std_dev(times: list[float]) -> tuple[float, float]:\n", | ||
" avg_time = sum(times) / len(times)\n", | ||
" std_dev = sum([(t - avg_time) ** 2 for t in times])\n", | ||
" return avg_time, std_dev\n", | ||
"\n", | ||
"\n", | ||
"NUM_ITERATIONS = 1\n", | ||
"args = TrainBPETokenizerArgs()\n", | ||
"\n", | ||
"times_train_huggingface = []\n", | ||
"times_encode_huggingface = []\n", | ||
"times_train_bpeasy = []\n", | ||
"times_encode_bpeasy = []\n", | ||
"lengths_huggingface = []\n", | ||
"lengths_bpeasy = []\n", | ||
"\n", | ||
"\n", | ||
"time_now = time.time()\n", | ||
"hf_tokenizer = train_huggingface(args)\n", | ||
"print(sorted(hf_tokenizer.get_vocab().items(), key=lambda x: x[1])[255:])\n", | ||
"times_train_huggingface.append(time.time() - time_now)\n", | ||
"\n", | ||
"time_now = time.time()\n", | ||
"lengths_huggingface.append(encode(hf_tokenizer, args))\n", | ||
"times_encode_huggingface.append(time.time() - time_now)\n", | ||
"\n", | ||
"time_now = time.time()\n", | ||
"bpeasy_tokenizer = train_bpeasy(args)\n", | ||
"print(sorted(bpeasy_tokenizer.vocab.items(), key=lambda x: x[1])[255:])\n", | ||
"times_train_bpeasy.append(time.time() - time_now)\n", | ||
"\n", | ||
"time_now = time.time()\n", | ||
"lengths_bpeasy.append(encode(bpeasy_tokenizer, args))\n", | ||
"times_encode_bpeasy.append(time.time() - time_now)\n", | ||
"\n", | ||
"m_hf, std_hf = get_mean_std_dev(times_train_huggingface)\n", | ||
"m_bpeasy, std_bpeasy = get_mean_std_dev(times_train_bpeasy)\n", | ||
"\n", | ||
"print(f\"huggingface train time {m_hf} +/- {std_hf}\")\n", | ||
"print(f\"bpeasy train time {m_bpeasy} +/- {std_bpeasy}\")\n", | ||
"\n", | ||
"m_hf, std_hf = get_mean_std_dev(times_encode_huggingface)\n", | ||
"m_bpeasy, std_bpeasy = get_mean_std_dev(times_encode_bpeasy)\n", | ||
"\n", | ||
"print(f\"huggingface encode time {m_hf} +/- {std_hf}\")\n", | ||
"print(f\"bpeasy encode time {m_bpeasy} +/- {std_bpeasy}\")\n", | ||
"\n", | ||
"m_hf, std_hf = get_mean_std_dev(lengths_huggingface)\n", | ||
"m_bpeasy, std_bpeasy = get_mean_std_dev(lengths_bpeasy)\n", | ||
"\n", | ||
"print(f\"huggingface length {m_hf} +/- {std_hf}\")\n", | ||
"print(f\"bpeasy length {m_bpeasy} +/- {std_bpeasy}\")" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 26, | ||
"metadata": {}, | ||
"outputs": [ | ||
{ | ||
"name": "stdout", | ||
"output_type": "stream", | ||
"text": [ | ||
"Deployed from e27d7a207f. You are on web.3. UTC time is currently 23 Apr 2019 15:02:46 +00:00.\n", | ||
"['D', 'e', 'p', 'l', 'o', 'y', 'ed', 'Ġf', 'r', 'om', 'Ġe', '2', '7', 'd', '7', 'a', '2', '0', '7', 'f', '.', 'Ġ', 'Y', 'ou', 'Ġa', 're', 'Ġ', 'on', 'Ġw', 'e', 'b', '.', '3', '.', 'Ġ', 'U', 'T', 'C', 'Ġt', 'i', 'm', 'e', 'Ġis', 'Ġc', 'u', 'r', 're', 'n', 't', 'ly', 'Ġ', '2', '3', 'Ġ', 'A', 'p', 'r', 'Ġ', '2', '0', '1', '9', 'Ġ', '1', '5', ':', '0', '2', ':', '4', '6', 'Ġ', '+', '0', '0', ':', '0', '0', '.']\n", | ||
"['D', 'e', 'p', 'l', 'o', 'y', 'ed', ' ', 'f', 'r', 'o', 'm', ' ', 'e', '2', '7', 'd', '7', 'a', '2', '0', '7', 'f', '.', ' ', 'Y', 'o', 'u', ' are', ' on', ' ', 'w', 'e', 'b', '.', '3', '.', ' ', 'U', 'T', 'C', ' ', 't', 'i', 'm', 'e', ' is', ' ', 'c', 'u', 'r', 'r', 'ent', 'l', 'y', ' ', '2', '3', ' A', 'p', 'r', ' ', '2', '0', '1', '9', ' ', '1', '5', ':', '0', '2', ':', '4', '6', ' ', '+', '0', '0', ':', '0', '0', '.']\n" | ||
] | ||
} | ||
], | ||
"source": [ | ||
"iterator = jsonl_content_iterator(args)\n", | ||
"\n", | ||
"for text in iterator:\n", | ||
" if (\n", | ||
" len(hf_tokenizer.encode(text)) < len(bpeasy_tokenizer.encode(text))\n", | ||
" and len(text) < 100\n", | ||
" ):\n", | ||
" print(text)\n", | ||
" print(hf_tokenizer.encode(text).tokens)\n", | ||
" print([bpeasy_tokenizer.decode([t]) for t in bpeasy_tokenizer.encode(text)])\n", | ||
" break" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 20, | ||
"metadata": {}, | ||
"outputs": [ | ||
{ | ||
"data": { | ||
"text/plain": [ | ||
"['D',\n", | ||
" 'e',\n", | ||
" 'p',\n", | ||
" 'l',\n", | ||
" 'o',\n", | ||
" 'y',\n", | ||
" 'ed',\n", | ||
" 'Ġf',\n", | ||
" 'r',\n", | ||
" 'om',\n", | ||
" 'Ġe',\n", | ||
" '2',\n", | ||
" '7',\n", | ||
" 'd',\n", | ||
" '7',\n", | ||
" 'a',\n", | ||
" '2',\n", | ||
" '0',\n", | ||
" '7',\n", | ||
" 'f',\n", | ||
" '.',\n", | ||
" 'Ġ',\n", | ||
" 'Y',\n", | ||
" 'ou',\n", | ||
" 'Ġa',\n", | ||
" 're',\n", | ||
" 'Ġ',\n", | ||
" 'on',\n", | ||
" 'Ġw',\n", | ||
" 'e',\n", | ||
" 'b',\n", | ||
" '.',\n", | ||
" '3',\n", | ||
" '.',\n", | ||
" 'Ġ',\n", | ||
" 'U',\n", | ||
" 'T',\n", | ||
" 'C',\n", | ||
" 'Ġt',\n", | ||
" 'i',\n", | ||
" 'm',\n", | ||
" 'e',\n", | ||
" 'Ġis',\n", | ||
" 'Ġc',\n", | ||
" 'u',\n", | ||
" 'r',\n", | ||
" 're',\n", | ||
" 'n',\n", | ||
" 't',\n", | ||
" 'ly',\n", | ||
" 'Ġ',\n", | ||
" '2',\n", | ||
" '3',\n", | ||
" 'Ġ',\n", | ||
" 'A',\n", | ||
" 'p',\n", | ||
" 'r',\n", | ||
" 'Ġ',\n", | ||
" '2',\n", | ||
" '0',\n", | ||
" '1',\n", | ||
" '9',\n", | ||
" 'Ġ',\n", | ||
" '1',\n", | ||
" '5',\n", | ||
" ':',\n", | ||
" '0',\n", | ||
" '2',\n", | ||
" ':',\n", | ||
" '4',\n", | ||
" '6',\n", | ||
" 'Ġ',\n", | ||
" '+',\n", | ||
" '0',\n", | ||
" '0',\n", | ||
" ':',\n", | ||
" '0',\n", | ||
" '0',\n", | ||
" '.']" | ||
] | ||
}, | ||
"execution_count": 20, | ||
"metadata": {}, | ||
"output_type": "execute_result" | ||
} | ||
], | ||
"source": [ | ||
"import \n", | ||
"iterator = jsonl_content_iterator(args)\n", | ||
"vocab = bpeasy.train_bpe(\n", | ||
" iterator,\n", | ||
" args.regex_pattern,\n", | ||
" args.max_sentencepiece_length,\n", | ||
" args.vocab_size,\n", | ||
")" | ||
] | ||
} | ||
], | ||
"metadata": { | ||
"kernelspec": { | ||
"display_name": "bpeasy-arm", | ||
"language": "python", | ||
"name": "python3" | ||
}, | ||
"language_info": { | ||
"codemirror_mode": { | ||
"name": "ipython", | ||
"version": 3 | ||
}, | ||
"file_extension": ".py", | ||
"mimetype": "text/x-python", | ||
"name": "python", | ||
"nbconvert_exporter": "python", | ||
"pygments_lexer": "ipython3", | ||
"version": "3.12.0" | ||
} | ||
}, | ||
"nbformat": 4, | ||
"nbformat_minor": 2 | ||
} |