Commit

fix bug in lib and expand benchmarks
gautierdag committed Dec 14, 2023
1 parent 639b996 commit ec1a20a
Showing 7 changed files with 376 additions and 396 deletions.
2 changes: 1 addition & 1 deletion README.md
@@ -2,7 +2,7 @@

[![codecov](https://codecov.io/gh/gautierdag/bpeasy/branch/main/graph/badge.svg?token=NWHDJ22L8I)](https://codecov.io/gh/gautierdag/bpeasy)

-[![tests](https://github.com/gautierdag/bpeasy/actions/workflows/test.yml/badge.svg)](https://github.com/gautierdag/bpeasy/actions/workflows/test.yml)
+[![tests](https://github.com/gautierdag/bpeasy/actions/workflows/CI.yml/badge.svg)](https://github.com/gautierdag/bpeasy/actions/workflows/CI.yml)
[![image](https://img.shields.io/pypi/v/bpeasy.svg)](https://pypi.python.org/pypi/bpeasy)
[![image](https://img.shields.io/pypi/l/bpeasy.svg)](https://pypi.python.org/pypi/bpeasy)
[![image](https://img.shields.io/pypi/pyversions/bpeasy.svg)](https://pypi.python.org/pypi/bpeasy)
344 changes: 344 additions & 0 deletions benchmarks/notebook.ipynb
@@ -0,0 +1,344 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[('<0xFF>', 255), ('!', 256), ('\"', 257), ('#', 258), ('$', 259), ('%', 260), ('&', 261), (\"'\", 262), ('(', 263), (')', 264), ('*', 265), ('+', 266), (',', 267), ('-', 268), ('.', 269), ('/', 270), ('0', 271), ('1', 272), ('2', 273), ('3', 274), ('4', 275), ('5', 276), ('6', 277), ('7', 278), ('8', 279), ('9', 280), (':', 281), (';', 282), ('<', 283), ('=', 284), ('>', 285), ('?', 286), ('@', 287), ('A', 288), ('B', 289), ('C', 290), ('D', 291), ('E', 292), ('F', 293), ('G', 294), ('H', 295), ('I', 296), ('J', 297), ('K', 298), ('L', 299), ('M', 300), ('N', 301), ('O', 302), ('P', 303), ('Q', 304), ('R', 305), ('S', 306), ('T', 307), ('U', 308), ('V', 309), ('W', 310), ('X', 311), ('Y', 312), ('Z', 313), ('[', 314), ('\\\\', 315), (']', 316), ('_', 317), ('`', 318), ('a', 319), ('b', 320), ('c', 321), ('d', 322), ('e', 323), ('f', 324), ('g', 325), ('h', 326), ('i', 327), ('j', 328), ('k', 329), ('l', 330), ('m', 331), ('n', 332), ('o', 333), ('p', 334), ('q', 335), ('r', 336), ('s', 337), ('t', 338), ('u', 339), ('v', 340), ('w', 341), ('x', 342), ('y', 343), ('z', 344), ('|', 345), ('~', 346), ('¡', 347), ('¢', 348), ('£', 349), ('¤', 350), ('¥', 351), ('¦', 352), ('§', 353), ('¨', 354), ('©', 355), ('ª', 356), ('«', 357), ('¬', 358), ('®', 359), ('¯', 360), ('°', 361), ('±', 362), ('²', 363), ('³', 364), ('´', 365), ('µ', 366), ('¶', 367), ('·', 368), ('¸', 369), ('¹', 370), ('º', 371), ('»', 372), ('¼', 373), ('½', 374), ('¾', 375), ('¿', 376), ('Â', 377), ('Ã', 378), ('Ä', 379), ('Å', 380), ('É', 381), ('Ë', 382), ('Ì', 383), ('Í', 384), ('Î', 385), ('Ï', 386), ('Ð', 387), ('Ñ', 388), ('Ö', 389), ('×', 390), ('à', 391), ('â', 392), ('ê', 393), ('ë', 394), ('ì', 395), ('í', 396), ('ï', 397), ('ð', 398), ('ă', 399), ('Ĉ', 400), ('ĉ', 401), ('Ċ', 402), ('Ġ', 403), ('Ģ', 404), ('ģ', 405), ('Ĥ', 406), ('ĥ', 407), ('Ħ', 408), ('ħ', 409), ('Ĩ', 410), ('ĩ', 411), ('Ī', 412), ('ī', 413), ('Ĭ', 414), ('ĭ', 415), ('Į', 416), ('į', 417), ('ı', 418), ('IJ', 419), ('ij', 420), ('Ĵ', 421), ('ĵ', 422), ('Ķ', 423), ('ķ', 424), ('ĸ', 425), ('Ĺ', 426), ('ĺ', 427), ('Ļ', 428), ('ļ', 429), ('Ľ', 430), ('ľ', 431), ('Ŀ', 432), ('ŀ', 433), ('Ł', 434), ('ł', 435), ('Ń', 436), ('Ġt', 437), ('Ġa', 438), ('in', 439), ('he', 440), ('re', 441), ('on', 442), ('er', 443), ('Ġthe', 444), ('Ġs', 445), ('Ġw', 446), ('at', 447), ('Ġo', 448), ('Ġc', 449), ('nd', 450), ('it', 451), ('ou', 452), ('or', 453), ('es', 454), ('is', 455), ('Ġf', 456), ('en', 457), ('Ġp', 458), ('ing', 459), ('Ġb', 460), ('an', 461), ('al', 462), ('Ġto', 463), ('Ġm', 464), ('ed', 465), ('ar', 466), ('Ġin', 467), ('Ġand', 468), ('Ġof', 469), ('Ġd', 470), ('ic', 471), ('le', 472), ('om', 473), ('Ġh', 474), ('ion', 475), ('as', 476), ('Ġth', 477), ('ll', 478), ('.Ċ', 479), ('Ġy', 480), ('ent', 481), ('Ġl', 482), ('ve', 483), ('ro', 484), ('Ġe', 485), ('Ġre', 486), ('Ġn', 487), ('st', 488), ('Ġyou', 489), ('Ġg', 490), ('ct', 491), ('et', 492), ('âĢ', 493), ('ly', 494), ('Ġis', 495), ('ĠI', 496), ('Ġfor', 497), ('Ġbe', 498), ('id', 499)]\n"
]
}
],
"source": [
"import dataclasses\n",
"import glob\n",
"import json\n",
"import logging\n",
"import sys\n",
"import time\n",
"from pathlib import Path\n",
"\n",
"import tokenizers\n",
"from tokenizers import Regex, Tokenizer, decoders, pre_tokenizers\n",
"from tokenizers.models import BPE\n",
"from tokenizers.trainers import BpeTrainer\n",
"from tqdm import tqdm\n",
"\n",
"import bpeasy\n",
"from bpeasy.tokenizer import BPEasyTokenizer\n",
"\n",
"logging.basicConfig(stream=sys.stdout, level=logging.DEBUG)\n",
"\n",
"\n",
"@dataclasses.dataclass\n",
"class TrainBPETokenizerArgs:\n",
" dataset: str = \"./data\"\n",
" vocab_size: int = 500\n",
" max_sentencepiece_length: int = 64\n",
" regex_pattern: str = r\"\"\"(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+\"\"\"\n",
"\n",
" def __post_init__(self):\n",
" checkpoint_dir = Path(self.dataset)\n",
" assert checkpoint_dir.is_dir(), checkpoint_dir\n",
"\n",
"\n",
"def jsonl_content_iterator(\n",
" args: TrainBPETokenizerArgs,\n",
"):\n",
" \"\"\"\n",
" Iterates over a jsonl file and yields the content of each line\n",
" Tracks the number of characters yielded and stops when the limit is reached\n",
" This is ripe for optimisation if you want to mess with more fine-grained\n",
" character limits (eg. more Python than Java)\n",
" \"\"\"\n",
" file_path = args.dataset\n",
" chunk_num, character_count = 0, 0\n",
" chunks = glob.glob(f\"{file_path}/*.jsonl\")\n",
"\n",
" while chunk_num < len(chunks):\n",
" file_name = chunks[chunk_num]\n",
" with open(file_name, \"r\", encoding=\"utf-8\") as f:\n",
" for line in f:\n",
" obj = json.loads(line)\n",
" text = obj[\"text\"]\n",
" text_character_count = len(text)\n",
" character_count += text_character_count\n",
" yield text\n",
" chunk_num += 1\n",
"\n",
"\n",
"def train_huggingface(args: TrainBPETokenizerArgs):\n",
" # should be at least 0.14.0 to train with char limit\n",
" assert tokenizers.__version__ >= \"0.14.0\"\n",
" tokenizer = Tokenizer(BPE(byte_fallback=True))\n",
" trainer = BpeTrainer(\n",
" vocab_size=args.vocab_size,\n",
" special_tokens=[f\"<0x{i:02X}>\" for i in range(256)], # seed sm vocab\n",
" max_token_length=args.max_sentencepiece_length,\n",
" show_progress=False,\n",
" )\n",
" gpt_regex = Regex(args.regex_pattern)\n",
"\n",
" split_pre_tokenizer = pre_tokenizers.Split(\n",
" gpt_regex, behavior=\"isolated\", invert=False\n",
" )\n",
" byte_pre_tokenizer = pre_tokenizers.ByteLevel(\n",
" add_prefix_space=False, use_regex=False\n",
" )\n",
" tokenizer.pre_tokenizer = pre_tokenizers.Sequence(\n",
" [split_pre_tokenizer, byte_pre_tokenizer]\n",
" )\n",
" # Use ByteLevel Decoder\n",
" tokenizer.decoder = decoders.Sequence(\n",
" [decoders.ByteLevel(), decoders.ByteFallback()]\n",
" )\n",
" iterator = jsonl_content_iterator(args)\n",
" # training the tokenizer\n",
" tokenizer.train_from_iterator(iterator, trainer)\n",
"\n",
" return tokenizer\n",
"\n",
"\n",
"def train_bpeasy(args: TrainBPETokenizerArgs):\n",
" # Use ByteLevel Decoder\n",
" iterator = jsonl_content_iterator(args)\n",
" # training the tokenizer\n",
" vocab = bpeasy.train_bpe(\n",
" iterator,\n",
" args.regex_pattern,\n",
" args.max_sentencepiece_length,\n",
" args.vocab_size,\n",
" )\n",
"\n",
" return BPEasyTokenizer(\n",
" vocab,\n",
" args.regex_pattern,\n",
" special_tokens=[],\n",
" fill_to_nearest_multiple_of_eight=False,\n",
" )\n",
"\n",
"\n",
"def encode(tokenizer, args) -> float:\n",
" iterator = jsonl_content_iterator(args)\n",
" lengths = []\n",
" for text in iterator:\n",
" encoded = tokenizer.encode(text)\n",
" lengths.append(len(encoded))\n",
" return sum(lengths)\n",
"\n",
"\n",
"def get_mean_std_dev(times: list[float]) -> tuple[float, float]:\n",
" avg_time = sum(times) / len(times)\n",
" std_dev = sum([(t - avg_time) ** 2 for t in times])\n",
" return avg_time, std_dev\n",
"\n",
"\n",
"NUM_ITERATIONS = 1\n",
"args = TrainBPETokenizerArgs()\n",
"\n",
"times_train_huggingface = []\n",
"times_encode_huggingface = []\n",
"times_train_bpeasy = []\n",
"times_encode_bpeasy = []\n",
"lengths_huggingface = []\n",
"lengths_bpeasy = []\n",
"\n",
"\n",
"time_now = time.time()\n",
"hf_tokenizer = train_huggingface(args)\n",
"print(sorted(hf_tokenizer.get_vocab().items(), key=lambda x: x[1])[255:])\n",
"times_train_huggingface.append(time.time() - time_now)\n",
"\n",
"time_now = time.time()\n",
"lengths_huggingface.append(encode(hf_tokenizer, args))\n",
"times_encode_huggingface.append(time.time() - time_now)\n",
"\n",
"time_now = time.time()\n",
"bpeasy_tokenizer = train_bpeasy(args)\n",
"print(sorted(bpeasy_tokenizer.vocab.items(), key=lambda x: x[1])[255:])\n",
"times_train_bpeasy.append(time.time() - time_now)\n",
"\n",
"time_now = time.time()\n",
"lengths_bpeasy.append(encode(bpeasy_tokenizer, args))\n",
"times_encode_bpeasy.append(time.time() - time_now)\n",
"\n",
"m_hf, std_hf = get_mean_std_dev(times_train_huggingface)\n",
"m_bpeasy, std_bpeasy = get_mean_std_dev(times_train_bpeasy)\n",
"\n",
"print(f\"huggingface train time {m_hf} +/- {std_hf}\")\n",
"print(f\"bpeasy train time {m_bpeasy} +/- {std_bpeasy}\")\n",
"\n",
"m_hf, std_hf = get_mean_std_dev(times_encode_huggingface)\n",
"m_bpeasy, std_bpeasy = get_mean_std_dev(times_encode_bpeasy)\n",
"\n",
"print(f\"huggingface encode time {m_hf} +/- {std_hf}\")\n",
"print(f\"bpeasy encode time {m_bpeasy} +/- {std_bpeasy}\")\n",
"\n",
"m_hf, std_hf = get_mean_std_dev(lengths_huggingface)\n",
"m_bpeasy, std_bpeasy = get_mean_std_dev(lengths_bpeasy)\n",
"\n",
"print(f\"huggingface length {m_hf} +/- {std_hf}\")\n",
"print(f\"bpeasy length {m_bpeasy} +/- {std_bpeasy}\")"
]
},
{
"cell_type": "code",
"execution_count": 26,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Deployed from e27d7a207f. You are on web.3. UTC time is currently 23 Apr 2019 15:02:46 +00:00.\n",
"['D', 'e', 'p', 'l', 'o', 'y', 'ed', 'Ġf', 'r', 'om', 'Ġe', '2', '7', 'd', '7', 'a', '2', '0', '7', 'f', '.', 'Ġ', 'Y', 'ou', 'Ġa', 're', 'Ġ', 'on', 'Ġw', 'e', 'b', '.', '3', '.', 'Ġ', 'U', 'T', 'C', 'Ġt', 'i', 'm', 'e', 'Ġis', 'Ġc', 'u', 'r', 're', 'n', 't', 'ly', 'Ġ', '2', '3', 'Ġ', 'A', 'p', 'r', 'Ġ', '2', '0', '1', '9', 'Ġ', '1', '5', ':', '0', '2', ':', '4', '6', 'Ġ', '+', '0', '0', ':', '0', '0', '.']\n",
"['D', 'e', 'p', 'l', 'o', 'y', 'ed', ' ', 'f', 'r', 'o', 'm', ' ', 'e', '2', '7', 'd', '7', 'a', '2', '0', '7', 'f', '.', ' ', 'Y', 'o', 'u', ' are', ' on', ' ', 'w', 'e', 'b', '.', '3', '.', ' ', 'U', 'T', 'C', ' ', 't', 'i', 'm', 'e', ' is', ' ', 'c', 'u', 'r', 'r', 'ent', 'l', 'y', ' ', '2', '3', ' A', 'p', 'r', ' ', '2', '0', '1', '9', ' ', '1', '5', ':', '0', '2', ':', '4', '6', ' ', '+', '0', '0', ':', '0', '0', '.']\n"
]
}
],
"source": [
"iterator = jsonl_content_iterator(args)\n",
"\n",
"for text in iterator:\n",
" if (\n",
" len(hf_tokenizer.encode(text)) < len(bpeasy_tokenizer.encode(text))\n",
" and len(text) < 100\n",
" ):\n",
" print(text)\n",
" print(hf_tokenizer.encode(text).tokens)\n",
" print([bpeasy_tokenizer.decode([t]) for t in bpeasy_tokenizer.encode(text)])\n",
" break"
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['D',\n",
" 'e',\n",
" 'p',\n",
" 'l',\n",
" 'o',\n",
" 'y',\n",
" 'ed',\n",
" 'Ġf',\n",
" 'r',\n",
" 'om',\n",
" 'Ġe',\n",
" '2',\n",
" '7',\n",
" 'd',\n",
" '7',\n",
" 'a',\n",
" '2',\n",
" '0',\n",
" '7',\n",
" 'f',\n",
" '.',\n",
" 'Ġ',\n",
" 'Y',\n",
" 'ou',\n",
" 'Ġa',\n",
" 're',\n",
" 'Ġ',\n",
" 'on',\n",
" 'Ġw',\n",
" 'e',\n",
" 'b',\n",
" '.',\n",
" '3',\n",
" '.',\n",
" 'Ġ',\n",
" 'U',\n",
" 'T',\n",
" 'C',\n",
" 'Ġt',\n",
" 'i',\n",
" 'm',\n",
" 'e',\n",
" 'Ġis',\n",
" 'Ġc',\n",
" 'u',\n",
" 'r',\n",
" 're',\n",
" 'n',\n",
" 't',\n",
" 'ly',\n",
" 'Ġ',\n",
" '2',\n",
" '3',\n",
" 'Ġ',\n",
" 'A',\n",
" 'p',\n",
" 'r',\n",
" 'Ġ',\n",
" '2',\n",
" '0',\n",
" '1',\n",
" '9',\n",
" 'Ġ',\n",
" '1',\n",
" '5',\n",
" ':',\n",
" '0',\n",
" '2',\n",
" ':',\n",
" '4',\n",
" '6',\n",
" 'Ġ',\n",
" '+',\n",
" '0',\n",
" '0',\n",
" ':',\n",
" '0',\n",
" '0',\n",
" '.']"
]
},
"execution_count": 20,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"import \n",
"iterator = jsonl_content_iterator(args)\n",
"vocab = bpeasy.train_bpe(\n",
" iterator,\n",
" args.regex_pattern,\n",
" args.max_sentencepiece_length,\n",
" args.vocab_size,\n",
")"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "bpeasy-arm",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.0"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
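
For quick reference, the core bpeasy calls that the notebook benchmarks boil down to the sketch below. The toy corpus, vocab_size, and max token length here are illustrative placeholders; the `train_bpe` and `BPEasyTokenizer` call signatures mirror the notebook cells above.

```python
# Minimal sketch of the bpeasy flow exercised by the benchmark notebook.
# The corpus, vocab_size and max token length are placeholders; the regex
# is the GPT-style split pattern used in the notebook.
import bpeasy
from bpeasy.tokenizer import BPEasyTokenizer

regex_pattern = r"""(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+"""

texts = ["Deployed from e27d7a207f.", "You are on web.3."]  # toy stand-in for the .jsonl corpus

# train a byte-level BPE vocab: (iterator, regex, max_token_length, vocab_size)
vocab = bpeasy.train_bpe(iter(texts), regex_pattern, 64, 300)

tokenizer = BPEasyTokenizer(
    vocab,
    regex_pattern,
    special_tokens=[],
    fill_to_nearest_multiple_of_eight=False,
)

ids = tokenizer.encode("Deployed from e27d7a207f.")
print(ids)
print([tokenizer.decode([i]) for i in ids])  # per-token round trip, as in the notebook
```

In the notebook the same calls are fed a generator over the .jsonl files instead of an in-memory list, and the resulting ids are decoded one at a time to inspect the learned merges against the Hugging Face tokenizer's output.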
