Add files via upload

ThuraAung1601 · Nov 22, 2024 · 5a6e96b · 5a6e96b
1 parent 0fcafe2
commit 5a6e96b
Show file tree

Hide file tree

Showing 15 changed files with 21,066 additions and 2 deletions.
diff --git a/Demo.ipynb b/Demo.ipynb
@@ -0,0 +1,236 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "id": "d5607014-b864-4018-be4d-2d3e12e71c0c",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Downloading myWord dictionary ...\n",
+      "Download Completed.\n",
+      "['မြန်မာ', 'နိုင်ငံ', '။']\n"
+     ]
+    }
+   ],
+   "source": [
+    "from myTokenize import WordTokenizer\n",
+    "\n",
+    "tokenizer = WordTokenizer(engine=\"myWord\")  # Use \"myWord\", \"CRF\", or \"LSTM\"\n",
+    "words = tokenizer.tokenize(\"မြန်မာနိုင်ငံ။\")\n",
+    "print(words)  # ['မြန်မာ', 'နိုင်ငံ', '။']"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "id": "84961933-5255-4988-9b48-2a898498db4d",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Downloading neural network models ...\n",
+      "LFS Not Found at Models/myWseg-s2-bilstm.h5\n",
+      "Download Completed.\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "WARNING:absl:At this time, the v2.11+ optimizer `tf.keras.optimizers.Adam` runs slowly on M1/M2 Macs, please use the legacy Keras optimizer instead, located at `tf.keras.optimizers.legacy.Adam`.\n",
+      "WARNING:absl:There is a known slowdown when using v2.11+ Keras optimizers on M1/M2 Macs. Falling back to the legacy Keras optimizer, i.e., `tf.keras.optimizers.legacy.Adam`.\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "['မြန်မာ', 'နိုင်ငံ', '။']\n"
+     ]
+    }
+   ],
+   "source": [
+    "from myTokenize import WordTokenizer\n",
+    "\n",
+    "tokenizer = WordTokenizer(engine=\"LSTM\")  # Use \"myWord\", \"CRF\", or \"LSTM\"\n",
+    "words = tokenizer.tokenize(\"မြန်မာနိုင်ငံ။\")\n",
+    "print(words)  # ['မြန်မာ', 'နိုင်ငံ', '။']"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "id": "5931d157-dd07-4053-b27f-f404b7bdebff",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "['ရွေး', 'ကောက်', 'ပွဲ', 'မှာ', 'နိုင်', 'ထား', 'တဲ့', 'ဒေါ်', 'နယ်', 'ထ', 'ရမ့်']\n"
+     ]
+    }
+   ],
+   "source": [
+    "from myTokenize import SyllableTokenizer\n",
+    "\n",
+    "tokenizer = SyllableTokenizer()\n",
+    "tokens = tokenizer.tokenize(\"ရွေးကောက်ပွဲမှာနိုင်ထားတဲ့ဒေါ်နယ်ထရမ့်\")\n",
+    "print(tokens) # ['ရွေး', 'ကောက်', 'ပွဲ', 'မှာ', 'နိုင်', 'ထား', 'တဲ့', 'ဒေါ်', 'နယ်', 'ထ', 'ရမ့်']"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "id": "5d012c2c-282c-4872-a3a9-ee8ae0147b64",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "['▁ရွေးကောက်ပွဲ', 'မှာ', 'နိုင်', 'ထား', 'တဲ့', 'ဒေါ်', 'နယ်', 'ထ', 'ရမ်', '့']\n"
+     ]
+    }
+   ],
+   "source": [
+    "from myTokenize import BPETokenizer\n",
+    "\n",
+    "tokenizer = BPETokenizer()\n",
+    "tokens = tokenizer.tokenize(\"ရွေးကောက်ပွဲမှာနိုင်ထားတဲ့ဒေါ်နယ်ထရမ့်\")\n",
+    "print(tokens)  # ['▁ရွေးကောက်ပွဲ', 'မှာ', 'နိုင်', 'ထား', 'တဲ့', 'ဒေါ်', 'နယ်', 'ထ', 'ရမ်', '့']"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "id": "1ce5ab3a-edc3-494c-ab74-7d816b02231f",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "['▁ရွေးကောက်ပွဲ', 'မှာ', 'နိုင်', 'ထား', 'တဲ့', 'ဒေါ', '်', 'နယ်', 'ထ', 'ရမ်', '့']\n"
+     ]
+    }
+   ],
+   "source": [
+    "from myTokenize import UnigramTokenizer\n",
+    "\n",
+    "tokenizer = UnigramTokenizer()\n",
+    "tokens = tokenizer.tokenize(\"ရွေးကောက်ပွဲမှာနိုင်ထားတဲ့ဒေါ်နယ်ထရမ့်\")\n",
+    "print(tokens)  # ['▁ရွေးကောက်ပွဲ', 'မှာ', 'နိုင်', 'ထား', 'တဲ့', 'ဒေါ', '်', 'နယ်', 'ထ', 'ရမ်', '့']"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "id": "b37d314a-05df-462b-bfa5-43826c16e607",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "['မြန်မာ', 'နိုင်ငံ', '။']\n"
+     ]
+    }
+   ],
+   "source": [
+    "from myTokenize import WordTokenizer\n",
+    "\n",
+    "tokenizer = WordTokenizer(engine=\"CRF\")  # Use \"myWord\", \"CRF\", or \"LSTM\"\n",
+    "words = tokenizer.tokenize(\"မြန်မာနိုင်ငံ။\")\n",
+    "print(words)  # ['မြန်မာ', 'နိုင်ငံ', '။']"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "id": "7e739be1-8847-4a19-b72d-719482ff5ae8",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "['ညာ', 'ဘက်', 'ကို', 'ယူ', 'ပြီး', 'တော့', 'တည့်တည့်', 'သွား', 'ပါ']\n"
+     ]
+    }
+   ],
+   "source": [
+    "from myTokenize import PhraseTokenizer\n",
+    "\n",
+    "tokenizer = PhraseTokenizer()\n",
+    "phrases = tokenizer.tokenize(\"ညာဘက်ကိုယူပြီးတော့တည့်တည့်သွားပါ\")\n",
+    "print(phrases)  # ['ညာဘက်_ကို', 'ယူ', 'ပြီး_တော့', 'တည့်တည့်', 'သွား_ပါ']"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "id": "77629754-97ea-490e-9fbb-999afd86c401",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "WARNING:absl:At this time, the v2.11+ optimizer `tf.keras.optimizers.Adam` runs slowly on M1/M2 Macs, please use the legacy Keras optimizer instead, located at `tf.keras.optimizers.legacy.Adam`.\n",
+      "WARNING:absl:There is a known slowdown when using v2.11+ Keras optimizers on M1/M2 Macs. Falling back to the legacy Keras optimizer, i.e., `tf.keras.optimizers.legacy.Adam`.\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "[['ညာ', 'ဘက်', 'ကို', 'ယူ', 'ပြီး', 'တော့', 'တည့်တည့်', 'သွား', 'ပါ'], ['ခင်ဗျား', 'ငါး', 'မိနစ်', 'လောက်ကြာ', 'လိမ့်', 'မယ်']]\n"
+     ]
+    }
+   ],
+   "source": [
+    "from myTokenize import SentenceTokenizer\n",
+    "\n",
+    "tokenizer = SentenceTokenizer()\n",
+    "sentences = tokenizer.tokenize(\"ညာဘက်ကိုယူပြီးတော့တည့်တည့်သွားပါခင်ဗျားငါးမိနစ်လောက်ကြာလိမ့်မယ်\")\n",
+    "print(sentences)  # [['ညာ', 'ဘက်', 'ကို', 'ယူ', 'ပြီး', 'တော့', 'တည့်တည့်', 'သွား', 'ပါ'], ['ခင်ဗျား', 'ငါး', 'မိနစ်', 'လောက်ကြာ', 'လိမ့်', 'မယ်']]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "5545f19b-ff90-4fee-bada-96b1c1027526",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.8.20"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/LICENSE b/LICENSE
@@ -1,6 +1,6 @@
 MIT License
 
-Copyright (c) 2024 June
+Copyright (c) 2024 Ye Kyaw Thu and Thura Aung
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -18,4 +18,4 @@ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-SOFTWARE.
+SOFTWARE.
diff --git a/MANIFEST.in b/MANIFEST.in
@@ -0,0 +1 @@
+recursive-include myTokenize/myWord *.py
diff --git a/README.md b/README.md
@@ -1 +1,113 @@
 # myTokenize
+
+**myTokenize** is a Python library that tokenizes Myanmar text into syllables, words, phrases, and sentences. It supports multiple tokenization techniques using rule-based, statistical, and neural network-based approaches.
+
+## Features
+
+- **Syllable Tokenization**: Break text into syllables using regex rules.
+- **BPE and Unigram Tokenization**: Leverage SentencePiece models for tokenization.
+- **Word Tokenization**: Segment text into words using:
+  - `myWord`: Dictionary-based tokenization.
+  - `CRF`: Conditional Random Fields-based tokenization.
+  - `LSTM`: BiLSTM neural network-based tokenization.
+- **Phrase Tokenization**: Identify phrases in text using normalized pointwise mutual information (NPMI).
+- **Sentence Tokenization**: Use a BiLSTM model to segment text into sentences.
+
+## Installation
+
+1. Clone the repository:
+   ```bash
+   git clone https://github.com/ThuraAung1601/myTokenize.git
+   cd myTokenize
+   ```
+
+2. Install dependencies:
+   ```bash
+   pip install -r requirements.txt
+   ```
+
+3. Install the library:
+   ```bash
+   pip install .
+   ```
+
+## Usage
+
+### Syllable Tokenizer
+```python
+from myTokenize import SyllableTokenizer
+
+tokenizer = SyllableTokenizer()
+syllables = tokenizer.tokenize("မြန်မာနိုင်ငံ။")
+print(syllables)  # ['မြန်', 'မာ', 'နိုင်', 'ငံ', '။']
+```
+
+### BPE Tokenizer
+```python
+from myTokenize import BPETokenizer
+
+tokenizer = BPETokenizer()
+tokens = tokenizer.tokenize("ရွေးကောက်ပွဲမှာနိုင်ထားတဲ့ဒေါ်နယ်ထရမ့်")
+print(tokens)  # ['▁ရွေးကောက်ပွဲ', 'မှာ', 'နိုင်', 'ထား', 'တဲ့', 'ဒေါ်', 'နယ်', 'ထ', 'ရမ်', '့']
+```
+
+### Word Tokenizer
+```python
+from myTokenize import WordTokenizer
+
+tokenizer = WordTokenizer(engine="CRF")  # Use "myWord", "CRF", or "LSTM"
+words = tokenizer.tokenize("မြန်မာနိုင်ငံ။")
+print(words)  # ['မြန်မာ', 'နိုင်ငံ', '။']
+```
+
+### Phrase Tokenizer
+```python
+from myTokenize import PhraseTokenizer
+
+tokenizer = PhraseTokenizer()
+phrases = tokenizer.tokenize("ညာဘက်ကိုယူပြီးတော့တည့်တည့်သွားပါ")
+print(phrases)  # ['ညာဘက်_ကို', 'ယူ', 'ပြီး_တော့', 'တည့်တည့်', 'သွား_ပါ']
+```
+
+### Sentence Tokenizer
+```python
+from myTokenize import SentenceTokenizer
+
+tokenizer = SentenceTokenizer()
+sentences = tokenizer.tokenize("ညာဘက်ကိုယူပြီးတော့တည့်တည့်သွားပါခင်ဗျားငါးမိနစ်လောက်ကြာလိမ့်မယ်")
+print(sentences)  # [['ညာ', 'ဘက်', 'ကို', 'ယူ', 'ပြီး', 'တော့', 'တည့်တည့်', 'သွား', 'ပါ'], ['ခင်ဗျား', 'ငါး', 'မိနစ်', 'လောက်ကြာ', 'လိမ့်', 'မယ်']]
+```
+
+## Folder Structure
+
+```
+./myTokenize/
+├── CRFTokenizer
+│   └── wordseg_c2_crf.crfsuite
+├── SentencePiece
+│   ├── bpe_sentencepiece_model.model
+│   ├── bpe_sentencepiece_model.vocab
+│   ├── unigram_sentencepiece_model.model
+│   └── unigram_sentencepiece_model.vocab
+├── Tokenizer.py
+└── myWord
+    ├── phrase_segment.py
+    └── word_segment.py
+```
+
+## Dependencies
+
+- Python 3.7+
+- TensorFlow
+- SentencePiece
+- python-crfsuite (imported as `pycrfsuite`)
+- NumPy
+
+## License
+
+This project is licensed under the MIT License. See the [LICENSE](LICENSE) file for details.
+
+## Authors
+
+- **Ye Kyaw Thu**
+- **Thura Aung**
diff --git a/myTokenize/CRFTokenizer/wordseg_c2_crf.crfsuite b/myTokenize/CRFTokenizer/wordseg_c2_crf.crfsuite
diff --git a/myTokenize/SentencePiece/bpe_sentencepiece_model.model b/myTokenize/SentencePiece/bpe_sentencepiece_model.model