-
Notifications
You must be signed in to change notification settings - Fork 1
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
1 parent
0fcafe2
commit 5a6e96b
Showing
15 changed files
with
21,066 additions
and
2 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,236 @@ | ||
{ | ||
"cells": [ | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 1, | ||
"id": "d5607014-b864-4018-be4d-2d3e12e71c0c", | ||
"metadata": {}, | ||
"outputs": [ | ||
{ | ||
"name": "stdout", | ||
"output_type": "stream", | ||
"text": [ | ||
"Downloading myWord dictionary ...\n", | ||
"Download Completed.\n", | ||
"['မြန်မာ', 'နိုင်ငံ', '။']\n" | ||
] | ||
} | ||
], | ||
"source": [ | ||
"from myTokenize import WordTokenizer\n", | ||
"\n", | ||
"tokenizer = WordTokenizer(engine=\"myWord\") # Use \"myWord\", \"CRF\", or \"LSTM\"\n", | ||
"words = tokenizer.tokenize(\"မြန်မာနိုင်ငံ။\")\n", | ||
"print(words) # ['မြန်မာ', 'နိုင်ငံ', '။']" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 2, | ||
"id": "84961933-5255-4988-9b48-2a898498db4d", | ||
"metadata": {}, | ||
"outputs": [ | ||
{ | ||
"name": "stdout", | ||
"output_type": "stream", | ||
"text": [ | ||
"Downloading neural network models ...\n", | ||
"LFS Not Found at Models/myWseg-s2-bilstm.h5\n", | ||
"Download Completed.\n" | ||
] | ||
}, | ||
{ | ||
"name": "stderr", | ||
"output_type": "stream", | ||
"text": [ | ||
"WARNING:absl:At this time, the v2.11+ optimizer `tf.keras.optimizers.Adam` runs slowly on M1/M2 Macs, please use the legacy Keras optimizer instead, located at `tf.keras.optimizers.legacy.Adam`.\n", | ||
"WARNING:absl:There is a known slowdown when using v2.11+ Keras optimizers on M1/M2 Macs. Falling back to the legacy Keras optimizer, i.e., `tf.keras.optimizers.legacy.Adam`.\n" | ||
] | ||
}, | ||
{ | ||
"name": "stdout", | ||
"output_type": "stream", | ||
"text": [ | ||
"['မြန်မာ', 'နိုင်ငံ', '။']\n" | ||
] | ||
} | ||
], | ||
"source": [ | ||
"from myTokenize import WordTokenizer\n", | ||
"\n", | ||
"tokenizer = WordTokenizer(engine=\"LSTM\") # Use \"myWord\", \"CRF\", or \"LSTM\"\n", | ||
"words = tokenizer.tokenize(\"မြန်မာနိုင်ငံ။\")\n", | ||
"print(words) # ['မြန်မာ', 'နိုင်ငံ', '။']" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 3, | ||
"id": "5931d157-dd07-4053-b27f-f404b7bdebff", | ||
"metadata": {}, | ||
"outputs": [ | ||
{ | ||
"name": "stdout", | ||
"output_type": "stream", | ||
"text": [ | ||
"['ရွေး', 'ကောက်', 'ပွဲ', 'မှာ', 'နိုင်', 'ထား', 'တဲ့', 'ဒေါ်', 'နယ်', 'ထ', 'ရမ့်']\n" | ||
] | ||
} | ||
], | ||
"source": [ | ||
"from myTokenize import SyllableTokenizer\n", | ||
"\n", | ||
"tokenizer = SyllableTokenizer()\n", | ||
"tokens = tokenizer.tokenize(\"ရွေးကောက်ပွဲမှာနိုင်ထားတဲ့ဒေါ်နယ်ထရမ့်\")\n", | ||
"print(tokens) # ['ရွေး', 'ကောက်', 'ပွဲ', 'မှာ', 'နိုင်', 'ထား', 'တဲ့', 'ဒေါ်', 'နယ်', 'ထ', 'ရ', 'မ့်']" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 4, | ||
"id": "5d012c2c-282c-4872-a3a9-ee8ae0147b64", | ||
"metadata": {}, | ||
"outputs": [ | ||
{ | ||
"name": "stdout", | ||
"output_type": "stream", | ||
"text": [ | ||
"['▁ရွေးကောက်ပွဲ', 'မှာ', 'နိုင်', 'ထား', 'တဲ့', 'ဒေါ်', 'နယ်', 'ထ', 'ရမ်', '့']\n" | ||
] | ||
} | ||
], | ||
"source": [ | ||
"from myTokenize import BPETokenizer\n", | ||
"\n", | ||
"tokenizer = BPETokenizer()\n", | ||
"tokens = tokenizer.tokenize(\"ရွေးကောက်ပွဲမှာနိုင်ထားတဲ့ဒေါ်နယ်ထရမ့်\")\n", | ||
"print(tokens) # ['▁ရွေးကောက်ပွဲ', 'မှာ', 'နိုင်', 'ထား', 'တဲ့', 'ဒေါ်', 'နယ်', 'ထ', 'ရ', 'မ့်']" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 5, | ||
"id": "1ce5ab3a-edc3-494c-ab74-7d816b02231f", | ||
"metadata": {}, | ||
"outputs": [ | ||
{ | ||
"name": "stdout", | ||
"output_type": "stream", | ||
"text": [ | ||
"['▁ရွေးကောက်ပွဲ', 'မှာ', 'နိုင်', 'ထား', 'တဲ့', 'ဒေါ', '်', 'နယ်', 'ထ', 'ရမ်', '့']\n" | ||
] | ||
} | ||
], | ||
"source": [ | ||
"from myTokenize import UnigramTokenizer\n", | ||
"\n", | ||
"tokenizer = UnigramTokenizer()\n", | ||
"tokens = tokenizer.tokenize(\"ရွေးကောက်ပွဲမှာနိုင်ထားတဲ့ဒေါ်နယ်ထရမ့်\")\n", | ||
"print(tokens) # ['▁ရွေးကောက်ပွဲ', 'မှာ', 'နိုင်', 'ထား', 'တဲ့', 'ဒေါ်', 'နယ်', 'ထ', 'ရ', 'မ့်']" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 6, | ||
"id": "b37d314a-05df-462b-bfa5-43826c16e607", | ||
"metadata": {}, | ||
"outputs": [ | ||
{ | ||
"name": "stdout", | ||
"output_type": "stream", | ||
"text": [ | ||
"['မြန်မာ', 'နိုင်ငံ', '။']\n" | ||
] | ||
} | ||
], | ||
"source": [ | ||
"from myTokenize import WordTokenizer\n", | ||
"\n", | ||
"tokenizer = WordTokenizer(engine=\"CRF\") # Use \"myWord\", \"CRF\", or \"LSTM\"\n", | ||
"words = tokenizer.tokenize(\"မြန်မာနိုင်ငံ။\")\n", | ||
"print(words) # ['မြန်မာ', 'နိုင်ငံ', '။']" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 7, | ||
"id": "7e739be1-8847-4a19-b72d-719482ff5ae8", | ||
"metadata": {}, | ||
"outputs": [ | ||
{ | ||
"name": "stdout", | ||
"output_type": "stream", | ||
"text": [ | ||
"['ညာ', 'ဘက်', 'ကို', 'ယူ', 'ပြီး', 'တော့', 'တည့်တည့်', 'သွား', 'ပါ']\n" | ||
] | ||
} | ||
], | ||
"source": [ | ||
"from myTokenize import PhraseTokenizer\n", | ||
"\n", | ||
"tokenizer = PhraseTokenizer()\n", | ||
"phrases = tokenizer.tokenize(\"ညာဘက်ကိုယူပြီးတော့တည့်တည့်သွားပါ\")\n", | ||
"print(phrases) # ['ညာဘက်_ကို', 'ယူ', 'ပြီး_တော့', 'တည့်တည့်', 'သွား_ပါ']" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 8, | ||
"id": "77629754-97ea-490e-9fbb-999afd86c401", | ||
"metadata": {}, | ||
"outputs": [ | ||
{ | ||
"name": "stderr", | ||
"output_type": "stream", | ||
"text": [ | ||
"WARNING:absl:At this time, the v2.11+ optimizer `tf.keras.optimizers.Adam` runs slowly on M1/M2 Macs, please use the legacy Keras optimizer instead, located at `tf.keras.optimizers.legacy.Adam`.\n", | ||
"WARNING:absl:There is a known slowdown when using v2.11+ Keras optimizers on M1/M2 Macs. Falling back to the legacy Keras optimizer, i.e., `tf.keras.optimizers.legacy.Adam`.\n" | ||
] | ||
}, | ||
{ | ||
"name": "stdout", | ||
"output_type": "stream", | ||
"text": [ | ||
"[['ညာ', 'ဘက်', 'ကို', 'ယူ', 'ပြီး', 'တော့', 'တည့်တည့်', 'သွား', 'ပါ'], ['ခင်ဗျား', 'ငါး', 'မိနစ်', 'လောက်ကြာ', 'လိမ့်', 'မယ်']]\n" | ||
] | ||
} | ||
], | ||
"source": [ | ||
"from myTokenize import SentenceTokenizer\n", | ||
"\n", | ||
"tokenizer = SentenceTokenizer()\n", | ||
"sentences = tokenizer.tokenize(\"ညာဘက်ကိုယူပြီးတော့တည့်တည့်သွားပါခင်ဗျားငါးမိနစ်လောက်ကြာလိမ့်မယ်\")\n", | ||
"print(sentences) # [['ညာ', 'ဘက်', 'ကို', 'ယူ', 'ပြီး', 'တော့', 'တည့်တည့်', 'သွား', 'ပါ'], ['ခင်ဗျား', 'ငါး', 'မိနစ်', 'လောက်', 'ကြာ', 'လိမ့်', 'မယ်']]" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"id": "5545f19b-ff90-4fee-bada-96b1c1027526", | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [] | ||
} | ||
], | ||
"metadata": { | ||
"kernelspec": { | ||
"display_name": "Python 3 (ipykernel)", | ||
"language": "python", | ||
"name": "python3" | ||
}, | ||
"language_info": { | ||
"codemirror_mode": { | ||
"name": "ipython", | ||
"version": 3 | ||
}, | ||
"file_extension": ".py", | ||
"mimetype": "text/x-python", | ||
"name": "python", | ||
"nbconvert_exporter": "python", | ||
"pygments_lexer": "ipython3", | ||
"version": "3.8.20" | ||
} | ||
}, | ||
"nbformat": 4, | ||
"nbformat_minor": 5 | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
recursive-include myTokenize/myWord *.py |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1 +1,113 @@ | ||
# myTokenize | ||
|
||
**myTokenize** is a Python library that tokenizes Myanmar text into syllables, words, phrases, and sentences. It supports multiple tokenization techniques using rule-based, statistical, and neural network-based approaches. | ||
|
||
## Features | ||
|
||
- **Syllable Tokenization**: Break text into syllables using regex rules. | ||
- **BPE and Unigram Tokenization**: Split text into subword tokens using pretrained SentencePiece BPE and Unigram models. | ||
- **Word Tokenization**: Segment text into words using: | ||
- `myWord`: Dictionary-based tokenization. | ||
- `CRF`: Conditional Random Fields-based tokenization. | ||
- `LSTM`: Neural network-based (BiLSTM) tokenization. | ||
- **Phrase Tokenization**: Identify phrases in text using normalized pointwise mutual information (NPMI). | ||
- **Sentence Tokenization**: Use a BiLSTM model to segment text into sentences. | ||
|
||
## Installation | ||
|
||
1. Clone the repository: | ||
```bash | ||
git clone https://github.com/ThuraAung1601/myTokenize.git | ||
cd myTokenize | ||
``` | ||
|
||
2. Install dependencies: | ||
```bash | ||
pip install -r requirements.txt | ||
``` | ||
|
||
3. Install the library: | ||
```bash | ||
pip install . | ||
``` | ||
|
||
## Usage | ||
|
||
### Syllable Tokenizer | ||
```python | ||
from myTokenize import SyllableTokenizer | ||
|
||
tokenizer = SyllableTokenizer() | ||
syllables = tokenizer.tokenize("မြန်မာနိုင်ငံ။") | ||
print(syllables) # ['မြန်', 'မာ', 'နိုင်', 'ငံ', '။'] | ||
``` | ||
|
||
### BPE Tokenizer | ||
```python | ||
from myTokenize import BPETokenizer | ||
|
||
tokenizer = BPETokenizer() | ||
tokens = tokenizer.tokenize("ရွေးကောက်ပွဲမှာနိုင်ထားတဲ့ဒေါ်နယ်ထရမ့်") | ||
print(tokens) # ['▁ရွေးကောက်ပွဲ', 'မှာ', 'နိုင်', 'ထား', 'တဲ့', 'ဒေါ်', 'နယ်', 'ထ', 'ရမ်', '့'] | ||
``` | ||
|
||
### Word Tokenizer | ||
```python | ||
from myTokenize import WordTokenizer | ||
|
||
tokenizer = WordTokenizer(engine="CRF") # Use "myWord", "CRF", or "LSTM" | ||
words = tokenizer.tokenize("မြန်မာနိုင်ငံ။") | ||
print(words) # ['မြန်မာ', 'နိုင်ငံ', '။'] | ||
``` | ||
|
||
### Phrase Tokenizer | ||
```python | ||
from myTokenize import PhraseTokenizer | ||
|
||
tokenizer = PhraseTokenizer() | ||
phrases = tokenizer.tokenize("ညာဘက်ကိုယူပြီးတော့တည့်တည့်သွားပါ") | ||
print(phrases) # ['ညာဘက်_ကို', 'ယူ', 'ပြီး_တော့', 'တည့်တည့်', 'သွား_ပါ'] | ||
``` | ||
|
||
### Sentence Tokenizer | ||
```python | ||
from myTokenize import SentenceTokenizer | ||
|
||
tokenizer = SentenceTokenizer() | ||
sentences = tokenizer.tokenize("ညာဘက်ကိုယူပြီးတော့တည့်တည့်သွားပါခင်ဗျားငါးမိနစ်လောက်ကြာလိမ့်မယ်") | ||
print(sentences) # [['ညာ', 'ဘက်', 'ကို', 'ယူ', 'ပြီး', 'တော့', 'တည့်တည့်', 'သွား', 'ပါ'], ['ခင်ဗျား', 'ငါး', 'မိနစ်', 'လောက်ကြာ', 'လိမ့်', 'မယ်']] | ||
``` | ||
|
||
## Folder Structure | ||
|
||
``` | ||
./myTokenize/ | ||
├── CRFTokenizer | ||
│ └── wordseg_c2_crf.crfsuite | ||
├── SentencePiece | ||
│ ├── bpe_sentencepiece_model.model | ||
│ ├── bpe_sentencepiece_model.vocab | ||
│ ├── unigram_sentencepiece_model.model | ||
│ └── unigram_sentencepiece_model.vocab | ||
├── Tokenizer.py | ||
└── myWord | ||
├── phrase_segment.py | ||
└── word_segment.py | ||
``` | ||
|
||
## Dependencies | ||
|
||
- Python 3.7+ | ||
- TensorFlow | ||
- SentencePiece | ||
- python-crfsuite (imported as `pycrfsuite`) | ||
- NumPy | ||
|
||
## License | ||
|
||
This project is licensed under the MIT License. See the [LICENSE](LICENSE) file for details. | ||
|
||
## Authors | ||
|
||
- **Ye Kyaw Thu** | ||
- **Thura Aung** |
Binary file not shown.
Binary file not shown.
Oops, something went wrong.