Commit

Add files via upload
ThuraAung1601 authored Nov 22, 2024
1 parent 0fcafe2 commit 5a6e96b
Showing 15 changed files with 21,066 additions and 2 deletions.
236 changes: 236 additions & 0 deletions Demo.ipynb
@@ -0,0 +1,236 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"id": "d5607014-b864-4018-be4d-2d3e12e71c0c",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Downloading myWord dictionary ...\n",
"Download Completed.\n",
"['မြန်မာ', 'နိုင်ငံ', '။']\n"
]
}
],
"source": [
"from myTokenize import WordTokenizer\n",
"\n",
"tokenizer = WordTokenizer(engine=\"myWord\") # Use \"myWord\", \"CRF\", or \"LSTM\"\n",
"words = tokenizer.tokenize(\"မြန်မာနိုင်ငံ။\")\n",
"print(words) # ['မြန်မာ', 'နိုင်ငံ', '။']"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "84961933-5255-4988-9b48-2a898498db4d",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Downloading neural network models ...\n",
"LFS Not Found at Models/myWseg-s2-bilstm.h5\n",
"Download Completed.\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"WARNING:absl:At this time, the v2.11+ optimizer `tf.keras.optimizers.Adam` runs slowly on M1/M2 Macs, please use the legacy Keras optimizer instead, located at `tf.keras.optimizers.legacy.Adam`.\n",
"WARNING:absl:There is a known slowdown when using v2.11+ Keras optimizers on M1/M2 Macs. Falling back to the legacy Keras optimizer, i.e., `tf.keras.optimizers.legacy.Adam`.\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"['မြန်မာ', 'နိုင်ငံ', '။']\n"
]
}
],
"source": [
"from myTokenize import WordTokenizer\n",
"\n",
"tokenizer = WordTokenizer(engine=\"LSTM\") # Use \"myWord\", \"CRF\", or \"LSTM\"\n",
"words = tokenizer.tokenize(\"မြန်မာနိုင်ငံ။\")\n",
"print(words) # ['မြန်မာ', 'နိုင်ငံ', '။']"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "5931d157-dd07-4053-b27f-f404b7bdebff",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"['ရွေး', 'ကောက်', 'ပွဲ', 'မှာ', 'နိုင်', 'ထား', 'တဲ့', 'ဒေါ်', 'နယ်', 'ထ', 'ရမ့်']\n"
]
}
],
"source": [
"from myTokenize import SyllableTokenizer\n",
"\n",
"tokenizer = SyllableTokenizer()\n",
"tokens = tokenizer.tokenize(\"ရွေးကောက်ပွဲမှာနိုင်ထားတဲ့ဒေါ်နယ်ထရမ့်\")\n",
"print(tokens) # ['ရွေး', 'ကောက်', 'ပွဲ', 'မှာ', 'နိုင်', 'ထား', 'တဲ့', 'ဒေါ်', 'နယ်', 'ထ', 'ရ', 'မ့်']"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "5d012c2c-282c-4872-a3a9-ee8ae0147b64",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"['▁ရွေးကောက်ပွဲ', 'မှာ', 'နိုင်', 'ထား', 'တဲ့', 'ဒေါ်', 'နယ်', 'ထ', 'ရမ်', '့']\n"
]
}
],
"source": [
"from myTokenize import BPETokenizer\n",
"\n",
"tokenizer = BPETokenizer()\n",
"tokens = tokenizer.tokenize(\"ရွေးကောက်ပွဲမှာနိုင်ထားတဲ့ဒေါ်နယ်ထရမ့်\")\n",
"print(tokens) # ['▁ရွေးကောက်ပွဲ', 'မှာ', 'နိုင်', 'ထား', 'တဲ့', 'ဒေါ်', 'နယ်', 'ထ', 'ရ', 'မ့်']"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "1ce5ab3a-edc3-494c-ab74-7d816b02231f",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"['▁ရွေးကောက်ပွဲ', 'မှာ', 'နိုင်', 'ထား', 'တဲ့', 'ဒေါ', '်', 'နယ်', 'ထ', 'ရမ်', '့']\n"
]
}
],
"source": [
"from myTokenize import UnigramTokenizer\n",
"\n",
"tokenizer = UnigramTokenizer()\n",
"tokens = tokenizer.tokenize(\"ရွေးကောက်ပွဲမှာနိုင်ထားတဲ့ဒေါ်နယ်ထရမ့်\")\n",
"print(tokens) # ['▁ရွေးကောက်ပွဲ', 'မှာ', 'နိုင်', 'ထား', 'တဲ့', 'ဒေါ်', 'နယ်', 'ထ', 'ရ', 'မ့်']"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "b37d314a-05df-462b-bfa5-43826c16e607",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"['မြန်မာ', 'နိုင်ငံ', '။']\n"
]
}
],
"source": [
"from myTokenize import WordTokenizer\n",
"\n",
"tokenizer = WordTokenizer(engine=\"CRF\") # Use \"myWord\", \"CRF\", or \"LSTM\"\n",
"words = tokenizer.tokenize(\"မြန်မာနိုင်ငံ။\")\n",
"print(words) # ['မြန်မာ', 'နိုင်ငံ', '။']"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "7e739be1-8847-4a19-b72d-719482ff5ae8",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"['ညာ', 'ဘက်', 'ကို', 'ယူ', 'ပြီး', 'တော့', 'တည့်တည့်', 'သွား', 'ပါ']\n"
]
}
],
"source": [
"from myTokenize import PhraseTokenizer\n",
"\n",
"tokenizer = PhraseTokenizer()\n",
"phrases = tokenizer.tokenize(\"ညာဘက်ကိုယူပြီးတော့တည့်တည့်သွားပါ\")\n",
"print(phrases) # ['ညာဘက်_ကို', 'ယူ', 'ပြီး_တော့', 'တည့်တည့်', 'သွား_ပါ']"
]
},
{
"cell_type": "code",
"execution_count": 8,
"id": "77629754-97ea-490e-9fbb-999afd86c401",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"WARNING:absl:At this time, the v2.11+ optimizer `tf.keras.optimizers.Adam` runs slowly on M1/M2 Macs, please use the legacy Keras optimizer instead, located at `tf.keras.optimizers.legacy.Adam`.\n",
"WARNING:absl:There is a known slowdown when using v2.11+ Keras optimizers on M1/M2 Macs. Falling back to the legacy Keras optimizer, i.e., `tf.keras.optimizers.legacy.Adam`.\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"[['ညာ', 'ဘက်', 'ကို', 'ယူ', 'ပြီး', 'တော့', 'တည့်တည့်', 'သွား', 'ပါ'], ['ခင်ဗျား', 'ငါး', 'မိနစ်', 'လောက်ကြာ', 'လိမ့်', 'မယ်']]\n"
]
}
],
"source": [
"from myTokenize import SentenceTokenizer\n",
"\n",
"tokenizer = SentenceTokenizer()\n",
"sentences = tokenizer.tokenize(\"ညာဘက်ကိုယူပြီးတော့တည့်တည့်သွားပါခင်ဗျားငါးမိနစ်လောက်ကြာလိမ့်မယ်\")\n",
"print(sentences) # [['ညာ', 'ဘက်', 'ကို', 'ယူ', 'ပြီး', 'တော့', 'တည့်တည့်', 'သွား', 'ပါ'], ['ခင်ဗျား', 'ငါး', 'မိနစ်', 'လောက်', 'ကြာ', 'လိမ့်', 'မယ်']]"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "5545f19b-ff90-4fee-bada-96b1c1027526",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.20"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
4 changes: 2 additions & 2 deletions LICENSE
@@ -1,6 +1,6 @@
MIT License

- Copyright (c) 2024 June
+ Copyright (c) 2024 Ye Kyaw Thu and Thura Aung

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
@@ -18,4 +18,4 @@ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- SOFTWARE.
+ SOFTWARE.
1 change: 1 addition & 0 deletions MANIFEST.in
@@ -0,0 +1 @@
recursive-include myTokenize/myWord *.py
112 changes: 112 additions & 0 deletions README.md
@@ -1 +1,113 @@
# myTokenize

**myTokenize** is a Python library that tokenizes Myanmar text into syllables, words, phrases, and sentences. It offers several tokenization schemes, spanning rule-based, statistical, and neural network approaches.

## Features

- **Syllable Tokenization**: Break text into syllables using regex rules.
- **BPE and Unigram Tokenization**: Leverage SentencePiece models for tokenization.
- **Word Tokenization**: Segment text into words using:
- `myWord`: Dictionary-based tokenization.
- `CRF`: Conditional Random Fields-based tokenization.
  - `LSTM`: BiLSTM neural network-based tokenization.
- **Phrase Tokenization**: Identify phrases in text using normalized pointwise mutual information (NPMI); a short scoring sketch follows this list.
- **Sentence Tokenization**: Use a BiLSTM model to segment text into sentences.
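
For intuition, the NPMI score used for phrase detection can be sketched in a few lines of Python. This is an illustrative sketch only; the counts, the unsmoothed probability estimates, and the function name are assumptions, not myTokenize internals:

```python
import math

def npmi(count_xy, count_x, count_y, total_bigrams):
    """Normalized PMI of a word pair, in [-1, 1]; higher means a stronger phrase."""
    p_xy = count_xy / total_bigrams          # joint probability of the pair
    p_x = count_x / total_bigrams            # marginal of the first word
    p_y = count_y / total_bigrams            # marginal of the second word
    pmi = math.log(p_xy / (p_x * p_y))       # pointwise mutual information
    return pmi / -math.log(p_xy)             # normalize by -log p(x, y)

# A pair seen 50 times out of 10,000 bigrams, each word seen 60 times overall:
print(npmi(50, 60, 60, 10_000))              # ~0.93 -> strong phrase candidate
```

Word pairs scoring above a chosen threshold are joined into a single phrase token (e.g., `ပြီး_တော့`).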

## Installation

1. Clone the repository:
```bash
git clone https://github.com/ThuraAung1601/myTokenize.git
cd myTokenize
```

2. Install dependencies:
```bash
pip install -r requirements.txt
```

3. Install the library:
```bash
pip install .
```

## Usage

### Syllable Tokenizer
```python
from myTokenize import SyllableTokenizer

tokenizer = SyllableTokenizer()
syllables = tokenizer.tokenize("မြန်မာနိုင်ငံ။")
print(syllables) # ['မြန်', 'မာ', 'နိုင်', 'ငံ', '။']
```

### BPE Tokenizer
```python
from myTokenize import BPETokenizer

tokenizer = BPETokenizer()
tokens = tokenizer.tokenize("ရွေးကောက်ပွဲမှာနိုင်ထားတဲ့ဒေါ်နယ်ထရမ့်")
print(tokens) # ['▁ရွေးကောက်ပွဲ', 'မှာ', 'နိုင်', 'ထား', 'တဲ့', 'ဒေါ်', 'နယ်', 'ထ', 'ရမ်', '့']
```
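
### Unigram Tokenizer

The Unigram SentencePiece model is used the same way; this mirrors the `UnigramTokenizer` cell in `Demo.ipynb`, and the expected output below is the one that notebook prints:

```python
from myTokenize import UnigramTokenizer

tokenizer = UnigramTokenizer()
tokens = tokenizer.tokenize("ရွေးကောက်ပွဲမှာနိုင်ထားတဲ့ဒေါ်နယ်ထရမ့်")
print(tokens) # ['▁ရွေးကောက်ပွဲ', 'မှာ', 'နိုင်', 'ထား', 'တဲ့', 'ဒေါ', '်', 'နယ်', 'ထ', 'ရမ်', '့']
```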

### Word Tokenizer
```python
from myTokenize import WordTokenizer

tokenizer = WordTokenizer(engine="CRF") # Use "myWord", "CRF", or "LSTM"
words = tokenizer.tokenize("မြန်မာနိုင်ငံ။")
print(words) # ['မြန်မာ', 'နိုင်ငံ', '။']
```
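
For intuition about the dictionary-based `myWord` engine, here is a minimal unigram dynamic-programming segmenter in the same spirit. The toy dictionary and its probabilities are assumptions for illustration, not the shipped myWord model:

```python
import math

# Toy unigram "dictionary": word -> relative frequency (illustrative values only).
UNIGRAMS = {"မြန်မာ": 0.004, "နိုင်ငံ": 0.003, "နိုင်": 0.002, "ငံ": 0.0001, "။": 0.01}

def segment(text, max_word_len=10):
    """Pick the split that maximizes the sum of log word probabilities."""
    n = len(text)
    best = [(-math.inf, 0)] * (n + 1)    # best[i] = (score, split) for text[:i]
    best[0] = (0.0, 0)
    for i in range(1, n + 1):
        for j in range(max(0, i - max_word_len), i):
            p = UNIGRAMS.get(text[j:i], 1e-12)   # tiny floor for unseen strings
            score = best[j][0] + math.log(p)
            if score > best[i][0]:
                best[i] = (score, j)
    words, i = [], n                     # backtrack to recover the words
    while i > 0:
        j = best[i][1]
        words.append(text[j:i])
        i = j
    return list(reversed(words))

print(segment("မြန်မာနိုင်ငံ။"))  # ['မြန်မာ', 'နိုင်ငံ', '။'] with this toy dictionary
```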

### Phrase Tokenizer
```python
from myTokenize import PhraseTokenizer

tokenizer = PhraseTokenizer()
phrases = tokenizer.tokenize("ညာဘက်ကိုယူပြီးတော့တည့်တည့်သွားပါ")
print(phrases) # ['ညာဘက်_ကို', 'ယူ', 'ပြီး_တော့', 'တည့်တည့်', 'သွား_ပါ']
```

### Sentence Tokenizer
```python
from myTokenize import SentenceTokenizer

tokenizer = SentenceTokenizer()
sentences = tokenizer.tokenize("ညာဘက်ကိုယူပြီးတော့တည့်တည့်သွားပါခင်ဗျားငါးမိနစ်လောက်ကြာလိမ့်မယ်")
print(sentences) # [['ညာ', 'ဘက်', 'ကို', 'ယူ', 'ပြီး', 'တော့', 'တည့်တည့်', 'သွား', 'ပါ'], ['ခင်ဗျား', 'ငါး', 'မိနစ်', 'လောက်ကြာ', 'လိမ့်', 'မယ်']]
```
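
For reference, a BiLSTM sentence segmenter of this kind is typically framed as per-token tagging (for example, tagging each token `B` for sentence-begin and `O` otherwise). The sketch below shows the general architecture only; the layer sizes, vocabulary size, and tag scheme are assumptions, not the model shipped with myTokenize:

```python
import tensorflow as tf

VOCAB_SIZE, EMB_DIM, HIDDEN, NUM_TAGS = 5000, 64, 128, 2  # tags: B(egin), O(ther)

model = tf.keras.Sequential([
    tf.keras.layers.Embedding(VOCAB_SIZE, EMB_DIM, mask_zero=True),  # token ids -> vectors
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(HIDDEN, return_sequences=True)),
    tf.keras.layers.Dense(NUM_TAGS, activation="softmax"),           # per-token tag scores
])
model.compile(optimizer="adam", loss="sparse_categorical_crossentropy")
```

Sentence boundaries are then read off wherever the tagger predicts a new `B` tag.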

## Folder Structure

```
./myTokenize/
├── CRFTokenizer
│ └── wordseg_c2_crf.crfsuite
├── SentencePiece
│ ├── bpe_sentencepiece_model.model
│ ├── bpe_sentencepiece_model.vocab
│ ├── unigram_sentencepiece_model.model
│ └── unigram_sentencepiece_model.vocab
├── Tokenizer.py
└── myWord
├── phrase_segment.py
└── word_segment.py
```

## Dependencies

- Python 3.7+
- TensorFlow
- SentencePiece
- pycrfsuite
- NumPy

## License

This project is licensed under the MIT License. See the [LICENSE](LICENSE) file for details.

## Authors

- **Ye Kyaw Thu**
- **Thura Aung**
Binary file added myTokenize/CRFTokenizer/wordseg_c2_crf.crfsuite
