From 360c72e60a51c61881d53a2babdadf16072883d9 Mon Sep 17 00:00:00 2001 From: bgo-eiu <100172442+bgo-eiu@users.noreply.github.com> Date: Sun, 26 Mar 2023 20:39:05 -0400 Subject: [PATCH 1/4] add Perso-Arabic punctuation to filter --- corpora-parse.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/corpora-parse.py b/corpora-parse.py index 44519c9..698a6f4 100644 --- a/corpora-parse.py +++ b/corpora-parse.py @@ -34,6 +34,8 @@ b"\xe2\x80\x9c", b"\xe2\x80\x9d", b"\xe2\x80\x9e", b"\xe2\x80\x9f", # Devanagari danda and double danda b"\xe0\xa5\xa4", b"\xe0\xa5\xa5", + # Perso-Arabic punctuation + b"\xdb\x94\x20", b"\xd8\x8C\x20", b"\xd8\x9b\x20", b"\xd8\x9f\x20", b"\xc2\xbb\x20", b"\xc2\xab\x20", b"\xd9\xaa\x20", b"\xe0\xa5\xa4", ]: line = line.replace(c, b" ") words = line.split() From a1ef8961064bd73fd0e501771f467ae5d9aa2c97 Mon Sep 17 00:00:00 2001 From: bgo-eiu <100172442+bgo-eiu@users.noreply.github.com> Date: Sun, 26 Mar 2023 21:07:45 -0400 Subject: [PATCH 2/4] add pa, pnb, sd, ur to language data --- language-data.json | 26 +++++++++++++++++++++++++- 1 file changed, 25 insertions(+), 1 deletion(-) diff --git a/language-data.json b/language-data.json index 526cf84..7f00ac7 100644 --- a/language-data.json +++ b/language-data.json @@ -225,6 +225,24 @@ "infopage": "https://wortschatz.uni-leipzig.de/en/download/Norwegian%20Nynorsk", "source": "unileipzig" }, + "pa": { + "qid": "Q58635", + "remotefile": "pan_wikipedia_2021_300K.tar.gz", + "infopage": "https://wortschatz.uni-leipzig.de/en/download/Panjabi", + "source": "unileipzig" + }, + "pnb": { + "qid": "Q58635", + "remotefile": "pnb_wikipedia_2021_300K.tar.gz", + "infopage": "https://wortschatz.uni-leipzig.de/en/download/Western%20Panjabi", + "source": "unileipzig" + }, + "sd": { + "qid": "Q33997", + "remotefile": "snd_wikipedia_2021_100K.tar.gz", + "infopage": "https://wortschatz.uni-leipzig.de/en/download/Sindhi", + "source": "unileipzig" + }, "se": { "qid": "Q33947", "remotefile": "sme-no_web_2013_10K.tar.gz", @@ 
-242,6 +260,12 @@ "remotefile": "tgk_wikipedia_2016_100K.tar.gz", "infopage": "https://wortschatz.uni-leipzig.de/en/download/Tajik", "source": "unileipzig" - } + }, + "ur": { + "qid": "Q11051", + "remotefile": "urd_wikipedia_2021_300K.tar.gz", + "infopage": "https://wortschatz.uni-leipzig.de/en/download/Urdu', + "source": "unileipzig" + } } From dfd84d64e6c54c8752712dbd5ff7ac565aab4e3a Mon Sep 17 00:00:00 2001 From: bgo-eiu <100172442+bgo-eiu@users.noreply.github.com> Date: Sun, 26 Mar 2023 21:08:18 -0400 Subject: [PATCH 3/4] fix quotation mark --- language-data.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/language-data.json b/language-data.json index 7f00ac7..28450e2 100644 --- a/language-data.json +++ b/language-data.json @@ -264,7 +264,7 @@ "ur": { "qid": "Q11051", "remotefile": "urd_wikipedia_2021_300K.tar.gz", - "infopage": "https://wortschatz.uni-leipzig.de/en/download/Urdu', + "infopage": "https://wortschatz.uni-leipzig.de/en/download/Urdu", "source": "unileipzig" } From 1db2106ca4d5dcce9839875eec1c3490e3a41c40 Mon Sep 17 00:00:00 2001 From: bgo-eiu <100172442+bgo-eiu@users.noreply.github.com> Date: Fri, 7 Apr 2023 16:01:14 -0400 Subject: [PATCH 4/4] Fix Arabic punctuation UTF-8 code units --- corpora-parse.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/corpora-parse.py b/corpora-parse.py index 698a6f4..bb2ad9d 100644 --- a/corpora-parse.py +++ b/corpora-parse.py @@ -35,7 +35,7 @@ # Devanagari danda and double danda b"\xe0\xa5\xa4", b"\xe0\xa5\xa5", # Perso-Arabic punctuation - b"\xdb\x94\x20", b"\xd8\x8C\x20", b"\xd8\x9b\x20", b"\xd8\x9f\x20", b"\xc2\xbb\x20", b"\xc2\xab\x20", b"\xd9\xaa\x20", b"\xe0\xa5\xa4", + b"\xdb\x94", b"\xd8\x8C", b"\xd8\x9b", b"\xd8\x9f", b"\xc2\xbb", b"\xc2\xab", b"\xd9\xaa", ]: line = line.replace(c, b" ") words = line.split()