diff --git a/corpora-parse.py b/corpora-parse.py index 44519c9..bb2ad9d 100644 --- a/corpora-parse.py +++ b/corpora-parse.py @@ -34,6 +34,8 @@ b"\xe2\x80\x9c", b"\xe2\x80\x9d", b"\xe2\x80\x9e", b"\xe2\x80\x9f", # Devanagari danda and double danda b"\xe0\xa5\xa4", b"\xe0\xa5\xa5", + # Perso-Arabic punctuation + b"\xdb\x94", b"\xd8\x8C", b"\xd8\x9b", b"\xd8\x9f", b"\xc2\xbb", b"\xc2\xab", b"\xd9\xaa", ]: line = line.replace(c, b" ") words = line.split() diff --git a/language-data.json b/language-data.json index 526cf84..28450e2 100644 --- a/language-data.json +++ b/language-data.json @@ -225,6 +225,24 @@ "infopage": "https://wortschatz.uni-leipzig.de/en/download/Norwegian%20Nynorsk", "source": "unileipzig" }, + "pa": { + "qid": "Q58635", + "remotefile": "pan_wikipedia_2021_300K.tar.gz", + "infopage": "https://wortschatz.uni-leipzig.de/en/download/Panjabi", + "source": "unileipzig" + }, + "pnb": { + "qid": "Q58635", + "remotefile": "pnb_wikipedia_2021_300K.tar.gz", + "infopage": "https://wortschatz.uni-leipzig.de/en/download/Western%20Panjabi", + "source": "unileipzig" + }, + "sd": { + "qid": "Q33997", + "remotefile": "snd_wikipedia_2021_100K.tar.gz", + "infopage": "https://wortschatz.uni-leipzig.de/en/download/Sindhi", + "source": "unileipzig" + }, "se": { "qid": "Q33947", "remotefile": "sme-no_web_2013_10K.tar.gz", @@ -242,6 +260,12 @@ "remotefile": "tgk_wikipedia_2016_100K.tar.gz", "infopage": "https://wortschatz.uni-leipzig.de/en/download/Tajik", "source": "unileipzig" - } + }, + "ur": { + "qid": "Q11051", + "remotefile": "urd_wikipedia_2021_300K.tar.gz", + "infopage": "https://wortschatz.uni-leipzig.de/en/download/Urdu", + "source": "unileipzig" + } }