From 360c72e60a51c61881d53a2babdadf16072883d9 Mon Sep 17 00:00:00 2001
From: bgo-eiu <100172442+bgo-eiu@users.noreply.github.com>
Date: Sun, 26 Mar 2023 20:39:05 -0400
Subject: [PATCH 1/4] add Perso-Arabic punctuation to filter

---
 corpora-parse.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/corpora-parse.py b/corpora-parse.py
index 44519c9..698a6f4 100644
--- a/corpora-parse.py
+++ b/corpora-parse.py
@@ -34,6 +34,8 @@
 			b"\xe2\x80\x9c", b"\xe2\x80\x9d", b"\xe2\x80\x9e", b"\xe2\x80\x9f",
 			# Devanagari danda and double danda
 			b"\xe0\xa5\xa4", b"\xe0\xa5\xa5",
+                        # Perso-Arabic punctuation
+                        b"\xdb\x94\x20", b"\xd8\x8C\x20", b"\xd8\x9b\x20", b"\xd8\x9f\x20", b"\xc2\xbb\x20", b"\xc2\xab\x20", b"\xd9\xaa\x20", b"\xe0\xa5\xa4",
 		]:
 			line = line.replace(c, b" ")
 		words = line.split()

From a1ef8961064bd73fd0e501771f467ae5d9aa2c97 Mon Sep 17 00:00:00 2001
From: bgo-eiu <100172442+bgo-eiu@users.noreply.github.com>
Date: Sun, 26 Mar 2023 21:07:45 -0400
Subject: [PATCH 2/4] add pa, pnb, sd, ur to language data

---
 language-data.json | 26 +++++++++++++++++++++++++-
 1 file changed, 25 insertions(+), 1 deletion(-)

diff --git a/language-data.json b/language-data.json
index 526cf84..7f00ac7 100644
--- a/language-data.json
+++ b/language-data.json
@@ -225,6 +225,24 @@
 		"infopage": "https://wortschatz.uni-leipzig.de/en/download/Norwegian%20Nynorsk",
 		"source": "unileipzig"
 	},
+        "pa": {
+                "qid": "Q58635",
+                "remotefile": "pan_wikipedia_2021_300K.tar.gz",
+                "infopage": "https://wortschatz.uni-leipzig.de/en/download/Panjabi",
+                "source": "unileipzig"
+        },
+        "pnb": {
+                "qid": "Q58635",
+                "remotefile": "pnb_wikipedia_2021_300K.tar.gz",
+                "infopage": "https://wortschatz.uni-leipzig.de/en/download/Western%20Panjabi",
+                "source": "unileipzig"
+        },
+        "sd": {
+                "qid": "Q33997",
+                "remotefile": "snd_wikipedia_2021_100K.tar.gz",
+                "infopage": "https://wortschatz.uni-leipzig.de/en/download/Sindhi",
+                "source": "unileipzig"
+        },
 	"se": {
 		"qid": "Q33947",
 		"remotefile": "sme-no_web_2013_10K.tar.gz",
@@ -242,6 +260,12 @@
 		"remotefile": "tgk_wikipedia_2016_100K.tar.gz",
 		"infopage": "https://wortschatz.uni-leipzig.de/en/download/Tajik",
 		"source": "unileipzig"
-	}
+	},
+        "ur": {
+               "qid": "Q11051",
+               "remotefile": "urd_wikipedia_2021_300K.tar.gz",
+               "infopage": "https://wortschatz.uni-leipzig.de/en/download/Urdu',
+               "source": "unileipzig"
+        }
 
 }

From dfd84d64e6c54c8752712dbd5ff7ac565aab4e3a Mon Sep 17 00:00:00 2001
From: bgo-eiu <100172442+bgo-eiu@users.noreply.github.com>
Date: Sun, 26 Mar 2023 21:08:18 -0400
Subject: [PATCH 3/4] fix quotation mark

---
 language-data.json | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/language-data.json b/language-data.json
index 7f00ac7..28450e2 100644
--- a/language-data.json
+++ b/language-data.json
@@ -264,7 +264,7 @@
         "ur": {
                "qid": "Q11051",
                "remotefile": "urd_wikipedia_2021_300K.tar.gz",
-               "infopage": "https://wortschatz.uni-leipzig.de/en/download/Urdu',
+               "infopage": "https://wortschatz.uni-leipzig.de/en/download/Urdu",
                "source": "unileipzig"
         }
 

From 1db2106ca4d5dcce9839875eec1c3490e3a41c40 Mon Sep 17 00:00:00 2001
From: bgo-eiu <100172442+bgo-eiu@users.noreply.github.com>
Date: Fri, 7 Apr 2023 16:01:14 -0400
Subject: [PATCH 4/4] Fix Arabic punctuation UTF-8 code units

---
 corpora-parse.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/corpora-parse.py b/corpora-parse.py
index 698a6f4..bb2ad9d 100644
--- a/corpora-parse.py
+++ b/corpora-parse.py
@@ -35,7 +35,7 @@
 			# Devanagari danda and double danda
 			b"\xe0\xa5\xa4", b"\xe0\xa5\xa5",
                         # Perso-Arabic punctuation
-                        b"\xdb\x94\x20", b"\xd8\x8C\x20", b"\xd8\x9b\x20", b"\xd8\x9f\x20", b"\xc2\xbb\x20", b"\xc2\xab\x20", b"\xd9\xaa\x20", b"\xe0\xa5\xa4",
+                        b"\xdb\x94", b"\xd8\x8C", b"\xd8\x9b", b"\xd8\x9f", b"\xc2\xbb", b"\xc2\xab", b"\xd9\xaa",
 		]:
 			line = line.replace(c, b" ")
 		words = line.split()