Merge pull request #33 from Ayushk4/handle_final_periods
Handle final periods
Ayushk4 authored Jun 27, 2019
2 parents 73e7d71 + 9ec8517 commit cc225ea
Showing 3 changed files with 40 additions and 28 deletions.
3 changes: 1 addition & 2 deletions README.md
@@ -70,7 +70,7 @@ The word tokenizers basically assume sentence splitting has already been done.
(To me it seems like a weird historical thing that NLTK has two successive variations on improving the Penn tokenizer, but for now I am matching it and having both. See [[NLTK#2005]](https://github.com/nltk/nltk/issues/2005))

- **Reversible Tokenizer:** (`rev_tokenize` and `rev_detokenize`) This tokenizer splits on punctuation, spaces, and special symbols. The generated tokens can be de-tokenized back to the original string with the `rev_detokenize` function.
- **TokTok Tokenizer:** (`toktok_tokenize`) This tokenizer is a simple, general tokenizer, where the input has one sentence per line; thus only final period is tokenized. Tok-tok has been tested on and gives reasonably good results for English, Persian, Russian, Czech, French, German, Vietnamese, Tajik, and a few others. **(default tokenizer)**
- **TokTok Tokenizer:** (`toktok_tokenize`) This tokenizer is a simple, general tokenizer, where the input has one sentence per line; thus only the final period is tokenized. This is an enhanced version of the [original toktok Tokenizer](https://github.com/jonsafari/tok-tok). It has been tested on and gives reasonably good results for English, Persian, Russian, Czech, French, German, Vietnamese, Tajik, and a few others. **(default tokenizer)**
- **Tweet Tokenizer:** (`tweet_tokenizer`) NLTK's casual tokenizer, solely designed for tweets. Apart from being Twitter-specific, this tokenizer has good handling for emoticons and other web aspects like support for HTML entities. This closely matches NLTK's `nltk.tokenize.TweetTokenizer`.
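For orientation, here is a minimal usage sketch of the tokenizers listed above, assuming this is the WordTokenizers.jl package and that it exports these functions as the README implies; the sample sentence and the round-trip check are illustrative, not taken verbatim from the repository.

```julia
using WordTokenizers

text = "The turtle is found solely on the northern coast of Australia."

# Default tokenizer (TokTok); only the final period of the line is split off.
toktok_tokenize(text)

# Reversible tokenization: the README states the tokens can be de-tokenized
# back to the pre-tokenization string, so this round trip should hold.
tokens = rev_tokenize(text)
rev_detokenize(tokens) == text

# Tweet-aware tokenization handles emoticons and HTML entities.
tweet_tokenizer("This is so cool :-) &gt;_&lt;")
```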


@@ -105,7 +105,6 @@ julia> tokenize.(split_sentences(text))
SubString{String}["The", "Flatback", "turtle", "is", "found", "solely", "on", "the", "northern", "coast", "of", "Australia", "."]
```


## Experimental API
I am trying out an experimental API
where these are added as dispatches to `Base.split`
41 changes: 22 additions & 19 deletions src/words/TokTok.jl
@@ -72,10 +72,11 @@ const rules_replaces = Tuple(Iterators.flatten([


"""
totok_tokenize(instring::AstractString)
toktok_tokenize(instring::AbstractString)
This tokenizer is a simple, general tokenizer, where the input has one sentence per line; thus only the final period is tokenized.
Tok-tok has been tested on and gives reasonably good results for English, Persian, Russian, Czech, French, German, Vietnamese,
This is an enhanced version of the [original toktok Tokenizer](https://github.com/jonsafari/tok-tok).
It has been tested on and gives reasonably good results for English, Persian, Russian, Czech, French, German, Vietnamese,
Tajik, and a few others.
"""
function toktok_tokenize(instring::AbstractString)
@@ -122,36 +123,38 @@ Don't tokenize period unless it ends the line (FINAL_PERIOD_2)
"""
function handle_final_periods(ts::TokenBuffer)
effective_end = length(ts.input)

# handles spaces
while effective_end >=1 && isspace(ts.input[effective_end])
effective_end -= 1
end

# handles FINAL_PERIOD_1 = r"(?<!\.)\.$"
if length(ts.input) >= 2 && ts.input[end] == '.' && ts.input[end-1] != '.'
if effective_end > 1 && length(ts.input) >= 2 && ts.input[effective_end] == '.' && ts.input[effective_end-1] != '.'
effective_end -= 1
return effective_end, ".", nothing
end

# handles FINAL_PERIOD_2 = r"(?<!\.)\.\s*(["'’»›”]) *$"
if ts.input[end] in ('\"', '\'', '’', '»', '›', '”') || isspace(ts.input[end])
while effective_end >=1 && isspace(ts.input[effective_end] )
effective_end -= 1
end
if effective_end > 1 && ts.input[effective_end] in ('\"', '\'', '’', '»', '›', '”')
token_position = effective_end
effective_end -= 1

if effective_end > 1 && ts.input[effective_end] in ('\"', '\'', '’', '»', '›', '”')
token_position = effective_end
while effective_end >=1 && isspace(ts.input[effective_end])
effective_end -= 1
end

while effective_end >=1 && isspace(ts.input[effective_end] )
if effective_end > 1 && ts.input[effective_end] == '.'
if effective_end >= 2 && ts.input[effective_end - 1] == '.'
return token_position + 1, nothing, nothing # No use iterating over spaces again.
else
effective_end -= 1
end

if effective_end > 1 && ts.input[effective_end] == '.'
if effective_end >= 2 && ts.input[effective_end - 1] == '.'
return length(ts.input), nothing, nothing
else
effective_end -= 1
return effective_end, ".",string(ts.input[token_position])
end
return effective_end, ".", string(ts.input[token_position])
end
end
return effective_end, string(ts.input[token_position]), nothing
end

return effective_end, nothing, nothing
end
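To make the intended behaviour concrete, here is a small sketch of what the reworked final-period handling should produce; the inputs and expected token vectors come from the test set added in test/toktok.jl below, and the only assumption is that `toktok_tokenize` is exported at the top level (as in the WordTokenizers.jl package).

```julia
using WordTokenizers

# FINAL_PERIOD_1: trailing spaces no longer hide the sentence-final period.
toktok_tokenize("This is a sentence. ")
# expected (from the new tests): ["This", "is", "a", "sentence", "."]

# FINAL_PERIOD_2: a period followed by a closing quote and stray spaces is
# split into separate "." and "\"" tokens.
toktok_tokenize("They say, \"This is a sentence . \" ")
# expected (from the new tests): ["They", "say", ",", "\"", "This", "is", "a", "sentence", ".", "\""]
```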

24 changes: 17 additions & 7 deletions test/toktok.jl
@@ -156,7 +156,7 @@ french_tokenized = ["Maître", "Corbeau", ",", "sur", "un", "arbre", "perché",
"Monsieur", ",", "Apprenez", "que", "tout", "flatteur", "Vit", "aux", "dépens", "de", "celui", "qui", "l", "",
"écoute", ":", "Cette", "leçon", "vaut", "bien", "un", "fromage", ",", "sans", "doute.", "»", "Le", "Corbeau",
",", "honteux", "et", "confus", ",", "Jura", ",", "mais", "un", "peu", "tard", ",", "qu", "", "on", "ne", "l",
"", "y", "prendrait", "plus.", ]
"", "y", "prendrait", "plus", "."]


old_english_tokenized = ["An.", "M.LXVI.", "On", "þyssum", "geare", "man", "halgode", "þet", "mynster", "æt",
@@ -170,7 +170,7 @@ old_english_tokenized = ["An.", "M.LXVI.", "On", "þyssum", "geare", "man", "hal
"gefeaht", "ear", "þan", "þe", "his", "here", "com", "eall", "7", "þær", "he", "feoll", "7", "his", "twægen",
"gebroðra", "Gyrð", "7", "Leofwine", "and", "Willelm", "þis", "land", "geeode", "7", "com", "to", "Westmynstre",
"7", "Ealdred", "arceb", "hine", "to", "cynge", "gehalgode", "7", "menn", "guldon", "him", "gyld", "7", "gislas",
"sealdon", "7", "syððan", "heora", "land", "bohtan.", ]
"sealdon", "7", "syððan", "heora", "land", "bohtan", "."]


russian_tokenized = ["Лорем", "ипсум", "долор", "сит", "амет", ",", "яуи", "ин", "реяуе", "пертинациа", ",",
@@ -180,7 +180,7 @@ russian_tokenized = ["Лорем", "ипсум", "долор", "сит", "аме
"пертинах", "малуиссет", "ин", "усу.", "Еам", "еу", "еиус", "поссе.", "Сеа", "еи", "малорум", "ассентиор.", "Алии",
"мутат", "персиус", "усу", "но", ",", "цу", "вих", "ирацундиа", "цонсететур", ",", "цоррумпит", "форенсибус", "диссентиунт",
"но", "иус.", "Ессе", "цибо", "нонумес", "ин", "сеа.", "Доминг", "еурипидис", "модератиус", "сеа", "ут", ",",
"алии", "иллуд", "граецис", "ет", "сед.", "Цу", "путент", "десеруиссе", "еам", ".",]
"алии", "иллуд", "граецис", "ет", "сед.", "Цу", "путент", "десеруиссе", "еам", "."]


spanish_tokenized = ["Mentiría", "si", "dijera", "que", "era", "del", "todo", "nuevo", "el", "sentimiento", "de",
@@ -195,7 +195,7 @@ spanish_tokenized = ["Mentiría", "si", "dijera", "que", "era", "del", "todo", "
"lo", "irremediable.", "Despertarse", ",", "ventearse", "como", "un", "perro", "la", "vida", ",", "ocuparse",
"de", "sus", "asuntillos", ",", "sacar", "provecho", "de", "ellos", ",", "comer", ",", "beber", ",", "dormir.",
"Ahora", ",", "sólo", "ahora", ",", "cuando", "estaba", "de", "verdad", "solo", ",", "sabía", "que", "la", "vida",
"se", "escapa", "por", "las", "buenas", ",", "corre", "mucho", ]
"se", "escapa", "por", "las", "buenas", ",", "corre", "mucho"]


farsi_tokenized = ["مادهٔ", "بیست", "و", "ششم", "1", ")", "هر", "کس", "حق", "دارد", "که",
@@ -215,7 +215,7 @@ farsi_tokenized = ["مادهٔ", "بیست", "و", "ششم", "1", ")", "هر", "
"دارد", "آزادانه", "در", "زندگی", "فرهنگی", "اجتما", "عی", "شرکت", "کند", "،", "از",
"فنون", "و", "هنرها", "متمتع", "گردد", "و", "در", "پیشرفت", "علمی", "و", "فوائد", "آن",
"سهیم", "باشد.", "2", ")", "هر", "کس", "حق", "دارد", "از", "حمایت", "منافع", "معنوی",
"و", "مادی", "آثار", "علمی", "،", "فرهنگی", "یا", "هنری", "خود", "برخوردار", "شود", ".", ]
"و", "مادی", "آثار", "علمی", "،", "فرهنگی", "یا", "هنری", "خود", "برخوردار", "شود", "."]


chez_tokenized = ["Článek", "26", "Každý", "má", "právo", "na", "vzdělání.", "Vzdělání", "nechť", "je", "bezplatné",
@@ -230,7 +230,7 @@ chez_tokenized = ["Článek", "26", "Každý", "má", "právo", "na", "vzdělán
"", "právo", "svobodně", "se", "účastnit", "kulturního", "života", "společnosti", ",", "úžívat", "plodů",
"umění", "a", "podílet", "se", "na", "vědeckém", "pokroku", "a", "jeho", "výtěžcích.", "Každý", "", "právo",
"na", "ochranu", "morálních", "a", "materiálních", "zájmů", ",", "které", "vyplývají", "z", "jeho", "vědecké",
",", "literární", "nebo", "umělecké", "tvorby.", ]
",", "literární", "nebo", "umělecké", "tvorby", "."]


vietnamese_tokenized = ["Điều", "26", ":", "1", ")", "Mọi", "người", "đều", "có", "quyền", "được", "học", "hành.",
@@ -250,7 +250,7 @@ vietnamese_tokenized = ["Điều", "26", ":", "1", ")", "Mọi", "người", "
"xẻ", "những", "thành", "tựu", "và", "lợi", "ích", "của", "tiến", "bộ", "khoa", "học.", "2", ")", "Mọi",
"người", "đều", "có", "quyền", "được", "bảo", "hộ", "đối", "với", "những", "quyền", "lợi", "về", "vật",
"chất", "và", "tinh", "thần", "xuất", "phát", "từ", "công", "trình", "khoa", "học", ",", "văn", "học",
"và", "nhgệ", "thuật", "mà", "người", "đó", "là", "tác", "giả.", ]
"và", "nhgệ", "thuật", "mà", "người", "đó", "là", "tác", "giả", "."]



@@ -284,6 +284,16 @@ end
tokenized = ["1", ")", "example", "sentence.", "2", ")", "example", "sentence", ""]
@test tokenized == toktok_tokenize(str)
end

@testset "spaces in final period" begin
str = "This is a sentence. "
tokenized = [ "This", "is", "a", "sentence", "."]
@test tokenized == toktok_tokenize(str)

str = "They say, \"This is a sentence . \" "
tokenized = [ "They", "say", ",", "\"", "This", "is", "a", "sentence", ".", "\""]
@test tokenized == toktok_tokenize(str)
end
end


