Merge pull request #33 from Ayushk4/handle_final_periods
Handle final periods
Ayushk4 authored Jun 27, 2019
2 parents 73e7d71 + 9ec8517 commit cc225ea
Showing 3 changed files with 40 additions and 28 deletions.
3 changes: 1 addition & 2 deletions README.md
@@ -70,7 +70,7 @@ The word tokenizers basically assume sentence splitting has already been done.
(To me it seems like a weird historical thing that NLTK has two successive variations on improving the Penn tokenizer, but for now I am matching it and having both. See [[NLTK#2005]](https://github.com/nltk/nltk/issues/2005))

- **Reversible Tokenizer:** (`rev_tokenize` and `rev_detokenize`) This tokenizer splits on punctuation, spaces, and special symbols. The generated tokens can be de-tokenized back to the original string with the `rev_detokenize` function.
- **TokTok Tokenizer:** (`toktok_tokenize`) This tokenizer is a simple, general tokenizer, where the input has one sentence per line; thus only final period is tokenized. Tok-tok has been tested on and gives reasonably good results for English, Persian, Russian, Czech, French, German, Vietnamese, Tajik, and a few others. **(default tokenizer)**
- **TokTok Tokenizer:** (`toktok_tokenize`) This tokenizer is a simple, general tokenizer, where the input has one sentence per line; thus only the final period is tokenized. This is an enhanced version of the [original toktok Tokenizer](https://github.com/jonsafari/tok-tok). It has been tested on and gives reasonably good results for English, Persian, Russian, Czech, French, German, Vietnamese, Tajik, and a few others. **(default tokenizer)**
- **Tweet Tokenizer:** (`tweet_tokenizer`) NLTK's casual tokenizer, solely designed for tweets. Apart from being Twitter-specific, this tokenizer has good handling for emoticons and other web aspects like support for HTML entities. This closely matches NLTK's `nltk.tokenize.TweetTokenizer`.
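For orientation, here is a minimal usage sketch of the tokenizers listed above, assuming this is the WordTokenizers.jl package and that it exports these functions as the README implies; the sample sentence and the round-trip check are illustrative, not taken verbatim from the repository.

```julia
using WordTokenizers

text = "The turtle is found solely on the northern coast of Australia."

# Default tokenizer (TokTok); only the final period of the line is split off.
toktok_tokenize(text)

# Reversible tokenization: the README states the tokens can be de-tokenized
# back to the pre-tokenization string, so this round trip should hold.
tokens = rev_tokenize(text)
rev_detokenize(tokens) == text

# Tweet-aware tokenization handles emoticons and HTML entities.
tweet_tokenizer("This is so cool :-) &gt;_&lt;")
```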


@@ -105,7 +105,6 @@ julia> tokenize.(split_sentences(text))
SubString{String}["The", "Flatback", "turtle", "is", "found", "solely", "on", "the", "northern", "coast", "of", "Australia", "."]
```


## Experimental API
I am trying out an experimental API
where these are added as dispatches to `Base.split`
41 changes: 22 additions & 19 deletions src/words/TokTok.jl
@@ -72,10 +72,11 @@ const rules_replaces = Tuple(Iterators.flatten([


"""
totok_tokenize(instring::AstractString)
toktok_tokenize(instring::AbstractString)
This tokenizer is a simple, general tokenizer, where the input has one sentence per line; thus only the final period is tokenized.
Tok-tok has been tested on and gives reasonably good results for English, Persian, Russian, Czech, French, German, Vietnamese,
This is an enhanced version of the [original toktok Tokenizer](https://github.com/jonsafari/tok-tok).
It has been tested on and gives reasonably good results for English, Persian, Russian, Czech, French, German, Vietnamese,
Tajik, and a few others.
"""
function toktok_tokenize(instring::AbstractString)
@@ -122,36 +123,38 @@ Don't tokenize period unless it ends the line (FINAL_PERIOD_2)
"""
function handle_final_periods(ts::TokenBuffer)
effective_end = length(ts.input)

# handles spaces
while effective_end >=1 && isspace(ts.input[effective_end])
effective_end -= 1
end

# handles FINAL_PERIOD_1 = r"(?<!\.)\.$"
if length(ts.input) >= 2 && ts.input[end] == '.' && ts.input[end-1] != '.'
if effective_end > 1 && length(ts.input) >= 2 && ts.input[effective_end] == '.' && ts.input[effective_end-1] != '.'
effective_end -= 1
return effective_end, ".", nothing
end

# handles FINAL_PERIOD_2 = r"(?<!\.)\.\s*(["'’»›”]) *$"
if ts.input[end] in ('\"', '\'', '’', '»', '›', '”') || isspace(ts.input[end])
while effective_end >=1 && isspace(ts.input[effective_end] )
effective_end -= 1
end
if effective_end > 1 && ts.input[effective_end] in ('\"', '\'', '’', '»', '›', '”')
token_position = effective_end
effective_end -= 1

if effective_end > 1 && ts.input[effective_end] in ('\"', '\'', '’', '»', '›', '”')
token_position = effective_end
while effective_end >=1 && isspace(ts.input[effective_end])
effective_end -= 1
end

while effective_end >=1 && isspace(ts.input[effective_end] )
if effective_end > 1 && ts.input[effective_end] == '.'
if effective_end >= 2 && ts.input[effective_end - 1] == '.'
return token_position + 1, nothing, nothing # No use iterating over spaces again.
else
effective_end -= 1
end

if effective_end > 1 && ts.input[effective_end] == '.'
if effective_end >= 2 && ts.input[effective_end - 1] == '.'
return length(ts.input), nothing, nothing
else
effective_end -= 1
return effective_end, ".",string(ts.input[token_position])
end
return effective_end, ".", string(ts.input[token_position])
end
end
return effective_end, string(ts.input[token_position]), nothing
end

return effective_end, nothing, nothing
end
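To make the intended behaviour concrete, here is a small sketch of what the reworked final-period handling should produce; the inputs and expected token vectors come from the test set added in test/toktok.jl below, and the only assumption is that `toktok_tokenize` is exported at the top level (as in the WordTokenizers.jl package).

```julia
using WordTokenizers

# FINAL_PERIOD_1: trailing spaces no longer hide the sentence-final period.
toktok_tokenize("This is a sentence. ")
# expected (from the new tests): ["This", "is", "a", "sentence", "."]

# FINAL_PERIOD_2: a period followed by a closing quote and stray spaces is
# split into separate "." and "\"" tokens.
toktok_tokenize("They say, \"This is a sentence . \" ")
# expected (from the new tests): ["They", "say", ",", "\"", "This", "is", "a", "sentence", ".", "\""]
```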

24 changes: 17 additions & 7 deletions test/toktok.jl
@@ -156,7 +156,7 @@ french_tokenized = ["Maître", "Corbeau", ",", "sur", "un", "arbre", "perché",
"Monsieur", ",", "Apprenez", "que", "tout", "flatteur", "Vit", "aux", "dépens", "de", "celui", "qui", "l", "",
"écoute", ":", "Cette", "leçon", "vaut", "bien", "un", "fromage", ",", "sans", "doute.", "»", "Le", "Corbeau",
",", "honteux", "et", "confus", ",", "Jura", ",", "mais", "un", "peu", "tard", ",", "qu", "", "on", "ne", "l",
"", "y", "prendrait", "plus.", ]
"", "y", "prendrait", "plus", "."]


old_english_tokenized = ["An.", "M.LXVI.", "On", "þyssum", "geare", "man", "halgode", "þet", "mynster", "æt",
@@ -170,7 +170,7 @@ old_english_tokenized = ["An.", "M.LXVI.", "On", "þyssum", "geare", "man", "hal
"gefeaht", "ear", "þan", "þe", "his", "here", "com", "eall", "7", "þær", "he", "feoll", "7", "his", "twægen",
"gebroðra", "Gyrð", "7", "Leofwine", "and", "Willelm", "þis", "land", "geeode", "7", "com", "to", "Westmynstre",
"7", "Ealdred", "arceb", "hine", "to", "cynge", "gehalgode", "7", "menn", "guldon", "him", "gyld", "7", "gislas",
"sealdon", "7", "syððan", "heora", "land", "bohtan.", ]
"sealdon", "7", "syððan", "heora", "land", "bohtan", "."]


russian_tokenized = ["Лорем", "ипсум", "долор", "сит", "амет", ",", "яуи", "ин", "реяуе", "пертинациа", ",",
@@ -180,7 +180,7 @@ russian_tokenized = ["Лорем", "ипсум", "долор", "сит", "аме
"пертинах", "малуиссет", "ин", "усу.", "Еам", "еу", "еиус", "поссе.", "Сеа", "еи", "малорум", "ассентиор.", "Алии",
"мутат", "персиус", "усу", "но", ",", "цу", "вих", "ирацундиа", "цонсететур", ",", "цоррумпит", "форенсибус", "диссентиунт",
"но", "иус.", "Ессе", "цибо", "нонумес", "ин", "сеа.", "Доминг", "еурипидис", "модератиус", "сеа", "ут", ",",
"алии", "иллуд", "граецис", "ет", "сед.", "Цу", "путент", "десеруиссе", "еам", ".",]
"алии", "иллуд", "граецис", "ет", "сед.", "Цу", "путент", "десеруиссе", "еам", "."]


spanish_tokenized = ["Mentiría", "si", "dijera", "que", "era", "del", "todo", "nuevo", "el", "sentimiento", "de",
@@ -195,7 +195,7 @@ spanish_tokenized = ["Mentiría", "si", "dijera", "que", "era", "del", "todo", "
"lo", "irremediable.", "Despertarse", ",", "ventearse", "como", "un", "perro", "la", "vida", ",", "ocuparse",
"de", "sus", "asuntillos", ",", "sacar", "provecho", "de", "ellos", ",", "comer", ",", "beber", ",", "dormir.",
"Ahora", ",", "sólo", "ahora", ",", "cuando", "estaba", "de", "verdad", "solo", ",", "sabía", "que", "la", "vida",
"se", "escapa", "por", "las", "buenas", ",", "corre", "mucho", ]
"se", "escapa", "por", "las", "buenas", ",", "corre", "mucho"]


farsi_tokenized = ["مادهٔ", "بیست", "و", "ششم", "1", ")", "هر", "کس", "حق", "دارد", "که",
@@ -215,7 +215,7 @@ farsi_tokenized = ["مادهٔ", "بیست", "و", "ششم", "1", ")", "هر", "
"دارد", "آزادانه", "در", "زندگی", "فرهنگی", "اجتما", "عی", "شرکت", "کند", "،", "از",
"فنون", "و", "هنرها", "متمتع", "گردد", "و", "در", "پیشرفت", "علمی", "و", "فوائد", "آن",
"سهیم", "باشد.", "2", ")", "هر", "کس", "حق", "دارد", "از", "حمایت", "منافع", "معنوی",
"و", "مادی", "آثار", "علمی", "،", "فرهنگی", "یا", "هنری", "خود", "برخوردار", "شود", ".", ]
"و", "مادی", "آثار", "علمی", "،", "فرهنگی", "یا", "هنری", "خود", "برخوردار", "شود", "."]


chez_tokenized = ["Článek", "26", "Každý", "má", "právo", "na", "vzdělání.", "Vzdělání", "nechť", "je", "bezplatné",
@@ -230,7 +230,7 @@ chez_tokenized = ["Článek", "26", "Každý", "má", "právo", "na", "vzdělán
"", "právo", "svobodně", "se", "účastnit", "kulturního", "života", "společnosti", ",", "úžívat", "plodů",
"umění", "a", "podílet", "se", "na", "vědeckém", "pokroku", "a", "jeho", "výtěžcích.", "Každý", "", "právo",
"na", "ochranu", "morálních", "a", "materiálních", "zájmů", ",", "které", "vyplývají", "z", "jeho", "vědecké",
",", "literární", "nebo", "umělecké", "tvorby.", ]
",", "literární", "nebo", "umělecké", "tvorby", "."]


vietnamese_tokenized = ["Điều", "26", ":", "1", ")", "Mọi", "người", "đều", "có", "quyền", "được", "học", "hành.",
@@ -250,7 +250,7 @@ vietnamese_tokenized = ["Điều", "26", ":", "1", ")", "Mọi", "người", "
"xẻ", "những", "thành", "tựu", "và", "lợi", "ích", "của", "tiến", "bộ", "khoa", "học.", "2", ")", "Mọi",
"người", "đều", "có", "quyền", "được", "bảo", "hộ", "đối", "với", "những", "quyền", "lợi", "về", "vật",
"chất", "và", "tinh", "thần", "xuất", "phát", "từ", "công", "trình", "khoa", "học", ",", "văn", "học",
"và", "nhgệ", "thuật", "mà", "người", "đó", "là", "tác", "giả.", ]
"và", "nhgệ", "thuật", "mà", "người", "đó", "là", "tác", "giả", "."]



@@ -284,6 +284,16 @@ end
tokenized = ["1", ")", "example", "sentence.", "2", ")", "example", "sentence", ""]
@test tokenized == toktok_tokenize(str)
end

@testset "spaces in final period" begin
str = "This is a sentence. "
tokenized = [ "This", "is", "a", "sentence", "."]
@test tokenized == toktok_tokenize(str)

str = "They say, \"This is a sentence . \" "
tokenized = [ "They", "say", ",", "\"", "This", "is", "a", "sentence", ".", "\""]
@test tokenized == toktok_tokenize(str)
end
end


