
Commit

don't allow words that may be abbreviations
eyaler committed Feb 16, 2022
1 parent 7171ff7 commit aac2653
Showing 1 changed file with 8 additions and 8 deletions.
hebrew_tokenizer.py (8 additions, 8 deletions)
@@ -61,7 +61,7 @@ def to_final(text):
         return text.translate(to_final_table)

     hebrew_diacritics = '\u0591-\u05bd\u05bf\u05c1\u05c2\u05c4\u05c5\u05c7' # all nikud and teamim except makaf, pasek, sof-pasuk, nun-hafukha
-    hebrew_diacritics_regex = re.compile(cc(hebrew_diacritics))
+    hebrew_diacritics_regex = re.compile(cc(hebrew_diacritics) + '+')
     horizontal_space = ncc('\\S\t\n\r\f\v')
     pasek_pattern = horizontal_space + '*' + '\u05c0' + horizontal_space + '*'
     pasek_regex = re.compile(pasek_pattern)
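
The '+' added here makes the compiled regex consume a whole run of diacritics per match, so stripping nikud costs one substitution per run rather than one per character. A minimal standalone sketch, assuming cc(s) simply wraps s in a character class ('[' + s + ']'); the sample word and output are mine, not from the repository:

import re

hebrew_diacritics = '\u0591-\u05bd\u05bf\u05c1\u05c2\u05c4\u05c5\u05c7'
hebrew_diacritics_regex = re.compile('[' + hebrew_diacritics + ']+')  # '+' matches whole runs

dotted = 'בְּרֵאשִׁ֖ית'
print(hebrew_diacritics_regex.sub('', dotted))  # בראשית - same output as before, fewer substitutions
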
@@ -73,12 +73,12 @@ def to_final(text):
     final_letters = to_final.__func__(nonfinal_letters) + 'פ'
     nonfinal_letters_allowing_geresh = 'גזצ'
     final_letters_allowing_geresh = to_final.__func__(nonfinal_letters_allowing_geresh) + 'צ'
-    geresh = '\''
+    geresh = "'"
     nonfinal_letter_geresh_pattern = ncg(cc(nonfinal_letters_allowing_geresh) + geresh + '|' + cc(nonfinal_letters))
     final_letter_geresh_pattern = ncg(cc(final_letters_allowing_geresh) + geresh + '|' + cc(final_letters))
     non_hebrew_letters_regex = re.compile(ncc(hebrew_letters) + '+')
     bad_final_regex = re.compile(cc(final_chars) + cc(nonfinal_letters))
-    hashtag_regex = re.compile('#[\\w\'"\u05be\u05f3\u05f4-]+') # for performance we will not do unidecode sanitizaion so we accomodate makaf, geresh, gershaim explicitly
+    hashtag_regex = re.compile('#[\\w\'"\u05be\u05f3\u05f4-]+') # for performance we will not do unidecode sanitization so we accommodate makaf, geresh, gershaim explicitly

     sentence_sep = '.?!'
     clause_sep_before_space = sentence_sep + ':;,)"'
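
For orientation: under Python's Unicode matching, \\w already covers Hebrew letters, so the hashtag class only needs to add the punctuation a hashtag may carry. A small sketch of what hashtag_regex accepts (the sample string and expected matches are my own, not from the test suite):

import re

hashtag_regex = re.compile('#[\\w\'"\u05be\u05f3\u05f4-]+')
print(hashtag_regex.findall('בדקו את #תל_אביב ואת #צה"ל'))  # expected: ['#תל_אביב', '#צה"ל']
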
@@ -135,11 +135,11 @@ def __init__(self, max_char_repetition=default_max_char_repetition, max_end_of_w
         if max_one_two_char_word_len:
             short_or_diverse = '(?=' + cch + '{1,' + str(max_one_two_char_word_len) + '}\\b|' + cch + '*(?P<ref_char1>' + cch + ')(?!(?P=ref_char1))(?P<ref_char>' + cch + ')' + cch + '*(?!(?P=ref_char1))(?!(?P=ref_char))' + cch + '+)'
         if self.allow_number_refs:
-            forbidden_trailing = '[^\\W\\d]'
+            forbidden_trailing = "[^\\W\\d]|'"
         else:
-            forbidden_trailing = '\\w'
+            forbidden_trailing = "[\\w']"

-        self.word_pattern = '(?<!' + cch + '[^\\s-])\\b' + short_or_diverse + ncg('(?P<ref_char0>' + self.nonfinal_letter_geresh_pattern + ')' + neg_rep + neg_end_rep) + '+' + self.final_letter_geresh_pattern + '(?!' + forbidden_trailing + ')' + nla('[^\\s-]' + cch) + nla('-' + ncg('$|' + ncch))
+        self.word_pattern = '(?<!' + cch + '[^\\s-])\\b' + short_or_diverse + ncg('(?P<ref_char0>' + self.nonfinal_letter_geresh_pattern + ')' + neg_rep + neg_end_rep) + '+' + self.final_letter_geresh_pattern + nla(forbidden_trailing) + nla('[^\\s-]' + cch) + nla('-' + ncg('$|' + ncch))

         reuse_cnt = {}

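This hunk is the point of the commit: forbidden_trailing now also rejects a geresh (apostrophe) right after the last letter, so tokens that are likely abbreviations are no longer returned as words, and the inline '(?!' + ... + ')' is folded into the nla(...) negative-lookahead helper. A reduced sketch of the new behavior, using a simplified pattern I wrote rather than the library's full word_pattern:

import re

# Simplified stand-in: a run of Hebrew letters that must not be followed
# by a word character or by an apostrophe/geresh (the new forbidden_trailing).
word_re = re.compile("[\u05d0-\u05ea]+(?![\\w'])")

print(word_re.findall("פרופ' כהן"))  # expected: ['כהן'] - פרופ' is skipped as a likely abbreviation

With allow_number_refs=True the guard is "[^\\W\\d]|'" instead: a trailing digit (e.g. a verse or footnote reference) stays legal, but a trailing geresh still disqualifies the word.
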
@@ -269,6 +269,7 @@ def get_mwe_ngrams(self, text, n, sanitize=True, strict=default_strict, as_strin
 text = 'א בְּרֵאשִׁ֖ית בָּרָ֣א אֱלֹהִ֑ים אֵ֥ת הַשָּׁמַ֖יִם וְאֵ֥ת הָאָֽרֶץ. ב וְהָאָ֗רֶץ הָיְתָ֥ה תֹ֙הוּ֙ וָבֹ֔הוּ וְחֹ֖שֶׁךְ עַל־פְּנֵ֣י תְה֑וֹם, וְר֣וּחַ אֱלֹהִ֔ים מְרַחֶ֖פֶת עַל־פְּנֵ֥י הַמָּֽיִם. ג וַיֹּ֥אמֶר אֱלֹהִ֖ים: "יְהִ֣י א֑וֹר", וַֽיְהִי־אֽוֹר. ד וַיַּ֧רְא אֱלֹהִ֛ים אֶת־הָא֖וֹר כִּי־ט֑וֹב, וַיַּבְדֵּ֣ל אֱלֹהִ֔ים בֵּ֥ין הָא֖וֹר וּבֵ֥ין הַחֹֽשֶׁךְ. ה וַיִּקְרָ֨א אֱלֹהִ֤ים ׀ לָאוֹר֙ "י֔וֹם" וְלַחֹ֖שֶׁךְ קָ֣רָא "לָ֑יְלָה", וַֽיְהִי־עֶ֥רֶב וַֽיְהִי־בֹ֖קֶר י֥וֹם אֶחָֽד.'

 output = ''
+
 def print_with_len(lst):
     global output
     output += str(lst) + '\n'
@@ -284,7 +284,7 @@ def print_with_len(lst):
 print_with_len(sanitize(text))
 print_with_len(HebTokenizer.sanitize(text))

-heb_tokenizer = HebTokenizer()
+heb_tokenizer = HebTokenizer(allow_number_refs=True)
 print_with_len(heb_tokenizer.sanitize(text))
 print_with_len(heb_tokenizer.get_words(text))
 print('has_word=', heb_tokenizer.has_word(text))
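
The test script now exercises the tokenizer with allow_number_refs=True, matching the new guard above. A hedged usage sketch (the sample string and expected outputs are my reading of the pattern, not output recorded in the repository):

from hebrew_tokenizer import HebTokenizer

s = 'שלום123'  # a word immediately followed by a digit reference
print(HebTokenizer().get_words(s))                        # expected: [] - a trailing \w (here a digit) disqualifies the word
print(HebTokenizer(allow_number_refs=True).get_words(s))  # expected: ['שלום'] - digits after the word are tolerated
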
@@ -303,4 +303,3 @@ def print_with_len(lst):

 myhash = hashlib.md5(output.encode()).hexdigest()
 assert myhash == saved_hash, myhash
-