From fdd32107e64e8e7014247247799713da00069d6a Mon Sep 17 00:00:00 2001
From: eyaler
Date: Sun, 30 Jan 2022 03:08:38 +0200
Subject: [PATCH] revert api; add hash test; add optional number references

---
 README.md           |  3 +-
 hebrew_tokenizer.py | 68 ++++++++++++++++++++++++++++-----------------
 2 files changed, 45 insertions(+), 26 deletions(-)

diff --git a/README.md b/README.md
index aeb4015..c9b8380 100644
--- a/README.md
+++ b/README.md
@@ -1,5 +1,5 @@
 # hebrew_tokenizer
-A field-tested Hebrew tokenizer for dirty texts (bible, twitter, opensubs, oscar, cc100, mc4) focused on multi-word expression extraction.
+A field-tested Hebrew tokenizer for dirty texts (ben-yehuda project, bible, cc100, mc4, opensubs, oscar, twitter) focused on multi-word expression extraction.
 
 * Nikud and Teamim are ignored.
 * Punctuation is normalized to ASCII (using unidecode).
@@ -17,3 +17,4 @@ A field-tested Hebrew tokenizer for dirty texts (bible, twitter, opensubs, oscar
 * Line opening hyphens as used in conversation and enumeration, can be ignored by allow_line_opening_hyphens (default=True)
 * Strict mode can enforce the absence of extraneous hebrew letters in the same "clause" (strict=HebTokenizer.CLAUSE), sentence (strict=HebTokenizer.SENTENCE) or line (strict=HebTokenizer.LINE) of the MWE. Use 0 or None to not be strict (default=None).
+* Optionally allow number references with allow_number_refs (default=False).
\ No newline at end of file
diff --git a/hebrew_tokenizer.py b/hebrew_tokenizer.py
index c8a0ba5..7c7bdaf 100644
--- a/hebrew_tokenizer.py
+++ b/hebrew_tokenizer.py
@@ -2,6 +2,7 @@
 
 from functools import partialmethod
+import hashlib
 import re
 
 from unidecode import unidecode_expect_nonascii
 
@@ -30,7 +31,7 @@ def nla(s):
 
 
 class HebTokenizer:
-    """A field-tested Hebrew tokenizer for dirty texts (bible, twitter, opensubs, oscar, cc100, mc4) focused on multi-word expression extraction.
+    """A field-tested Hebrew tokenizer for dirty texts (ben-yehuda project, bible, cc100, mc4, opensubs, oscar, twitter) focused on multi-word expression extraction.
 
     Nikud and Teamim are ignored.
     Punctuation is normalized to ASCII (using unidecode).
@@ -48,6 +49,7 @@ class HebTokenizer:
     Line opening hyphens as used in conversation and enumeration, can be ignored by allow_line_opening_hyphens (default=True)
     Strict mode can enforce the absence of extraneous hebrew letters in the same "clause" (strict=HebTokenizer.CLAUSE), sentence (strict=HebTokenizer.SENTENCE) or line (strict=HebTokenizer.LINE) of the MWE. Use 0 or None to not be strict (default=None).
+    Optionally allow number references with allow_number_refs (default=False).
     """
 
     @staticmethod
     def to_final(text):
@@ -66,7 +68,6 @@ def to_final(text):
 
     sofpasuk_pattern = horizontal_space + '*' + '\u05c3' + horizontal_space + '*'
     sofpasuk_regex = re.compile(sofpasuk_pattern)
-
    hebrew_letters = 'א-ת'
     nonfinal_letters = 'אבגדהוזחטיכלמנסעפצקרשת'
     final_letters = to_final.__func__(nonfinal_letters) + 'פ'
@@ -104,16 +105,18 @@ def to_final(text):
     default_max_one_two_char_word_len = 7  # based on Hspell. e.g. שישישיי
     default_max_mwe_hyphens = 1
     default_allow_line_opening_hyphens = True
+    default_allow_number_refs = False
     default_strict = None
     default_bad_final_exceptions = 'לםרבה', 'אנשיםות', 'יוםיום', 'סוףסוף'  # note: these exceptions are only for finding bad finals. the tokenizer will still ignore them
 
-    def __init__(self, max_char_repetition=default_max_char_repetition, max_end_of_word_char_repetition=default_max_end_of_word_char_repetition, allow_mmm=default_allow_mmm, max_one_two_char_word_len=default_max_one_two_char_word_len, max_mwe_hyphens=default_max_mwe_hyphens, allow_line_opening_hyphens=default_allow_line_opening_hyphens):
+    def __init__(self, max_char_repetition=default_max_char_repetition, max_end_of_word_char_repetition=default_max_end_of_word_char_repetition, allow_mmm=default_allow_mmm, max_one_two_char_word_len=default_max_one_two_char_word_len, max_mwe_hyphens=default_max_mwe_hyphens, allow_line_opening_hyphens=default_allow_line_opening_hyphens, allow_number_refs=default_allow_number_refs):
         self.max_char_repetition = max_char_repetition
         self.max_end_of_word_char_repetition = max_end_of_word_char_repetition
         self.allow_mmm = allow_mmm
         self.max_one_two_char_word_len = max_one_two_char_word_len
         self.max_mwe_hyphens = max_mwe_hyphens
         self.allow_line_opening_hyphens = allow_line_opening_hyphens
+        self.allow_number_refs = allow_number_refs
 
         mmm = ''
         neg_rep = ''
@@ -131,7 +134,12 @@ def __init__(self, max_char_repetition=default_max_char_repetition, max_end_of_w
             neg_end_rep = nla('(?P=ref_char0){' + str(max_end_of_word_char_repetition) + '}' + ncg('$|' + ncch))
         if max_one_two_char_word_len:
             short_or_diverse = '(?=' + cch + '{1,' + str(max_one_two_char_word_len) + '}\\b|' + cch + '*(?P<ref_char1>' + cch + ')(?!(?P=ref_char1))(?P<ref_char>' + cch + ')' + cch + '*(?!(?P=ref_char1))(?!(?P=ref_char))' + cch + '+)'
-        self.word_pattern = '(?<!' + cch + ')' + mmm + short_or_diverse + ncg('(?P<ref_char0>' + self.nonfinal_letter_geresh_pattern + ')' + neg_rep + neg_end_rep) + '+' + self.final_letter_geresh_pattern + '(?!\\w)' + nla('[^\\s-]' + cch) + nla('-' + ncg('$|' + ncch))
+        if self.allow_number_refs:
+            forbidden_trailing = '[^\\W\\d]'
+        else:
+            forbidden_trailing = '\\w'
+
+        self.word_pattern = '(?<!' + cch + ')' + mmm + short_or_diverse + ncg('(?P<ref_char0>' + self.nonfinal_letter_geresh_pattern + ')' + neg_rep + neg_end_rep) + '+' + self.final_letter_geresh_pattern + '(?!' + forbidden_trailing + ')' + nla('[^\\s-]' + cch) + nla('-' + ncg('$|' + ncch))
 
     reuse_cnt = {}
 
@@ -160,8 +168,8 @@ def remove_diacritics(cls, text):
         return cls.hebrew_diacritics_regex.sub('', text)
 
     @classmethod
-    def sanitize(cls, text, is_without_diacritics=False, bible_makaf=False):
-        if not is_without_diacritics:
+    def sanitize(cls, text, remove_diacritics=True, bible_makaf=False):
+        if remove_diacritics:
             text = cls.remove_diacritics(text)
         if bible_makaf:
             text = text.replace('\u05be', ' ')  # for biblical texts makaf is a taam and does not signify hyphenation
@@ -170,8 +178,8 @@ def sanitize(cls, text, is_without_diacritics=False, bible_makaf=False):
         return cls.non_hebrew_letters_regex.sub(lambda x: unidecode_expect_nonascii(x.group(), errors='preserve'), text)
 
     @classmethod
-    def find_bad_final(cls, text, is_without_diacritics=False, exceptions=default_bad_final_exceptions, allow_hashtag=True, ret_all=False):  # this could help detect text containing badly fused words or lines
-        if not is_without_diacritics:
+    def find_bad_final(cls, text, remove_diacritics=True, exceptions=default_bad_final_exceptions, allow_hashtag=True, ret_all=False):  # this could help detect text containing badly fused words or lines
+        if remove_diacritics:
             text = cls.remove_diacritics(text)
         if allow_hashtag:
             text = cls.hashtag_regex.sub('', text)
@@ -181,36 +189,36 @@ def find_bad_final(cls, text, is_without_diacritics=False, exceptions=default_ba
             return cls.bad_final_regex.findall(text)
         return cls.bad_final_regex.search(text)
 
-    def is_word(self, text, is_sanitized=False):
-        if not is_sanitized:
+    def is_word(self, text, sanitize=True):
+        if sanitize:
             text = self.sanitize(text)
         return bool(self.word_regex.fullmatch(text))
 
-    def get_words(self, text, is_sanitized=False, iterator=False):
-        if not is_sanitized:
+    def get_words(self, text, sanitize=True, iterator=False):
+        if sanitize:
             text = self.sanitize(text)
         result = (match.group() for match in self.word_regex.finditer(text))
         if not iterator:
             result = list(result)
         return result
 
-    def has_word(self, text, is_sanitized=False):
-        for _ in self.get_words(text, is_sanitized=is_sanitized, iterator=True):
+    def has_word(self, text, sanitize=True):
+        for _ in self.get_words(text, sanitize=sanitize, iterator=True):
             return True
         return False
 
-    def is_mwe(self, text, is_sanitized=False):
-        if not is_sanitized:
+    def is_mwe(self, text, sanitize=True):
+        if sanitize:
             text = self.sanitize(text)
         return bool(self.mwe_regex.fullmatch(text))
 
-    def is_word_or_mwe(self, text, is_sanitized=False):
-        if not is_sanitized:
+    def is_word_or_mwe(self, text, sanitize=True):
+        if sanitize:
             text = self.sanitize(text)
-        return self.is_word(text, is_sanitized=True) or self.is_mwe(text, is_sanitized=True)
+        return self.is_word(text, sanitize=False) or self.is_mwe(text, sanitize=False)
 
-    def get_mwe(self, text, is_sanitized=False, strict=default_strict, iterator=False):
-        if not is_sanitized:
+    def get_mwe(self, text, sanitize=True, strict=default_strict, iterator=False):
+        if sanitize:
             text = self.sanitize(text)
         if self.allow_line_opening_hyphens:
             text = self.line_opening_hyphen_regex.sub('\\1 ', text)
@@ -229,16 +237,16 @@ def get_mwe(self, text, is_sanitized=False, strict=default_strict, iterator=Fals
             result = list(result)
         return result
 
-    def get_mwe_words(self, text, is_sanitized=False, strict=default_strict, flat=False, iterator=False):
-        result = (self.mwe_words_sep_regex.split(mwe) for mwe in self.get_mwe(text, is_sanitized=is_sanitized, strict=strict))
+    def get_mwe_words(self, text, sanitize=True, strict=default_strict, flat=False, iterator=False):
+        result = (self.mwe_words_sep_regex.split(mwe) for mwe in self.get_mwe(text, sanitize=sanitize, strict=strict))
         if flat:
             result = (word for word_list in result for word in word_list)
         if not iterator:
             result = list(result)
         return result
 
-    def get_mwe_ngrams(self, text, n, is_sanitized=False, strict=default_strict, as_strings=False, flat=False, iterator=False):
-        words = self.get_mwe_words(text, is_sanitized=is_sanitized, strict=strict, flat=False, iterator=iterator)
+    def get_mwe_ngrams(self, text, n, sanitize=True, strict=default_strict, as_strings=False, flat=False, iterator=False):
+        words = self.get_mwe_words(text, sanitize=sanitize, strict=strict, flat=False, iterator=iterator)
         result = ([' '.join(word_list[i : i + n]) if as_strings else tuple(word_list[i : i + n]) for i in range(len(word_list) - n + 1)] for word_list in words if len(word_list) >= n)
         if flat:
             result = (ngram for ngram_list in result for ngram in ngram_list)
@@ -260,9 +268,14 @@ def get_mwe_ngrams(self, text, n, is_sanitized=False, strict=default_strict, as_
 
     text = 'א בְּרֵאשִׁ֖ית בָּרָ֣א אֱלֹהִ֑ים אֵ֥ת הַשָּׁמַ֖יִם וְאֵ֥ת הָאָֽרֶץ. ב וְהָאָ֗רֶץ הָיְתָ֥ה תֹ֙הוּ֙ וָבֹ֔הוּ וְחֹ֖שֶׁךְ עַל־פְּנֵ֣י תְה֑וֹם, וְר֣וּחַ אֱלֹהִ֔ים מְרַחֶ֖פֶת עַל־פְּנֵ֥י הַמָּֽיִם. ג וַיֹּ֥אמֶר אֱלֹהִ֖ים: "יְהִ֣י א֑וֹר", וַֽיְהִי־אֽוֹר. ד וַיַּ֧רְא אֱלֹהִ֛ים אֶת־הָא֖וֹר כִּי־ט֑וֹב, וַיַּבְדֵּ֣ל אֱלֹהִ֔ים בֵּ֥ין הָא֖וֹר וּבֵ֥ין הַחֹֽשֶׁךְ. ה וַיִּקְרָ֨א אֱלֹהִ֤ים ׀ לָאוֹר֙ "י֔וֹם" וְלַחֹ֖שֶׁךְ קָ֣רָא "לָ֑יְלָה", וַֽיְהִי־עֶ֥רֶב וַֽיְהִי־בֹ֖קֶר י֥וֹם אֶחָֽד.'
 
+    output = ''
 
     def print_with_len(lst):
+        global output
+        output += str(lst) + '\n'
         print(lst, len(lst))
 
+    saved_hash = '8aae9ff77125d5e0516b8f869c06f023'
+
     print_with_len(text)
     print_with_len(to_final(text))
     print_with_len(to_nonfinal(text))
@@ -286,3 +299,8 @@ def print_with_len(lst):
     print_with_len(heb_tokenizer.get_mwe_ngrams(text, n=3, as_strings=True))
     print_with_len(heb_tokenizer.get_mwe_ngrams(text, n=3, flat=True))
     print_with_len(heb_tokenizer.get_mwe_ngrams(text, n=3, as_strings=True, flat=True))
+    print_with_len(heb_tokenizer.get_mwe_ngrams(text, n=3, as_strings=True, flat=True))
+
+    myhash = hashlib.md5(output.encode()).hexdigest()
+    assert myhash == saved_hash, myhash
+
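
Reviewer note: a minimal usage sketch of the API after this patch, for
illustration only. The parameter names come from the diff above; the sample
input string is an assumption, not taken from the repo or its tests:

    from hebrew_tokenizer import HebTokenizer

    # The reverted API: flags like sanitize now default to True (do the
    # work), and callers that have already sanitized pass sanitize=False.
    heb_tokenizer = HebTokenizer(allow_number_refs=True)

    # With allow_number_refs=True the trailing lookahead becomes
    # (?![^\W\d]), so a word may be followed directly by a digit, as in a
    # footnote-style number reference. With the default (False) the old
    # (?!\w) behavior is kept and such words are rejected.
    print(heb_tokenizer.get_words('בראשית1'))  # hypothetical input: word + number reference
    print(heb_tokenizer.is_word('בראשית', sanitize=False))  # input already sanitized

The saved_hash assertion added at the bottom of the demo turns the printed
output into a cheap regression test: any behavior change alters the md5 of
the accumulated output and fails the assert, reporting the new hash so it
can be reviewed and updated.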