From d7a74fa58b2ad6a377c0fe6a8d7ebecb54f1d441 Mon Sep 17 00:00:00 2001 From: Benjamin Rosemann Date: Mon, 9 Nov 2020 17:29:40 +0100 Subject: [PATCH 01/21] First draft of flexible character accuracy --- pytest.ini | 4 - .../flexible_character_accuracy.py | 394 ++++++++++++++++++ .../tests/test_flexible_character_accuracy.py | 291 +++++++++++++ 3 files changed, 685 insertions(+), 4 deletions(-) delete mode 100644 pytest.ini create mode 100644 qurator/dinglehopper/flexible_character_accuracy.py create mode 100644 qurator/dinglehopper/tests/test_flexible_character_accuracy.py diff --git a/pytest.ini b/pytest.ini deleted file mode 100644 index c56273f..0000000 --- a/pytest.ini +++ /dev/null @@ -1,4 +0,0 @@ -[pytest] -markers = - integration: integration tests - serial diff --git a/qurator/dinglehopper/flexible_character_accuracy.py b/qurator/dinglehopper/flexible_character_accuracy.py new file mode 100644 index 0000000..3e4bb93 --- /dev/null +++ b/qurator/dinglehopper/flexible_character_accuracy.py @@ -0,0 +1,394 @@ +""" +Implementation of the flexible character accuracy + +Citation: + Flexible character accuracy measure for reading-order-independent evaluation + C. Clausner, S. Pletschacher, A. Antonacopoulos + Pattern Recognition Letters, Volume 131, March 2020, Pages 390-397 +Link: http://www.primaresearch.org/publications/PRL_Clausner_FlexibleCharacterAccuracy +DOI: https://doi.org/10.1016/j.patrec.2020.02.003 + +Note that we deviated from the original algorithm at some places. +""" + +from collections import Counter +from functools import lru_cache, reduce +from itertools import product, takewhile +from typing import List, NamedTuple, Tuple, Optional + +from . import editops + + +def flexible_character_accuracy(gt: str, ocr: str) -> Tuple[float, List["Match"]]: + """Calculate the flexible character accuracy. + + Reference: contains steps 1-7 of the flexible character accuracy algorithm. + + :param gt: The ground truth text. 
+ :param ocr: The text to compare the ground truth with. + :return: Score between 0 and 1 and match objects. + """ + + best_score = -float('inf') + best_matches = [] + # TODO: this should be configurable + combinations = product(range(15, 31, 5), + range(0, 24, 3), + range(0, 4, 1), + range(0, 6, 1)) + # TODO: place to parallelize the algorithm + for (edit_dist, length_diff, offset, length) in combinations: + coef = Coefficients( + edit_dist=edit_dist, + length_diff=length_diff, + offset=offset, + length=length + ) + # Steps 1 - 6 of the flexible character accuracy algorithm. + matches = match_with_coefficients(gt, ocr, coef) + # Step 7 of the flexible character accuracy algorithm. + score = character_accuracy_for_matches(matches) + if score > best_score: + best_score = score + best_matches = matches + # early breaking: we only need one perfect fit + if best_score >= 1: + break + return best_score, best_matches + + +def match_with_coefficients(gt: str, ocr: str, coef: "Coefficients") -> List["Match"]: + """Match ground truth with ocr and considers a given set of coefficients. + + Reference: contains steps 1 - 6 of the flexible character accuracy algorithm. + + :return: A list of match objects to score and align the texts. + """ + # Steps 1 and 2 of the flexible character accuracy algorithm. + ocr_lines = initialize_lines(ocr) + gt_lines = initialize_lines(gt) + + matches = [] + + # Step 5 of the flexible character accuracy algorithm. + while len(gt_lines) != 0 and len(ocr_lines) != 0: + # Steps 3 and 4 of the flexible character accuracy algorithm. + match = match_longest_gt_lines(gt_lines, ocr_lines, coef) + if match: + matches.append(match) + + # Step 6 of the flexible character accuracy algorithm. 
+ # remaining lines are considered as deletes and inserts + deletes = [distance(line, Part(text="", line=line.line, start=line.start)) + for line in gt_lines] + inserts = [distance(Part(text="", line=line.line, start=line.start), line) + for line in ocr_lines] + + return [*matches, *deletes, *inserts] + + +def match_longest_gt_lines(gt_lines: List["Part"], + ocr_lines: List["Part"], + coef: "Coefficients") -> Optional["Match"]: + """Find the best match for the longest line(s) in ground truth. + + The longest lines in ground truth are matched against lines in ocr to find the + best matching pair. This pair is then either considered a match on full line + + Reference: contains steps 3 and 4 of the flexible character accuracy algorithm. + + :return: Possible match object. + """ + best_score, best_match, best_gt, best_ocr = -float('inf'), None, None, None + if not ocr_lines: + return best_match + + # Step 3 of the flexible character accuracy algorithm (variation). + # Instead of the longest line we take all longest lines with equal length. + length = min(gt_lines[0].length, ocr_lines[0].length) + for gt_line in takewhile(lambda line: line.length >= length, gt_lines): + match, ocr_line = match_gt_line(gt_line, ocr_lines, coef) + score = 0 if not match else character_accuracy(match.dist) + if score > best_score: + best_score, best_match, best_gt, best_ocr = score, match, gt_line, ocr_line + + # Step 4 of the flexible character accuracy algorithm. + # Remove on full match or split. + if best_match and best_gt: + splitted = remove_or_split(best_gt, best_match.gt, gt_lines) + if splitted: + gt_lines.append(best_match.gt) + best_match = None + if best_match and best_ocr: + remove_or_split(best_ocr, best_match.ocr, ocr_lines) + + return best_match + + +def match_gt_line(gt_line: "Part", + ocr_lines: List["Part"], + coef: "Coefficients") -> Tuple[Optional["Match"], + Optional["Part"]]: + """Match the given ground truth line against all the lines in ocr. 
+ + Reference: contains steps 3 of the flexible character accuracy algorithm. + + TODO: Make penalty function configurable? + TODO: Add empty ocr line to avoid having nonesense one character alignments? + + :return: Match object and the matched ocr line. + """ + min_penalty = float('inf') + best_match, best_ocr = None, None + for ocr_line in ocr_lines: + match = match_lines(gt_line, ocr_line) + penalty = calculate_penalty(gt_line, ocr_line, match, coef) + if penalty < min_penalty: + min_penalty, best_match, best_ocr = penalty, match, ocr_line + return best_match, best_ocr + + +def remove_or_split(original: "Part", + match: "Part", + lines: List["Part"]) -> bool: + """Removes the matched line or splits it into parts. + + Reference: contains step 4 of the flexible character accuracy algorithm. + + :return: True if line was splitted. + """ + splitted = False + del lines[lines.index(original)] + if match.length < original.length: + lines.extend(original.split(match)) + # sorting for ocr is not mentioned in the paper, but is used as tie breaking =) + lines.sort(key=lambda x: x.length, reverse=True) + splitted = True + return splitted + + +@lru_cache(maxsize=1000000) +def match_lines(gt_line: "Part", ocr_line: "Part") -> Optional["Match"]: + """Matches two lines searching for a local alignment. + + The shorter line is moved along the longer line + until the editing distance is minimized. + + Reference: see figure 2 in the paper. + + TODO: make distance function configurable? + + :return: Match object if one is found. + """ + min_length = min(gt_line.length, ocr_line.length) + best_match = None + if min_length == 0: + return best_match + length_diff = gt_line.length - ocr_line.length + min_edit_dist = float('inf') + # TODO: handle deletes and replacements by extending the length. 
+ for i in range(0, max(1, length_diff + 1)): + for j in range(0, max(1, -1 * length_diff + 1)): + match = distance(gt_line.substring(rel_start=i, rel_end=i + min_length), + ocr_line.substring(rel_start=j, rel_end=j + min_length)) + edit_dist = score_edit_distance(match) + if edit_dist < min_edit_dist: + min_edit_dist = edit_dist + best_match = match + return best_match + + +@lru_cache(maxsize=1000000) +def distance(gt: "Part", ocr: "Part") -> "Match": + """Calculate the editing distance between the two lines. + + Using the already available `editops()` function with the Levenshtein distance. + + TODO: replace with @cache annotation in Python 3.9 + + :return: Match object containing the lines and the editing operations. + """ + ops = editops(gt.text, ocr.text) + edits = Counter([edit[0] for edit in ops]) + edits["match"] = gt.length - edits["delete"] - edits["replace"] + return Match(gt=gt, ocr=ocr, dist=Distance(**edits), ops=ops) + + +def score_edit_distance(match: "Match") -> int: + """Calculate edit distance for a match. + + Formula: $deletes + inserts + 2 * replacements$ + + :return: Sum of deletes, inserts and replacements. + """ + return match.dist.delete + match.dist.insert + 2 * match.dist.replace + + +def calculate_penalty(gt: "Part", ocr: "Part", match: "Match", + coef: "Coefficients") -> float: + """Calculate the penalty for a given match. + + For details and discussion see Section 3 in doi:10.1016/j.patrec.2020.02.003. + + :return: Penalty for the given match. 
+ """ + min_edit_dist = score_edit_distance(match) + length_diff = abs(gt.length - ocr.length) + substring_length = min(gt.length, ocr.length) + offset = 0.0 + if length_diff > 1: + substring_pos = max(match.gt.start - gt.start, match.ocr.start - ocr.start) + offset = length_diff / 2 - abs(substring_pos - length_diff / 2) + return (min_edit_dist * coef.edit_dist + + length_diff * coef.length_diff + + offset * coef.offset + - substring_length * coef.length) + + +def character_accuracy_for_matches(matches: List["Match"]) -> float: + """Character accuracy of a full text represented by a list of matches. + + See other `character_accuracy` for details. + + """ + agg: Counter = reduce(lambda acc, match: acc + Counter(match.dist._asdict()), + matches, Counter()) + + score = character_accuracy(Distance(**agg)) + return score + + +def character_accuracy(edits: "Distance") -> float: + """Character accuracy calculated by necessary edit operations. + + Edit operations are needed edits to transform one text into another. + + The character accuracy is given by $1 - errors / characters$. + + Errors are replacements, deletes and inserts. + + Note that is is possible to have more errors than characters in which case the + character accuracy turns negative. + + Comparing two empty strings (having no edits) results in a character accuracy of 1. + """ + errors = edits.replace + edits.delete + edits.insert + chars = edits.match + edits.replace + edits.delete + if not chars and not errors: + # comparison of empty strings is considered a full match + score = 1 + else: + score = 1 - errors / chars + return score + + +def initialize_lines(text: str) -> List["Part"]: + """Splits a text into lines and converts them to our line data object. + + The line objects are sorted by their length descending. + + Reference: contains steps 1 and 2 of the flexible character accuracy algorithm. + + :param text: Text to split into lines. + :return: List of sorted line objects. 
+ """ + lines = [Part(text=line, line=i, start=0) + for i, line in enumerate(text.splitlines()) + if len(line) > 0] + lines.sort(key=lambda x: x.length, reverse=True) + return lines + + +def combine_lines(matches: List["Match"]) -> Tuple[str, str]: + """Combines the matches to aligned texts. + + TODO: just hacked, needs tests and refinement. Also missing insert/delete marking. + + :param matches: List of match objects. + :return: the aligned ground truth and ocr as texts. + """ + matches.sort(key=lambda x: x.gt.line + x.gt.start / 10000) + line = 0 + gt, ocr = "", "" + for match in matches: + if match.gt.line > line: + gt += "\n" + ocr += "\n" + line += 1 + gt += match.gt.text + ocr += match.ocr.text + return gt, ocr + + +class Part(NamedTuple): + """Represent a line or part of a line. + + This data object is maintained to be able to reproduce the original text. + """ + text: str = "" + line: int = 0 + start: int = 0 + + @property + def end(self) -> int: + return self.start + self.length + + @property + def length(self) -> int: + return len(self.text) + + def split(self, split: "Part") -> List["Part"]: + """Split the line part by another and returns the remaining parts. + + `abc.split("b")` will return ´["a", "c"]`. + + :param split: The line part we want to use to split. + :return: The parts before and after the split. + """ + rest = [] + if self.start < split.start: + rest.append(self.substring(rel_end=split.start - self.start)) + if split.end < self.end: + rest.append(self.substring(rel_start=split.end - self.start)) + return rest + + def substring(self, rel_start: int = 0, rel_end: int = None) -> "Part": + """Get part of the given line. + + Automatically handles the offset of the line. + Therefore `substring(rel_start=2)` will return `Part[start+rel_start:]`. + + :param rel_start: start relative to the part of the line. + :param rel_end: end relative to the part of the line. + :return: Extracted part of the given part of the line. 
+ """ + text = self.text[rel_start:rel_end] + start = self.start + rel_start + return Part(text=text, line=self.line, start=start) + + +class Distance(NamedTuple): + """Represent distance between two sequences.""" + match: int = 0 + replace: int = 0 + delete: int = 0 + insert: int = 0 + + +class Match(NamedTuple): + """Represent a calculated match between ground truth and the ocr result.""" + gt: "Part" + ocr: "Part" + dist: "Distance" + ops: List + + +class Coefficients(NamedTuple): + """Coefficients to calculate penalty for substrings. + + See Section 3 in doi:10.1016/j.patrec.2020.02.003 + """ + edit_dist: int = 25 + length_diff: int = 20 + offset: int = 1 + length: int = 4 diff --git a/qurator/dinglehopper/tests/test_flexible_character_accuracy.py b/qurator/dinglehopper/tests/test_flexible_character_accuracy.py new file mode 100644 index 0000000..6393e2f --- /dev/null +++ b/qurator/dinglehopper/tests/test_flexible_character_accuracy.py @@ -0,0 +1,291 @@ +""" +Tests for the implementation of the flexible character accuracy + +Citation: + Flexible character accuracy measure for reading-order-independent evaluation + C. Clausner, S. Pletschacher, A. 
Antonacopoulos + Pattern Recognition Letters, Volume 131, March 2020, Pages 390-397 +Link: http://www.primaresearch.org/publications/PRL_Clausner_FlexibleCharacterAccuracy +DOI: 10.1016/j.patrec.2020.02.003 +""" + +import pytest + +from ..flexible_character_accuracy import * + +CASE_ARGS = "gt,ocr,first_line_score,all_line_score" + +SIMPLE_CASES = [ + ("a", "", 0, 0), + ("a", "a", 1, 1), + ("a\nb", "a\nb", 1, 1), + ("a\nb", "b\na", 1, 1), + ("aaa\nbbb\nccc", "ccc\naaa\nbbb", 1, 1), + ("aaa\nbbb\nccc", "aaa\nbbb", 1, 1 - 3 / 9), + ("bbb", "aaa\nbbb\nccc", 1, 1 - 6 / 3), + ("a", "a\nbb\nccc", 1, 1 - 5 / 1), + ("bb", "a\nbb\nccc", 1, 1 - 4 / 2), +] + +COMPLEX_CASES = [ + ("accc", "a\nbb\nccc", 0, 1 - 2 / 4), + ("aaa\nbbb\nccc", "bbb", 1, 1 - 6 / 9), +] + +EXTENDED_CASES = [ + # A: No errors + ((0, 1, 2, 3, 4, 11, 5, 6, 7, 8, 9), + (0, 1, 2, 3, 4, 11, 5, 6, 7, 8, 9), + 1, 1), + # B: Different ordering of text blocks + ((0, 1, 2, 3, 4, 11, 5, 6, 7, 8, 9), + (5, 6, 7, 8, 9, 11, 0, 1, 2, 3, 4), + 1, 1), + # C: Merge across columns + ((0, 1, 2, 11, 3, 4, 11, 5, 6, 7, 11, 8, 9), + (0, 1, 2, 5, 6, 7, 11, 3, 4, 8, 9), + 1, 0.964), + # D: Over-segmentation + ((0, 1, 2, 3, 4, 11, 5, 6, 7, 8, 9), + (0, 1, 2, 11, 5, 6, 7, 11, 3, 4, 11, 8, 9), + 1, 0.966), + # E: Part missing + ((0, 1, 2, 3, 4, 11, 5, 6, 7, 8, 9), + (0, 1, 2, 3, 4), + 1, 0.50), + # E.2: Part missing + ((0, 1, 2, 3, 4, 11, 5, 6, 7, 8, 9), + (5, 6, 7, 8, 9), + 1, 0.50), + # F: All missing + ((0, 1, 2, 3, 4, 11, 5, 6, 7, 8, 9), + (), + 1, 0), + # G: Added parts + ((0, 1, 2, 3, 4), + (0, 1, 2, 3, 4, 11, 5, 6), + 1, 0.621), +] + +EDIT_ARGS = "gt,ocr,expected_dist" + +SIMPLE_EDITS = [ + (Part(text="a").substring(), Part(text="a"), Distance(match=1)), + (Part(text="a").substring(), Part(text="b"), Distance(replace=1)), + (Part(text="abcd").substring(), Part(text="beed"), + Distance(match=2, replace=1, insert=1, delete=1)), +] + + +def extended_case_to_text(gt, ocr): + sentence = ("Eight", "happy", "frogs", "scuba", 
"dived", + "Jenny", "chick", "flaps", "white", "wings", + "", "\n") + + gt_sentence = " ".join(sentence[i] for i in gt).replace(" \n ", "\n") + ocr_sentence = " ".join(sentence[i] for i in ocr).replace(" \n ", "\n") + return gt_sentence, ocr_sentence + + +@pytest.mark.parametrize(CASE_ARGS, [*SIMPLE_CASES, *COMPLEX_CASES]) +def test_flexible_character_accuracy_simple(gt, ocr, first_line_score, all_line_score): + score, _ = flexible_character_accuracy(gt, ocr) + assert score == pytest.approx(all_line_score) + + +@pytest.mark.xfail +@pytest.mark.parametrize("ocr", [ + "1 hav\nnospecial\ntalents.\nI am one\npassionate\ncuriousity.\"\nAlberto\nEmstein", + "1 hav\nnospecial\ntalents. Alberto\nI am one Emstein\npassionate\ncuriousity.\"", + "Alberto\nEmstein\n1 hav\nnospecial\ntalents.\nI am one\npassionate\ncuriousity.\"" +]) +def test_flexible_character_accuracy(ocr): + """Tests from figure 3 in the paper. + + TODO: We have a 2 percent deviation from the original because of redistributed + one character alignments (e.g. the y-insert replaces the y-delete). + """ + gt = """"I have +no special +talent. +I am only +passionately +curious." 
+Albert +Einstein +""" + replacements = 3 + inserts = 5 + deletes = 7 + chars = len(gt) - gt.count("\n") + assert replacements + inserts + deletes == 15 + edits = Distance(match=chars - deletes - replacements, replace=replacements, + insert=inserts, delete=deletes) + expected = character_accuracy(edits) + assert expected == pytest.approx(0.779, abs=0.0005) + result, matches = flexible_character_accuracy(gt, ocr) + assert result == pytest.approx(expected, abs=0.0005) + + +@pytest.mark.parametrize(CASE_ARGS, EXTENDED_CASES) +def test_flexible_character_accuracy_extended(gt, ocr, first_line_score, + all_line_score): + """Tests from figure 4 in the paper.""" + gt_sentence, ocr_sentence = extended_case_to_text(gt, ocr) + result, _ = flexible_character_accuracy(gt_sentence, ocr_sentence) + assert result == pytest.approx(all_line_score, abs=0.001) + + +@pytest.mark.parametrize(CASE_ARGS, [*SIMPLE_CASES, *COMPLEX_CASES, *EXTENDED_CASES]) +def test_match_with_coefficients(gt, ocr, first_line_score, all_line_score): + coef = Coefficients() + if not isinstance(gt, str): + gt, ocr = extended_case_to_text(gt, ocr) + matches = match_with_coefficients(gt, ocr, coef) + score = character_accuracy_for_matches(matches) + assert score == pytest.approx(all_line_score, abs=0.001) + + +@pytest.mark.parametrize(CASE_ARGS, [*SIMPLE_CASES, *COMPLEX_CASES]) +def test_match_longest_gt_lines(gt, ocr, first_line_score, all_line_score): + coef = Coefficients() + gt_lines = initialize_lines(gt) + ocr_lines = initialize_lines(ocr) + match = match_longest_gt_lines(gt_lines, ocr_lines, coef) + score = 0 + if match: + score = character_accuracy(match.dist) + assert score == pytest.approx(first_line_score) + + +@pytest.mark.parametrize(CASE_ARGS, [ + *SIMPLE_CASES, + ("accc", "a\nbb\nccc", 1.0, 1.0), +]) +def test_match_gt_line(gt, ocr, first_line_score, all_line_score): + coef = Coefficients() + gt_lines = initialize_lines(gt) + ocr_lines = initialize_lines(ocr) + match, _ = 
match_gt_line(gt_lines[0], ocr_lines, coef) + score = 0 + if match: + score = character_accuracy(match.dist) + assert score == pytest.approx(first_line_score) + + +@pytest.mark.parametrize("original,match,expected_lines", [ + (Part(), Part(), []), + (Part(text="abc"), Part(), [Part(text="abc")]), + (Part(text="abc"), Part("d"), [Part(text="bc", start=1)]), + (Part(text="abc"), Part("a", start=100), [Part(text="abc")]), + (Part(text="abc"), Part("a"), [Part(text="bc", start=1)]), + (Part(text="abc"), Part("b", start=1), [Part(text="a"), Part(text="c", start=2)]), + (Part(text="abc"), Part("c", start=2), [Part(text="ab")]), +]) +def test_remove_or_split(original, match, expected_lines): + lines = [original] + splitted = remove_or_split(original, match, lines) + assert splitted == (len(lines) > 0) + assert lines == expected_lines + + +@pytest.mark.parametrize(EDIT_ARGS, [ + *SIMPLE_EDITS, + (Part(text="aaabbbaaa"), Part(text="bbb"), Distance(match=3)), + (Part(text="bbb"), Part(text="aaabbbaaa"), Distance(match=3)), + (Part(text=""), Part(text=""), None) +]) +def test_match_lines(gt, ocr, expected_dist): + match = match_lines(gt, ocr) + if not expected_dist: + assert match is None + else: + assert match.gt.text in gt.text + assert match.ocr.text in ocr.text + assert match.dist == expected_dist + + +@pytest.mark.parametrize(EDIT_ARGS, [ + *SIMPLE_EDITS, + (Part(text="").substring(), Part(text=""), Distance()), + (Part(text="ab").substring(), Part("a"), Distance(match=1, delete=1)), + (Part(text="a").substring(), Part("ab"), Distance(match=1, insert=1)), +]) +def test_distance(gt, ocr, expected_dist): + match = distance(gt, ocr) + assert match.gt == gt + assert match.ocr == ocr + assert match.dist == expected_dist + + +@pytest.mark.parametrize("matches,expected_dist", [ + ([], 1), + ([Match(gt=Part(text=""), ocr=Part(text=""), dist=Distance(), ops=[])], 1), + ([Match(gt=Part(text="abee"), ocr=Part("ac"), + dist=Distance(match=1, replace=1, delete=2), ops=[]), + 
Match(gt=Part(text="cd"), ocr=Part("ceff"), + dist=Distance(match=1, replace=1, insert=2), ops=[])], + 1 - 6 / 6), +]) +def test_character_accuracy_matches(matches, expected_dist): + assert character_accuracy_for_matches(matches) == pytest.approx(expected_dist) + + +@pytest.mark.parametrize("dist,expected_dist", [ + (Distance(), 1), + (Distance(match=1), 1), + (Distance(replace=1), 0), + (Distance(match=1, insert=1), 0), + (Distance(match=1, insert=2), 1 - 2 / 1), + (Distance(match=2, insert=1), 0.5), + (Distance(match=1, delete=1), 0.5), +]) +def test_character_accuracy_dist(dist, expected_dist): + assert character_accuracy(dist) == pytest.approx(expected_dist) + + +@pytest.mark.parametrize("line,subline,expected_rest", [ + (Part(), Part(), []), + (Part("aaa bbb"), Part("aaa bbb"), []), + (Part("aaa bbb"), Part("aaa"), [Part(" bbb", start=3)]), + (Part("aaa bbb"), Part("bbb", start=4), [Part("aaa ")]), + (Part("aaa bbb", start=3), Part("aaa", start=3), [Part(" bbb", start=6)]), + (Part("aaa bbb", start=3), Part("bbb", start=7), [Part("aaa ", start=3)]), + (Part("aaa bbb ccc"), Part("bbb", start=4), [Part("aaa "), Part(" ccc", start=7)]), + (Part("aaa bbb ccc", start=3), Part("bbb", start=7), + [Part("aaa ", start=3), Part(" ccc", start=10)]), + (Part("aaa bbb"), Part(" ", start=3), [Part("aaa"), Part("bbb", start=4)]), + (Part("aaa bbb", start=3), Part(" ", start=6), + [Part("aaa", start=3), Part("bbb", start=7)]), +]) +def test_split_line(line, subline, expected_rest): + rest = line.split(subline) + assert len(rest) == len(expected_rest) + assert set(rest) == set(expected_rest) + + +def test_initialize_lines(): + lines = initialize_lines("") + assert lines == [] + + lines = initialize_lines("22\n1\n333") + line1 = Part(text="22", line=0, start=0) + line2 = Part("1", line=1, start=0) + line3 = Part("333", line=2, start=0) + assert lines == [line3, line1, line2] + + +@pytest.mark.xfail +def test_combine_lines(): + assert False + + 
+@pytest.mark.parametrize("line,start,end,expected", [ + (Part(text=""), 0, None, Part(text="")), + (Part(text="a"), 0, None, Part(text="a")), + (Part(text="ab"), 0, 1, Part(text="a")), + (Part(text="abc"), 0, -1, Part(text="ab")), + (Part(text="ab"), 1, None, Part(text="b", start=1)), +]) +def test_line_substring(line, start, end, expected): + assert line.substring(rel_start=start, rel_end=end) == expected From 5277593bdbd50f2dee361087b26c0a0580e6ac28 Mon Sep 17 00:00:00 2001 From: Benjamin Rosemann Date: Tue, 10 Nov 2020 12:33:49 +0100 Subject: [PATCH 02/21] Fix some special cases --- .../flexible_character_accuracy.py | 37 ++++++-- .../tests/test_flexible_character_accuracy.py | 86 ++++++++++++------- 2 files changed, 83 insertions(+), 40 deletions(-) diff --git a/qurator/dinglehopper/flexible_character_accuracy.py b/qurator/dinglehopper/flexible_character_accuracy.py index 3e4bb93..2b9a56f 100644 --- a/qurator/dinglehopper/flexible_character_accuracy.py +++ b/qurator/dinglehopper/flexible_character_accuracy.py @@ -117,6 +117,7 @@ def match_longest_gt_lines(gt_lines: List["Part"], if best_match and best_gt: splitted = remove_or_split(best_gt, best_match.gt, gt_lines) if splitted: + # according to the paper the match is not put back, we deviate... gt_lines.append(best_match.gt) best_match = None if best_match and best_ocr: @@ -134,13 +135,12 @@ def match_gt_line(gt_line: "Part", Reference: contains steps 3 of the flexible character accuracy algorithm. TODO: Make penalty function configurable? - TODO: Add empty ocr line to avoid having nonesense one character alignments? :return: Match object and the matched ocr line. 
""" min_penalty = float('inf') best_match, best_ocr = None, None - for ocr_line in ocr_lines: + for ocr_line in [*ocr_lines]: match = match_lines(gt_line, ocr_line) penalty = calculate_penalty(gt_line, ocr_line, match, coef) if penalty < min_penalty: @@ -177,20 +177,42 @@ def match_lines(gt_line: "Part", ocr_line: "Part") -> Optional["Match"]: Reference: see figure 2 in the paper. TODO: make distance function configurable? + TODO: rethink @lru_cache :return: Match object if one is found. """ min_length = min(gt_line.length, ocr_line.length) best_match = None + best_i, best_j = 0, 0 if min_length == 0: return best_match length_diff = gt_line.length - ocr_line.length min_edit_dist = float('inf') - # TODO: handle deletes and replacements by extending the length. - for i in range(0, max(1, length_diff + 1)): - for j in range(0, max(1, -1 * length_diff + 1)): - match = distance(gt_line.substring(rel_start=i, rel_end=i + min_length), - ocr_line.substring(rel_start=j, rel_end=j + min_length)) + + gt_parts = [(i, gt_line.substring(rel_start=i, rel_end=i + min_length)) + for i in range(0, max(1, length_diff + 1))] + ocr_parts = [(j, ocr_line.substring(rel_start=j, rel_end=j + min_length)) + for j in range(0, max(1, -1 * length_diff + 1))] + + # add full line and empty line match + gt_parts = [*gt_parts, (0, gt_line), (0, gt_line)] + ocr_parts = [*ocr_parts, (0, ocr_line), + (0, Part(text="", line=gt_line.line, start=gt_line.start))] + + for i, gt_part in gt_parts: + for j, ocr_part in ocr_parts: + match = distance(gt_part, ocr_part) + edit_dist = score_edit_distance(match) + if edit_dist < min_edit_dist: + min_edit_dist = edit_dist + best_match = match + best_i, best_j = i, j + if best_match and (best_match.dist.delete or best_match.dist.replace): + part_length = best_match.gt.length + additional_length = best_match.dist.delete + best_match.dist.replace + for k in range(part_length + 1, part_length + additional_length + 1): + match = 
distance(gt_line.substring(rel_start=best_i, rel_end=best_i + k), + ocr_line.substring(rel_start=best_j, rel_end=best_j + k)) edit_dist = score_edit_distance(match) if edit_dist < min_edit_dist: min_edit_dist = edit_dist @@ -205,6 +227,7 @@ def distance(gt: "Part", ocr: "Part") -> "Match": Using the already available `editops()` function with the Levenshtein distance. TODO: replace with @cache annotation in Python 3.9 + TODO: rethink @lru_cache :return: Match object containing the lines and the editing operations. """ diff --git a/qurator/dinglehopper/tests/test_flexible_character_accuracy.py b/qurator/dinglehopper/tests/test_flexible_character_accuracy.py index 6393e2f..0126696 100644 --- a/qurator/dinglehopper/tests/test_flexible_character_accuracy.py +++ b/qurator/dinglehopper/tests/test_flexible_character_accuracy.py @@ -33,6 +33,7 @@ ] EXTENDED_CASES = [ + # See figure 4 in 10.1016/j.patrec.2020.02.003 # A: No errors ((0, 1, 2, 3, 4, 11, 5, 6, 7, 8, 9), (0, 1, 2, 3, 4, 11, 5, 6, 7, 8, 9), @@ -70,14 +71,18 @@ EDIT_ARGS = "gt,ocr,expected_dist" SIMPLE_EDITS = [ - (Part(text="a").substring(), Part(text="a"), Distance(match=1)), - (Part(text="a").substring(), Part(text="b"), Distance(replace=1)), - (Part(text="abcd").substring(), Part(text="beed"), + (Part(text="a"), Part(text="a"), Distance(match=1)), + (Part(text="aaa"), Part(text="aaa"), Distance(match=3)), + (Part(text="abcd"), Part(text="beed"), Distance(match=2, replace=1, insert=1, delete=1)), ] def extended_case_to_text(gt, ocr): + """Generate sentence from reading order encoding. 
+ + See figure 4 in 10.1016/j.patrec.2020.02.003 + """ sentence = ("Eight", "happy", "frogs", "scuba", "dived", "Jenny", "chick", "flaps", "white", "wings", "", "\n") @@ -93,38 +98,45 @@ def test_flexible_character_accuracy_simple(gt, ocr, first_line_score, all_line_ assert score == pytest.approx(all_line_score) -@pytest.mark.xfail -@pytest.mark.parametrize("ocr", [ - "1 hav\nnospecial\ntalents.\nI am one\npassionate\ncuriousity.\"\nAlberto\nEmstein", - "1 hav\nnospecial\ntalents. Alberto\nI am one Emstein\npassionate\ncuriousity.\"", - "Alberto\nEmstein\n1 hav\nnospecial\ntalents.\nI am one\npassionate\ncuriousity.\"" +@pytest.mark.parametrize("config,ocr", [ + ("Config I", + "1 hav\nnospecial\ntalents.\nI am one\npassionate\ncuriousity.\"\nAlberto\nEmstein" + ), + ("Config II", + "1 hav\nnospecial\ntalents. Alberto\nI am one Emstein\npassionate\ncuriousity.\"" + ), + ("Config III", + "Alberto\nEmstein\n1 hav\nnospecial\ntalents.\nI am one\npassionate\ncuriousity.\"" + ), ]) -def test_flexible_character_accuracy(ocr): - """Tests from figure 3 in the paper. - - TODO: We have a 2 percent deviation from the original because of redistributed - one character alignments (e.g. the y-insert replaces the y-delete). - """ - gt = """"I have -no special -talent. -I am only -passionately -curious." -Albert -Einstein -""" - replacements = 3 - inserts = 5 - deletes = 7 +def test_flexible_character_accuracy(config, ocr): + """Tests from figure 3 in the paper.""" + gt = "\"I have\nno special\ntalent." 
\ + "\nI am only\npassionately\ncurious.\"" \ + "\nAlbert\nEinstein" + replacements, inserts, deletes = 3, 5, 7 chars = len(gt) - gt.count("\n") - assert replacements + inserts + deletes == 15 - edits = Distance(match=chars - deletes - replacements, replace=replacements, - insert=inserts, delete=deletes) - expected = character_accuracy(edits) - assert expected == pytest.approx(0.779, abs=0.0005) + assert chars == 68 + + # We consider whitespace as error and in Config II two additional + # whitespaces have been introduced. One will be counted as insert. + # The other whitespace will be counted as replacement, + # additionally reducing the number of deletes. + if config == "Config II": + inserts += 1 + replacements += 1 + deletes -= 1 + + expected_dist = Distance(match=chars - deletes - replacements, replace=replacements, + insert=inserts, delete=deletes) + expected_score = character_accuracy(expected_dist) + result, matches = flexible_character_accuracy(gt, ocr) - assert result == pytest.approx(expected, abs=0.0005) + agg = reduce(lambda acc, match: acc + Counter(match.dist._asdict()), + matches, Counter()) + dist = Distance(**agg) + assert dist == expected_dist + assert result == pytest.approx(expected_score, abs=0.0005) @pytest.mark.parametrize(CASE_ARGS, EXTENDED_CASES) @@ -191,9 +203,15 @@ def test_remove_or_split(original, match, expected_lines): @pytest.mark.parametrize(EDIT_ARGS, [ *SIMPLE_EDITS, + (Part(text="a"), Part(text="b"), Distance(delete=1)), + (Part(text="aaa"), Part(text="bbb"), Distance(delete=3)), (Part(text="aaabbbaaa"), Part(text="bbb"), Distance(match=3)), (Part(text="bbb"), Part(text="aaabbbaaa"), Distance(match=3)), - (Part(text=""), Part(text=""), None) + (Part(text=""), Part(text=""), None), + (Part(text="abcd"), Part(text="acd"), Distance(match=3, delete=1)), + (Part(text="abc"), Part(text="abdc"), Distance(match=3, insert=1)), + (Part(text="aaabbbaaaddd"), Part(text="aaabcbaaa"), Distance(match=8, replace=1)), + (Part(text="aaabbbccc"), 
Part(text="aaabbbdddccc"), Distance(match=9, insert=3)), ]) def test_match_lines(gt, ocr, expected_dist): match = match_lines(gt, ocr) @@ -210,6 +228,8 @@ def test_match_lines(gt, ocr, expected_dist): (Part(text="").substring(), Part(text=""), Distance()), (Part(text="ab").substring(), Part("a"), Distance(match=1, delete=1)), (Part(text="a").substring(), Part("ab"), Distance(match=1, insert=1)), + (Part(text="a"), Part(text="b"), Distance(replace=1)), + (Part(text="aaa"), Part(text="bbb"), Distance(replace=3)), ]) def test_distance(gt, ocr, expected_dist): match = distance(gt, ocr) From 2a215a1062126de689e4a639e87e77cbd0189dbf Mon Sep 17 00:00:00 2001 From: Benjamin Rosemann Date: Tue, 10 Nov 2020 14:26:31 +0100 Subject: [PATCH 03/21] Reformat using black --- .../flexible_character_accuracy.py | 117 ++++--- .../tests/test_flexible_character_accuracy.py | 327 +++++++++++------- 2 files changed, 273 insertions(+), 171 deletions(-) diff --git a/qurator/dinglehopper/flexible_character_accuracy.py b/qurator/dinglehopper/flexible_character_accuracy.py index 2b9a56f..e81ef54 100644 --- a/qurator/dinglehopper/flexible_character_accuracy.py +++ b/qurator/dinglehopper/flexible_character_accuracy.py @@ -29,20 +29,16 @@ def flexible_character_accuracy(gt: str, ocr: str) -> Tuple[float, List["Match"] :return: Score between 0 and 1 and match objects. 
""" - best_score = -float('inf') + best_score = -float("inf") best_matches = [] # TODO: this should be configurable - combinations = product(range(15, 31, 5), - range(0, 24, 3), - range(0, 4, 1), - range(0, 6, 1)) + combinations = product( + range(15, 31, 5), range(0, 24, 3), range(0, 4, 1), range(0, 6, 1) + ) # TODO: place to parallelize the algorithm for (edit_dist, length_diff, offset, length) in combinations: coef = Coefficients( - edit_dist=edit_dist, - length_diff=length_diff, - offset=offset, - length=length + edit_dist=edit_dist, length_diff=length_diff, offset=offset, length=length ) # Steps 1 - 6 of the flexible character accuracy algorithm. matches = match_with_coefficients(gt, ocr, coef) @@ -79,17 +75,21 @@ def match_with_coefficients(gt: str, ocr: str, coef: "Coefficients") -> List["Ma # Step 6 of the flexible character accuracy algorithm. # remaining lines are considered as deletes and inserts - deletes = [distance(line, Part(text="", line=line.line, start=line.start)) - for line in gt_lines] - inserts = [distance(Part(text="", line=line.line, start=line.start), line) - for line in ocr_lines] + deletes = [ + distance(line, Part(text="", line=line.line, start=line.start)) + for line in gt_lines + ] + inserts = [ + distance(Part(text="", line=line.line, start=line.start), line) + for line in ocr_lines + ] return [*matches, *deletes, *inserts] -def match_longest_gt_lines(gt_lines: List["Part"], - ocr_lines: List["Part"], - coef: "Coefficients") -> Optional["Match"]: +def match_longest_gt_lines( + gt_lines: List["Part"], ocr_lines: List["Part"], coef: "Coefficients" +) -> Optional["Match"]: """Find the best match for the longest line(s) in ground truth. The longest lines in ground truth are matched against lines in ocr to find the @@ -99,7 +99,7 @@ def match_longest_gt_lines(gt_lines: List["Part"], :return: Possible match object. 
""" - best_score, best_match, best_gt, best_ocr = -float('inf'), None, None, None + best_score, best_match, best_gt, best_ocr = -float("inf"), None, None, None if not ocr_lines: return best_match @@ -126,10 +126,9 @@ def match_longest_gt_lines(gt_lines: List["Part"], return best_match -def match_gt_line(gt_line: "Part", - ocr_lines: List["Part"], - coef: "Coefficients") -> Tuple[Optional["Match"], - Optional["Part"]]: +def match_gt_line( + gt_line: "Part", ocr_lines: List["Part"], coef: "Coefficients" +) -> Tuple[Optional["Match"], Optional["Part"]]: """Match the given ground truth line against all the lines in ocr. Reference: contains steps 3 of the flexible character accuracy algorithm. @@ -138,19 +137,18 @@ def match_gt_line(gt_line: "Part", :return: Match object and the matched ocr line. """ - min_penalty = float('inf') + min_penalty = float("inf") best_match, best_ocr = None, None for ocr_line in [*ocr_lines]: match = match_lines(gt_line, ocr_line) - penalty = calculate_penalty(gt_line, ocr_line, match, coef) - if penalty < min_penalty: - min_penalty, best_match, best_ocr = penalty, match, ocr_line + if match: + penalty = calculate_penalty(gt_line, ocr_line, match, coef) + if penalty < min_penalty: + min_penalty, best_match, best_ocr = penalty, match, ocr_line return best_match, best_ocr -def remove_or_split(original: "Part", - match: "Part", - lines: List["Part"]) -> bool: +def remove_or_split(original: "Part", match: "Part", lines: List["Part"]) -> bool: """Removes the matched line or splits it into parts. Reference: contains step 4 of the flexible character accuracy algorithm. 
@@ -187,17 +185,24 @@ def match_lines(gt_line: "Part", ocr_line: "Part") -> Optional["Match"]: if min_length == 0: return best_match length_diff = gt_line.length - ocr_line.length - min_edit_dist = float('inf') + min_edit_dist = float("inf") - gt_parts = [(i, gt_line.substring(rel_start=i, rel_end=i + min_length)) - for i in range(0, max(1, length_diff + 1))] - ocr_parts = [(j, ocr_line.substring(rel_start=j, rel_end=j + min_length)) - for j in range(0, max(1, -1 * length_diff + 1))] + gt_parts = [ + (i, gt_line.substring(rel_start=i, rel_end=i + min_length)) + for i in range(0, max(1, length_diff + 1)) + ] + ocr_parts = [ + (j, ocr_line.substring(rel_start=j, rel_end=j + min_length)) + for j in range(0, max(1, -1 * length_diff + 1)) + ] # add full line and empty line match gt_parts = [*gt_parts, (0, gt_line), (0, gt_line)] - ocr_parts = [*ocr_parts, (0, ocr_line), - (0, Part(text="", line=gt_line.line, start=gt_line.start))] + ocr_parts = [ + *ocr_parts, + (0, ocr_line), + (0, Part(text="", line=gt_line.line, start=gt_line.start)), + ] for i, gt_part in gt_parts: for j, ocr_part in ocr_parts: @@ -211,8 +216,10 @@ def match_lines(gt_line: "Part", ocr_line: "Part") -> Optional["Match"]: part_length = best_match.gt.length additional_length = best_match.dist.delete + best_match.dist.replace for k in range(part_length + 1, part_length + additional_length + 1): - match = distance(gt_line.substring(rel_start=best_i, rel_end=best_i + k), - ocr_line.substring(rel_start=best_j, rel_end=best_j + k)) + match = distance( + gt_line.substring(rel_start=best_i, rel_end=best_i + k), + ocr_line.substring(rel_start=best_j, rel_end=best_j + k), + ) edit_dist = score_edit_distance(match) if edit_dist < min_edit_dist: min_edit_dist = edit_dist @@ -247,8 +254,9 @@ def score_edit_distance(match: "Match") -> int: return match.dist.delete + match.dist.insert + 2 * match.dist.replace -def calculate_penalty(gt: "Part", ocr: "Part", match: "Match", - coef: "Coefficients") -> float: +def 
calculate_penalty( + gt: "Part", ocr: "Part", match: "Match", coef: "Coefficients" +) -> float: """Calculate the penalty for a given match. For details and discussion see Section 3 in doi:10.1016/j.patrec.2020.02.003. @@ -262,10 +270,12 @@ def calculate_penalty(gt: "Part", ocr: "Part", match: "Match", if length_diff > 1: substring_pos = max(match.gt.start - gt.start, match.ocr.start - ocr.start) offset = length_diff / 2 - abs(substring_pos - length_diff / 2) - return (min_edit_dist * coef.edit_dist - + length_diff * coef.length_diff - + offset * coef.offset - - substring_length * coef.length) + return ( + min_edit_dist * coef.edit_dist + + length_diff * coef.length_diff + + offset * coef.offset + - substring_length * coef.length + ) def character_accuracy_for_matches(matches: List["Match"]) -> float: @@ -274,8 +284,9 @@ def character_accuracy_for_matches(matches: List["Match"]) -> float: See other `character_accuracy` for details. """ - agg: Counter = reduce(lambda acc, match: acc + Counter(match.dist._asdict()), - matches, Counter()) + agg: Counter = reduce( + lambda acc, match: acc + Counter(match.dist._asdict()), matches, Counter() + ) score = character_accuracy(Distance(**agg)) return score @@ -299,9 +310,9 @@ def character_accuracy(edits: "Distance") -> float: chars = edits.match + edits.replace + edits.delete if not chars and not errors: # comparison of empty strings is considered a full match - score = 1 + score = 1.0 else: - score = 1 - errors / chars + score = 1.0 - errors / chars return score @@ -315,9 +326,11 @@ def initialize_lines(text: str) -> List["Part"]: :param text: Text to split into lines. :return: List of sorted line objects. 
""" - lines = [Part(text=line, line=i, start=0) - for i, line in enumerate(text.splitlines()) - if len(line) > 0] + lines = [ + Part(text=line, line=i, start=0) + for i, line in enumerate(text.splitlines()) + if len(line) > 0 + ] lines.sort(key=lambda x: x.length, reverse=True) return lines @@ -348,6 +361,7 @@ class Part(NamedTuple): This data object is maintained to be able to reproduce the original text. """ + text: str = "" line: int = 0 start: int = 0 @@ -392,6 +406,7 @@ def substring(self, rel_start: int = 0, rel_end: int = None) -> "Part": class Distance(NamedTuple): """Represent distance between two sequences.""" + match: int = 0 replace: int = 0 delete: int = 0 @@ -400,6 +415,7 @@ class Distance(NamedTuple): class Match(NamedTuple): """Represent a calculated match between ground truth and the ocr result.""" + gt: "Part" ocr: "Part" dist: "Distance" @@ -411,6 +427,7 @@ class Coefficients(NamedTuple): See Section 3 in doi:10.1016/j.patrec.2020.02.003 """ + edit_dist: int = 25 length_diff: int = 20 offset: int = 1 diff --git a/qurator/dinglehopper/tests/test_flexible_character_accuracy.py b/qurator/dinglehopper/tests/test_flexible_character_accuracy.py index 0126696..dfcb1f7 100644 --- a/qurator/dinglehopper/tests/test_flexible_character_accuracy.py +++ b/qurator/dinglehopper/tests/test_flexible_character_accuracy.py @@ -35,37 +35,31 @@ EXTENDED_CASES = [ # See figure 4 in 10.1016/j.patrec.2020.02.003 # A: No errors - ((0, 1, 2, 3, 4, 11, 5, 6, 7, 8, 9), - (0, 1, 2, 3, 4, 11, 5, 6, 7, 8, 9), - 1, 1), + ((0, 1, 2, 3, 4, 11, 5, 6, 7, 8, 9), (0, 1, 2, 3, 4, 11, 5, 6, 7, 8, 9), 1, 1), # B: Different ordering of text blocks - ((0, 1, 2, 3, 4, 11, 5, 6, 7, 8, 9), - (5, 6, 7, 8, 9, 11, 0, 1, 2, 3, 4), - 1, 1), + ((0, 1, 2, 3, 4, 11, 5, 6, 7, 8, 9), (5, 6, 7, 8, 9, 11, 0, 1, 2, 3, 4), 1, 1), # C: Merge across columns - ((0, 1, 2, 11, 3, 4, 11, 5, 6, 7, 11, 8, 9), - (0, 1, 2, 5, 6, 7, 11, 3, 4, 8, 9), - 1, 0.964), + ( + (0, 1, 2, 11, 3, 4, 11, 5, 6, 7, 11, 8, 9), + (0, 
1, 2, 5, 6, 7, 11, 3, 4, 8, 9), + 1, + 0.964, + ), # D: Over-segmentation - ((0, 1, 2, 3, 4, 11, 5, 6, 7, 8, 9), - (0, 1, 2, 11, 5, 6, 7, 11, 3, 4, 11, 8, 9), - 1, 0.966), + ( + (0, 1, 2, 3, 4, 11, 5, 6, 7, 8, 9), + (0, 1, 2, 11, 5, 6, 7, 11, 3, 4, 11, 8, 9), + 1, + 0.966, + ), # E: Part missing - ((0, 1, 2, 3, 4, 11, 5, 6, 7, 8, 9), - (0, 1, 2, 3, 4), - 1, 0.50), + ((0, 1, 2, 3, 4, 11, 5, 6, 7, 8, 9), (0, 1, 2, 3, 4), 1, 0.50), # E.2: Part missing - ((0, 1, 2, 3, 4, 11, 5, 6, 7, 8, 9), - (5, 6, 7, 8, 9), - 1, 0.50), + ((0, 1, 2, 3, 4, 11, 5, 6, 7, 8, 9), (5, 6, 7, 8, 9), 1, 0.50), # F: All missing - ((0, 1, 2, 3, 4, 11, 5, 6, 7, 8, 9), - (), - 1, 0), + ((0, 1, 2, 3, 4, 11, 5, 6, 7, 8, 9), (), 1, 0), # G: Added parts - ((0, 1, 2, 3, 4), - (0, 1, 2, 3, 4, 11, 5, 6), - 1, 0.621), + ((0, 1, 2, 3, 4), (0, 1, 2, 3, 4, 11, 5, 6), 1, 0.621), ] EDIT_ARGS = "gt,ocr,expected_dist" @@ -73,8 +67,11 @@ SIMPLE_EDITS = [ (Part(text="a"), Part(text="a"), Distance(match=1)), (Part(text="aaa"), Part(text="aaa"), Distance(match=3)), - (Part(text="abcd"), Part(text="beed"), - Distance(match=2, replace=1, insert=1, delete=1)), + ( + Part(text="abcd"), + Part(text="beed"), + Distance(match=2, replace=1, insert=1, delete=1), + ), ] @@ -83,9 +80,20 @@ def extended_case_to_text(gt, ocr): See figure 4 in 10.1016/j.patrec.2020.02.003 """ - sentence = ("Eight", "happy", "frogs", "scuba", "dived", - "Jenny", "chick", "flaps", "white", "wings", - "", "\n") + sentence = ( + "Eight", + "happy", + "frogs", + "scuba", + "dived", + "Jenny", + "chick", + "flaps", + "white", + "wings", + "", + "\n", + ) gt_sentence = " ".join(sentence[i] for i in gt).replace(" \n ", "\n") ocr_sentence = " ".join(sentence[i] for i in ocr).replace(" \n ", "\n") @@ -98,22 +106,35 @@ def test_flexible_character_accuracy_simple(gt, ocr, first_line_score, all_line_ assert score == pytest.approx(all_line_score) -@pytest.mark.parametrize("config,ocr", [ - ("Config I", - "1 hav\nnospecial\ntalents.\nI am 
one\npassionate\ncuriousity.\"\nAlberto\nEmstein" - ), - ("Config II", - "1 hav\nnospecial\ntalents. Alberto\nI am one Emstein\npassionate\ncuriousity.\"" - ), - ("Config III", - "Alberto\nEmstein\n1 hav\nnospecial\ntalents.\nI am one\npassionate\ncuriousity.\"" - ), -]) +@pytest.mark.parametrize( + "config,ocr", + [ + ( + "Config I", + "1 hav\nnospecial\ntalents.\n" + 'I am one\npassionate\ncuriousity."\n' + "Alberto\nEmstein", + ), + ( + "Config II", + '1 hav\nnospecial\ntalents. Alberto\n' + 'I am one Emstein\npassionate\ncuriousity."', + ), + ( + "Config III", + 'Alberto\nEmstein\n' + '1 hav\nnospecial\ntalents.\n' + 'I am one\npassionate\ncuriousity."', + ), + ], +) def test_flexible_character_accuracy(config, ocr): """Tests from figure 3 in the paper.""" - gt = "\"I have\nno special\ntalent." \ - "\nI am only\npassionately\ncurious.\"" \ - "\nAlbert\nEinstein" + gt = ( + '"I have\nno special\ntalent.\n' + 'I am only\npassionately\ncurious."\n' + "Albert\nEinstein" + ) replacements, inserts, deletes = 3, 5, 7 chars = len(gt) - gt.count("\n") assert chars == 68 @@ -127,21 +148,27 @@ def test_flexible_character_accuracy(config, ocr): replacements += 1 deletes -= 1 - expected_dist = Distance(match=chars - deletes - replacements, replace=replacements, - insert=inserts, delete=deletes) + expected_dist = Distance( + match=chars - deletes - replacements, + replace=replacements, + insert=inserts, + delete=deletes, + ) expected_score = character_accuracy(expected_dist) result, matches = flexible_character_accuracy(gt, ocr) - agg = reduce(lambda acc, match: acc + Counter(match.dist._asdict()), - matches, Counter()) + agg = reduce( + lambda acc, match: acc + Counter(match.dist._asdict()), matches, Counter() + ) dist = Distance(**agg) assert dist == expected_dist assert result == pytest.approx(expected_score, abs=0.0005) @pytest.mark.parametrize(CASE_ARGS, EXTENDED_CASES) -def test_flexible_character_accuracy_extended(gt, ocr, first_line_score, - all_line_score): +def 
test_flexible_character_accuracy_extended( + gt, ocr, first_line_score, all_line_score +): """Tests from figure 4 in the paper.""" gt_sentence, ocr_sentence = extended_case_to_text(gt, ocr) result, _ = flexible_character_accuracy(gt_sentence, ocr_sentence) @@ -170,10 +197,13 @@ def test_match_longest_gt_lines(gt, ocr, first_line_score, all_line_score): assert score == pytest.approx(first_line_score) -@pytest.mark.parametrize(CASE_ARGS, [ - *SIMPLE_CASES, - ("accc", "a\nbb\nccc", 1.0, 1.0), -]) +@pytest.mark.parametrize( + CASE_ARGS, + [ + *SIMPLE_CASES, + ("accc", "a\nbb\nccc", 1.0, 1.0), + ], +) def test_match_gt_line(gt, ocr, first_line_score, all_line_score): coef = Coefficients() gt_lines = initialize_lines(gt) @@ -185,15 +215,22 @@ def test_match_gt_line(gt, ocr, first_line_score, all_line_score): assert score == pytest.approx(first_line_score) -@pytest.mark.parametrize("original,match,expected_lines", [ - (Part(), Part(), []), - (Part(text="abc"), Part(), [Part(text="abc")]), - (Part(text="abc"), Part("d"), [Part(text="bc", start=1)]), - (Part(text="abc"), Part("a", start=100), [Part(text="abc")]), - (Part(text="abc"), Part("a"), [Part(text="bc", start=1)]), - (Part(text="abc"), Part("b", start=1), [Part(text="a"), Part(text="c", start=2)]), - (Part(text="abc"), Part("c", start=2), [Part(text="ab")]), -]) +@pytest.mark.parametrize( + "original,match,expected_lines", + [ + (Part(), Part(), []), + (Part(text="abc"), Part(), [Part(text="abc")]), + (Part(text="abc"), Part("d"), [Part(text="bc", start=1)]), + (Part(text="abc"), Part("a", start=100), [Part(text="abc")]), + (Part(text="abc"), Part("a"), [Part(text="bc", start=1)]), + ( + Part(text="abc"), + Part("b", start=1), + [Part(text="a"), Part(text="c", start=2)], + ), + (Part(text="abc"), Part("c", start=2), [Part(text="ab")]), + ], +) def test_remove_or_split(original, match, expected_lines): lines = [original] splitted = remove_or_split(original, match, lines) @@ -201,18 +238,29 @@ def 
test_remove_or_split(original, match, expected_lines): assert lines == expected_lines -@pytest.mark.parametrize(EDIT_ARGS, [ - *SIMPLE_EDITS, - (Part(text="a"), Part(text="b"), Distance(delete=1)), - (Part(text="aaa"), Part(text="bbb"), Distance(delete=3)), - (Part(text="aaabbbaaa"), Part(text="bbb"), Distance(match=3)), - (Part(text="bbb"), Part(text="aaabbbaaa"), Distance(match=3)), - (Part(text=""), Part(text=""), None), - (Part(text="abcd"), Part(text="acd"), Distance(match=3, delete=1)), - (Part(text="abc"), Part(text="abdc"), Distance(match=3, insert=1)), - (Part(text="aaabbbaaaddd"), Part(text="aaabcbaaa"), Distance(match=8, replace=1)), - (Part(text="aaabbbccc"), Part(text="aaabbbdddccc"), Distance(match=9, insert=3)), -]) +@pytest.mark.parametrize( + EDIT_ARGS, + [ + *SIMPLE_EDITS, + (Part(text="a"), Part(text="b"), Distance(delete=1)), + (Part(text="aaa"), Part(text="bbb"), Distance(delete=3)), + (Part(text="aaabbbaaa"), Part(text="bbb"), Distance(match=3)), + (Part(text="bbb"), Part(text="aaabbbaaa"), Distance(match=3)), + (Part(text=""), Part(text=""), None), + (Part(text="abcd"), Part(text="acd"), Distance(match=3, delete=1)), + (Part(text="abc"), Part(text="abdc"), Distance(match=3, insert=1)), + ( + Part(text="aaabbbaaaddd"), + Part(text="aaabcbaaa"), + Distance(match=8, replace=1), + ), + ( + Part(text="aaabbbccc"), + Part(text="aaabbbdddccc"), + Distance(match=9, insert=3), + ), + ], +) def test_match_lines(gt, ocr, expected_dist): match = match_lines(gt, ocr) if not expected_dist: @@ -223,14 +271,17 @@ def test_match_lines(gt, ocr, expected_dist): assert match.dist == expected_dist -@pytest.mark.parametrize(EDIT_ARGS, [ - *SIMPLE_EDITS, - (Part(text="").substring(), Part(text=""), Distance()), - (Part(text="ab").substring(), Part("a"), Distance(match=1, delete=1)), - (Part(text="a").substring(), Part("ab"), Distance(match=1, insert=1)), - (Part(text="a"), Part(text="b"), Distance(replace=1)), - (Part(text="aaa"), Part(text="bbb"), 
Distance(replace=3)), -]) +@pytest.mark.parametrize( + EDIT_ARGS, + [ + *SIMPLE_EDITS, + (Part(text="").substring(), Part(text=""), Distance()), + (Part(text="ab").substring(), Part("a"), Distance(match=1, delete=1)), + (Part(text="a").substring(), Part("ab"), Distance(match=1, insert=1)), + (Part(text="a"), Part(text="b"), Distance(replace=1)), + (Part(text="aaa"), Part(text="bbb"), Distance(replace=3)), + ], +) def test_distance(gt, ocr, expected_dist): match = distance(gt, ocr) assert match.gt == gt @@ -238,46 +289,77 @@ def test_distance(gt, ocr, expected_dist): assert match.dist == expected_dist -@pytest.mark.parametrize("matches,expected_dist", [ - ([], 1), - ([Match(gt=Part(text=""), ocr=Part(text=""), dist=Distance(), ops=[])], 1), - ([Match(gt=Part(text="abee"), ocr=Part("ac"), - dist=Distance(match=1, replace=1, delete=2), ops=[]), - Match(gt=Part(text="cd"), ocr=Part("ceff"), - dist=Distance(match=1, replace=1, insert=2), ops=[])], - 1 - 6 / 6), -]) +@pytest.mark.parametrize( + "matches,expected_dist", + [ + ([], 1), + ([Match(gt=Part(text=""), ocr=Part(text=""), dist=Distance(), ops=[])], 1), + ( + [ + Match( + gt=Part(text="abee"), + ocr=Part("ac"), + dist=Distance(match=1, replace=1, delete=2), + ops=[], + ), + Match( + gt=Part(text="cd"), + ocr=Part("ceff"), + dist=Distance(match=1, replace=1, insert=2), + ops=[], + ), + ], + 1 - 6 / 6, + ), + ], +) def test_character_accuracy_matches(matches, expected_dist): assert character_accuracy_for_matches(matches) == pytest.approx(expected_dist) -@pytest.mark.parametrize("dist,expected_dist", [ - (Distance(), 1), - (Distance(match=1), 1), - (Distance(replace=1), 0), - (Distance(match=1, insert=1), 0), - (Distance(match=1, insert=2), 1 - 2 / 1), - (Distance(match=2, insert=1), 0.5), - (Distance(match=1, delete=1), 0.5), -]) +@pytest.mark.parametrize( + "dist,expected_dist", + [ + (Distance(), 1), + (Distance(match=1), 1), + (Distance(replace=1), 0), + (Distance(match=1, insert=1), 0), + (Distance(match=1, 
insert=2), 1 - 2 / 1), + (Distance(match=2, insert=1), 0.5), + (Distance(match=1, delete=1), 0.5), + ], +) def test_character_accuracy_dist(dist, expected_dist): assert character_accuracy(dist) == pytest.approx(expected_dist) -@pytest.mark.parametrize("line,subline,expected_rest", [ - (Part(), Part(), []), - (Part("aaa bbb"), Part("aaa bbb"), []), - (Part("aaa bbb"), Part("aaa"), [Part(" bbb", start=3)]), - (Part("aaa bbb"), Part("bbb", start=4), [Part("aaa ")]), - (Part("aaa bbb", start=3), Part("aaa", start=3), [Part(" bbb", start=6)]), - (Part("aaa bbb", start=3), Part("bbb", start=7), [Part("aaa ", start=3)]), - (Part("aaa bbb ccc"), Part("bbb", start=4), [Part("aaa "), Part(" ccc", start=7)]), - (Part("aaa bbb ccc", start=3), Part("bbb", start=7), - [Part("aaa ", start=3), Part(" ccc", start=10)]), - (Part("aaa bbb"), Part(" ", start=3), [Part("aaa"), Part("bbb", start=4)]), - (Part("aaa bbb", start=3), Part(" ", start=6), - [Part("aaa", start=3), Part("bbb", start=7)]), -]) +@pytest.mark.parametrize( + "line,subline,expected_rest", + [ + (Part(), Part(), []), + (Part("aaa bbb"), Part("aaa bbb"), []), + (Part("aaa bbb"), Part("aaa"), [Part(" bbb", start=3)]), + (Part("aaa bbb"), Part("bbb", start=4), [Part("aaa ")]), + (Part("aaa bbb", start=3), Part("aaa", start=3), [Part(" bbb", start=6)]), + (Part("aaa bbb", start=3), Part("bbb", start=7), [Part("aaa ", start=3)]), + ( + Part("aaa bbb ccc"), + Part("bbb", start=4), + [Part("aaa "), Part(" ccc", start=7)], + ), + ( + Part("aaa bbb ccc", start=3), + Part("bbb", start=7), + [Part("aaa ", start=3), Part(" ccc", start=10)], + ), + (Part("aaa bbb"), Part(" ", start=3), [Part("aaa"), Part("bbb", start=4)]), + ( + Part("aaa bbb", start=3), + Part(" ", start=6), + [Part("aaa", start=3), Part("bbb", start=7)], + ), + ], +) def test_split_line(line, subline, expected_rest): rest = line.split(subline) assert len(rest) == len(expected_rest) @@ -300,12 +382,15 @@ def test_combine_lines(): assert False 
-@pytest.mark.parametrize("line,start,end,expected", [ - (Part(text=""), 0, None, Part(text="")), - (Part(text="a"), 0, None, Part(text="a")), - (Part(text="ab"), 0, 1, Part(text="a")), - (Part(text="abc"), 0, -1, Part(text="ab")), - (Part(text="ab"), 1, None, Part(text="b", start=1)), -]) +@pytest.mark.parametrize( + "line,start,end,expected", + [ + (Part(text=""), 0, None, Part(text="")), + (Part(text="a"), 0, None, Part(text="a")), + (Part(text="ab"), 0, 1, Part(text="a")), + (Part(text="abc"), 0, -1, Part(text="ab")), + (Part(text="ab"), 1, None, Part(text="b", start=1)), + ], +) def test_line_substring(line, start, end, expected): assert line.substring(rel_start=start, rel_end=end) == expected From 4a87adc2c76db67bdb566c749569fdca8e904ad2 Mon Sep 17 00:00:00 2001 From: Benjamin Rosemann Date: Tue, 10 Nov 2020 17:18:09 +0100 Subject: [PATCH 04/21] Implement version specific data structures As ocr-d continues the support for Python 3.5 until the end of this year version specific data structures have been implemented. When the support for Python 3.5 is dropped the extra file can easily be removed. --- .../flexible_character_accuracy.py | 91 +++++++------------ .../flexible_character_accuracy_ds.py | 48 ++++++++++ .../flexible_character_accuracy_ds_35.py | 76 ++++++++++++++++ 3 files changed, 157 insertions(+), 58 deletions(-) create mode 100644 qurator/dinglehopper/flexible_character_accuracy_ds.py create mode 100644 qurator/dinglehopper/flexible_character_accuracy_ds_35.py diff --git a/qurator/dinglehopper/flexible_character_accuracy.py b/qurator/dinglehopper/flexible_character_accuracy.py index e81ef54..7b29684 100644 --- a/qurator/dinglehopper/flexible_character_accuracy.py +++ b/qurator/dinglehopper/flexible_character_accuracy.py @@ -11,15 +11,31 @@ Note that we deviated from the original algorithm at some places. 
""" +import sys from collections import Counter from functools import lru_cache, reduce from itertools import product, takewhile -from typing import List, NamedTuple, Tuple, Optional +from typing import List, Tuple, Optional from . import editops +if sys.version_info.minor == 5: + from .flexible_character_accuracy_ds_35 import ( + PartVersionSpecific, + Match, + Distance, + Coefficients, + ) +else: + from .flexible_character_accuracy_ds import ( + PartVersionSpecific, + Match, + Distance, + Coefficients, + ) + -def flexible_character_accuracy(gt: str, ocr: str) -> Tuple[float, List["Match"]]: +def flexible_character_accuracy(gt: str, ocr: str) -> Tuple[float, List[Match]]: """Calculate the flexible character accuracy. Reference: contains steps 1-7 of the flexible character accuracy algorithm. @@ -53,7 +69,7 @@ def flexible_character_accuracy(gt: str, ocr: str) -> Tuple[float, List["Match"] return best_score, best_matches -def match_with_coefficients(gt: str, ocr: str, coef: "Coefficients") -> List["Match"]: +def match_with_coefficients(gt: str, ocr: str, coef: Coefficients) -> List[Match]: """Match ground truth with ocr and considers a given set of coefficients. Reference: contains steps 1 - 6 of the flexible character accuracy algorithm. @@ -88,8 +104,8 @@ def match_with_coefficients(gt: str, ocr: str, coef: "Coefficients") -> List["Ma def match_longest_gt_lines( - gt_lines: List["Part"], ocr_lines: List["Part"], coef: "Coefficients" -) -> Optional["Match"]: + gt_lines: List["Part"], ocr_lines: List["Part"], coef: Coefficients +) -> Optional[Match]: """Find the best match for the longest line(s) in ground truth. 
The longest lines in ground truth are matched against lines in ocr to find the @@ -127,8 +143,8 @@ def match_longest_gt_lines( def match_gt_line( - gt_line: "Part", ocr_lines: List["Part"], coef: "Coefficients" -) -> Tuple[Optional["Match"], Optional["Part"]]: + gt_line: "Part", ocr_lines: List["Part"], coef: Coefficients +) -> Tuple[Optional[Match], Optional["Part"]]: """Match the given ground truth line against all the lines in ocr. Reference: contains steps 3 of the flexible character accuracy algorithm. @@ -166,7 +182,7 @@ def remove_or_split(original: "Part", match: "Part", lines: List["Part"]) -> boo @lru_cache(maxsize=1000000) -def match_lines(gt_line: "Part", ocr_line: "Part") -> Optional["Match"]: +def match_lines(gt_line: "Part", ocr_line: "Part") -> Optional[Match]: """Matches two lines searching for a local alignment. The shorter line is moved along the longer line @@ -228,7 +244,7 @@ def match_lines(gt_line: "Part", ocr_line: "Part") -> Optional["Match"]: @lru_cache(maxsize=1000000) -def distance(gt: "Part", ocr: "Part") -> "Match": +def distance(gt: "Part", ocr: "Part") -> Match: """Calculate the editing distance between the two lines. Using the already available `editops()` function with the Levenshtein distance. @@ -244,7 +260,7 @@ def distance(gt: "Part", ocr: "Part") -> "Match": return Match(gt=gt, ocr=ocr, dist=Distance(**edits), ops=ops) -def score_edit_distance(match: "Match") -> int: +def score_edit_distance(match: Match) -> int: """Calculate edit distance for a match. Formula: $deletes + inserts + 2 * replacements$ @@ -254,9 +270,7 @@ def score_edit_distance(match: "Match") -> int: return match.dist.delete + match.dist.insert + 2 * match.dist.replace -def calculate_penalty( - gt: "Part", ocr: "Part", match: "Match", coef: "Coefficients" -) -> float: +def calculate_penalty(gt: "Part", ocr: "Part", match: Match, coef: Coefficients) -> float: """Calculate the penalty for a given match. 
For details and discussion see Section 3 in doi:10.1016/j.patrec.2020.02.003. @@ -278,21 +292,21 @@ def calculate_penalty( ) -def character_accuracy_for_matches(matches: List["Match"]) -> float: +def character_accuracy_for_matches(matches: List[Match]) -> float: """Character accuracy of a full text represented by a list of matches. See other `character_accuracy` for details. """ - agg: Counter = reduce( + agg = reduce( lambda acc, match: acc + Counter(match.dist._asdict()), matches, Counter() - ) + ) # type: Counter score = character_accuracy(Distance(**agg)) return score -def character_accuracy(edits: "Distance") -> float: +def character_accuracy(edits: Distance) -> float: """Character accuracy calculated by necessary edit operations. Edit operations are needed edits to transform one text into another. @@ -335,7 +349,7 @@ def initialize_lines(text: str) -> List["Part"]: return lines -def combine_lines(matches: List["Match"]) -> Tuple[str, str]: +def combine_lines(matches: List[Match]) -> Tuple[str, str]: """Combines the matches to aligned texts. TODO: just hacked, needs tests and refinement. Also missing insert/delete marking. @@ -356,16 +370,7 @@ def combine_lines(matches: List["Match"]) -> Tuple[str, str]: return gt, ocr -class Part(NamedTuple): - """Represent a line or part of a line. - - This data object is maintained to be able to reproduce the original text. 
- """ - - text: str = "" - line: int = 0 - start: int = 0 - +class Part(PartVersionSpecific): @property def end(self) -> int: return self.start + self.length @@ -402,33 +407,3 @@ def substring(self, rel_start: int = 0, rel_end: int = None) -> "Part": text = self.text[rel_start:rel_end] start = self.start + rel_start return Part(text=text, line=self.line, start=start) - - -class Distance(NamedTuple): - """Represent distance between two sequences.""" - - match: int = 0 - replace: int = 0 - delete: int = 0 - insert: int = 0 - - -class Match(NamedTuple): - """Represent a calculated match between ground truth and the ocr result.""" - - gt: "Part" - ocr: "Part" - dist: "Distance" - ops: List - - -class Coefficients(NamedTuple): - """Coefficients to calculate penalty for substrings. - - See Section 3 in doi:10.1016/j.patrec.2020.02.003 - """ - - edit_dist: int = 25 - length_diff: int = 20 - offset: int = 1 - length: int = 4 diff --git a/qurator/dinglehopper/flexible_character_accuracy_ds.py b/qurator/dinglehopper/flexible_character_accuracy_ds.py new file mode 100644 index 0000000..ac5595c --- /dev/null +++ b/qurator/dinglehopper/flexible_character_accuracy_ds.py @@ -0,0 +1,48 @@ +""" +Datastructures to be used with the Flexible Character Accuracy Algorithm + +Separated because of version compatibility issues with Python 3.5. +""" + +from typing import List, NamedTuple + + +class PartVersionSpecific(NamedTuple): + """Represent a line or part of a line. + + This data object is maintained to be able to reproduce the original text. 
+ """ + + text: str = "" + line: int = 0 + start: int = 0 + + +class Distance(NamedTuple): + """Represent distance between two sequences.""" + + match: int = 0 + replace: int = 0 + delete: int = 0 + insert: int = 0 + + +class Match(NamedTuple): + """Represent a calculated match between ground truth and the ocr result.""" + + gt: "Part" + ocr: "Part" + dist: "Distance" + ops: List + + +class Coefficients(NamedTuple): + """Coefficients to calculate penalty for substrings. + + See Section 3 in doi:10.1016/j.patrec.2020.02.003 + """ + + edit_dist: int = 25 + length_diff: int = 20 + offset: int = 1 + length: int = 4 diff --git a/qurator/dinglehopper/flexible_character_accuracy_ds_35.py b/qurator/dinglehopper/flexible_character_accuracy_ds_35.py new file mode 100644 index 0000000..61b924e --- /dev/null +++ b/qurator/dinglehopper/flexible_character_accuracy_ds_35.py @@ -0,0 +1,76 @@ +""" +Datastructures to be used with the Flexible Character Accuracy Algorithm + +Separated because of version compatibility issues with Python 3.5. 
+""" + +from collections import namedtuple +from typing import Dict + + +class PartVersionSpecific: + def __init__(self, text: str = "", line: int = 0, start: int = 0): + self.text = text + self.line = line + self.start = start + + def __eq__(self, other): + return ( + self.line == other.line + and self.start == other.start + and self.text == other.text + ) + + def __hash__(self): + return hash(self.text) ^ hash(self.line) ^ hash(self.start) + + +class Distance: + def __init__( + self, match: int = 0, replace: int = 0, delete: int = 0, insert: int = 0 + ): + self.match = match + self.replace = replace + self.delete = delete + self.insert = insert + + def _asdict(self) -> Dict: + return { + "match": self.match, + "replace": self.replace, + "delete": self.delete, + "insert": self.insert, + } + + def __eq__(self, other): + return ( + self.match == other.match + and self.replace == other.replace + and self.delete == other.delete + and self.insert == other.insert + ) + + def __hash__(self): + return ( + hash(self.match) + ^ hash(self.replace) + ^ hash(self.delete) + ^ hash(self.insert) + ) + + +Match = namedtuple("Match", ["gt", "ocr", "dist", "ops"]) + + +class Coefficients: + def __init__( + self, + edit_dist: int = 25, + length_diff: int = 20, + offset: int = 1, + length: int = 4, + ): + self.edit_dist = edit_dist + self.length_diff = length_diff + self.offset = offset + self.length = length From 26fe98dde7a0d2166b7d2da513e8859eec5ae601 Mon Sep 17 00:00:00 2001 From: Benjamin Rosemann Date: Wed, 11 Nov 2020 11:13:24 +0100 Subject: [PATCH 05/21] Readd pytest.ini --- pytest.ini | 4 ++++ 1 file changed, 4 insertions(+) create mode 100644 pytest.ini diff --git a/pytest.ini b/pytest.ini new file mode 100644 index 0000000..c56273f --- /dev/null +++ b/pytest.ini @@ -0,0 +1,4 @@ +[pytest] +markers = + integration: integration tests + serial From 9b76539936bc3989079a20cda269fe0b469affa1 Mon Sep 17 00:00:00 2001 From: Benjamin Rosemann Date: Wed, 11 Nov 2020 11:13:56 +0100 
Subject: [PATCH 06/21] Fix numpy version conflict with ocrd_utils --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 7bb53ac..61499dc 100644 --- a/requirements.txt +++ b/requirements.txt @@ -2,7 +2,7 @@ click jinja2 lxml uniseg -numpy +numpy<1.19.0 colorama MarkupSafe ocrd >= 2.20.1 From 53064bf8335cd7a0d62a662da2a9a44083aed959 Mon Sep 17 00:00:00 2001 From: Benjamin Rosemann Date: Wed, 11 Nov 2020 11:14:44 +0100 Subject: [PATCH 07/21] Include fca as parameter and add some tests --- README.md | 20 +++--- qurator/dinglehopper/__init__.py | 1 + qurator/dinglehopper/align.py | 5 +- qurator/dinglehopper/cli.py | 64 +++++++++++++------ .../flexible_character_accuracy.py | 32 ++++++---- qurator/dinglehopper/ocrd-tool.json | 7 +- qurator/dinglehopper/templates/report.html.j2 | 19 +++++- qurator/dinglehopper/templates/report.json.j2 | 13 ++-- .../tests/test_flexible_character_accuracy.py | 47 ++++++++++++-- .../tests/test_integ_cli_valid_json.py | 26 ++++++-- .../test_integ_flexible_character_accuracy.py | 50 +++++++++++++++ 11 files changed, 219 insertions(+), 65 deletions(-) create mode 100644 qurator/dinglehopper/tests/test_integ_flexible_character_accuracy.py diff --git a/README.md b/README.md index 6d82541..8cf074b 100644 --- a/README.md +++ b/README.md @@ -35,19 +35,23 @@ Usage: dinglehopper [OPTIONS] GT OCR [REPORT_PREFIX] their text and falls back to plain text if no ALTO or PAGE is detected. The files GT and OCR are usually a ground truth document and the result of - an OCR software, but you may use dinglehopper to compare two OCR results. - In that case, use --no-metrics to disable the then meaningless metrics and - also change the color scheme from green/red to blue. + an OCR software, but you may use dinglehopper to compare two OCR results. In + that case, use --metrics='' to disable the then meaningless metrics and also + change the color scheme from green/red to blue. 
The comparison report will be written to $REPORT_PREFIX.{html,json}, where - $REPORT_PREFIX defaults to "report". The reports include the character - error rate (CER) and the word error rate (WER). + $REPORT_PREFIX defaults to "report". Depending on your configuration the + reports include the character error rate (CER), the word error rate (WER) + and the flexible character accuracy (FCA). + + The metrics can be chosen via a comma separated combination of their acronyms + like "--metrics=cer,wer,fca". By default, the text of PAGE files is extracted on 'region' level. You may use "--textequiv-level line" to extract from the level of TextLine tags. Options: - --metrics / --no-metrics Enable/disable metrics and green/red + --metrics Enable different metrics like cer, wer and fca. --textequiv-level LEVEL PAGE TextEquiv level to extract text from --progress Show progress bar --help Show this message and exit. @@ -80,12 +84,12 @@ The OCR-D processor has these parameters: | Parameter | Meaning | | ------------------------- | ------------------------------------------------------------------- | -| `-P metrics false` | Disable metrics and the green-red color scheme (default: enabled) | +| `-P metrics cer,wer` | Enable character error rate and word error rate (default) | | `-P textequiv_level line` | (PAGE) Extract text from TextLine level (default: TextRegion level) | For example: ~~~ -ocrd-dinglehopper -I ABBYY-FULLTEXT,OCR-D-OCR-CALAMARI -O OCR-D-OCR-COMPARE-ABBYY-CALAMARI -P metrics false +ocrd-dinglehopper -I ABBYY-FULLTEXT,OCR-D-OCR-CALAMARI -O OCR-D-OCR-COMPARE-ABBYY-CALAMARI -P metrics cer,wer ~~~ Developer information diff --git a/qurator/dinglehopper/__init__.py b/qurator/dinglehopper/__init__.py index 8e58101..fd309dc 100644 --- a/qurator/dinglehopper/__init__.py +++ b/qurator/dinglehopper/__init__.py @@ -3,3 +3,4 @@ from .character_error_rate import * from .word_error_rate import * from .align import * +from .flexible_character_accuracy import 
flexible_character_accuracy, split_matches diff --git a/qurator/dinglehopper/align.py b/qurator/dinglehopper/align.py index c7e7733..ede75f4 100644 --- a/qurator/dinglehopper/align.py +++ b/qurator/dinglehopper/align.py @@ -8,11 +8,12 @@ def align(t1, t2): return seq_align(s1, s2) -def seq_align(s1, s2): +def seq_align(s1, s2, ops=None): """Align general sequences.""" s1 = list(s1) s2 = list(s2) - ops = seq_editops(s1, s2) + if not ops: + ops = seq_editops(s1, s2) i = 0 j = 0 diff --git a/qurator/dinglehopper/cli.py b/qurator/dinglehopper/cli.py index 09c26f0..b717618 100644 --- a/qurator/dinglehopper/cli.py +++ b/qurator/dinglehopper/cli.py @@ -6,6 +6,7 @@ from uniseg.graphemecluster import grapheme_clusters from .character_error_rate import character_error_rate_n +from .flexible_character_accuracy import flexible_character_accuracy, split_matches from .word_error_rate import word_error_rate_n, words_normalized from .align import seq_align from .extracted_text import ExtractedText @@ -13,7 +14,7 @@ from .config import Config -def gen_diff_report(gt_in, ocr_in, css_prefix, joiner, none): +def gen_diff_report(gt_in, ocr_in, css_prefix, joiner, none, ops=None): gtx = "" ocrx = "" @@ -53,7 +54,7 @@ def format_thing(t, css_classes=None, id_=None): g_pos = 0 o_pos = 0 - for k, (g, o) in enumerate(seq_align(gt_things, ocr_things)): + for k, (g, o) in enumerate(seq_align(gt_things, ocr_things, ops=ops)): css_classes = None gt_id = None ocr_id = None @@ -83,28 +84,43 @@ def format_thing(t, css_classes=None, id_=None): ) -def process(gt, ocr, report_prefix, *, metrics=True, textequiv_level="region"): +def process(gt, ocr, report_prefix, *, metrics="cer,wer", textequiv_level="region"): """Check OCR result against GT. - The @click decorators change the signature of the decorated functions, so we keep this undecorated version and use - Click on a wrapper. 
+ The @click decorators change the signature of the decorated functions, + so we keep this undecorated version and use Click on a wrapper. """ + cer, char_diff_report, n_characters = None, None, None + wer, word_diff_report, n_words = None, None, None + fca, fca_diff_report = None, None gt_text = extract(gt, textequiv_level=textequiv_level) ocr_text = extract(ocr, textequiv_level=textequiv_level) - cer, n_characters = character_error_rate_n(gt_text, ocr_text) - wer, n_words = word_error_rate_n(gt_text, ocr_text) - - char_diff_report = gen_diff_report( - gt_text, ocr_text, css_prefix="c", joiner="", none="·" - ) + if "cer" in metrics or not metrics: + cer, n_characters = character_error_rate_n(gt_text, ocr_text) + char_diff_report = gen_diff_report( + gt_text, ocr_text, css_prefix="c", joiner="", none="·" + ) - gt_words = words_normalized(gt_text) - ocr_words = words_normalized(ocr_text) - word_diff_report = gen_diff_report( - gt_words, ocr_words, css_prefix="w", joiner=" ", none="⋯" - ) + if "wer" in metrics: + gt_words = words_normalized(gt_text) + ocr_words = words_normalized(ocr_text) + wer, n_words = word_error_rate_n(gt_text, ocr_text) + word_diff_report = gen_diff_report( + gt_words, ocr_words, css_prefix="w", joiner=" ", none="⋯" + ) + if "fca" in metrics: + fca, fca_matches = flexible_character_accuracy(gt_text.text, ocr_text.text) + fca_gt_segments, fca_ocr_segments, ops = split_matches(fca_matches) + fca_diff_report = gen_diff_report( + fca_gt_segments, + fca_ocr_segments, + css_prefix="c", + joiner="", + none="·", + ops=ops, + ) def json_float(value): """Convert a float value to an JSON float. 
@@ -137,8 +153,10 @@ def json_float(value): n_characters=n_characters, wer=wer, n_words=n_words, + fca=fca, char_diff_report=char_diff_report, word_diff_report=word_diff_report, + fca_diff_report=fca_diff_report, metrics=metrics, ).dump(out_fn) @@ -148,7 +166,9 @@ def json_float(value): @click.argument("ocr", type=click.Path(exists=True)) @click.argument("report_prefix", type=click.Path(), default="report") @click.option( - "--metrics/--no-metrics", default=True, help="Enable/disable metrics and green/red" + "--metrics", + default="cer,wer", + help="Enable different metrics like cer, wer and fca.", ) @click.option( "--textequiv-level", @@ -166,12 +186,16 @@ def main(gt, ocr, report_prefix, metrics, textequiv_level, progress): The files GT and OCR are usually a ground truth document and the result of an OCR software, but you may use dinglehopper to compare two OCR results. In - that case, use --no-metrics to disable the then meaningless metrics and also + that case, use --metrics='' to disable the then meaningless metrics and also change the color scheme from green/red to blue. The comparison report will be written to $REPORT_PREFIX.{html,json}, where - $REPORT_PREFIX defaults to "report". The reports include the character error - rate (CER) and the word error rate (WER). + $REPORT_PREFIX defaults to "report". Depending on your configuration the + reports include the character error rate (CER), the word error rate (WER) + and the flexible character accuracy (FCA). + + The metrics can be chosen via a comma separated combination of their acronyms + like "--metrics=cer,wer,fca". By default, the text of PAGE files is extracted on 'region' level. You may use "--textequiv-level line" to extract from the level of TextLine tags. 
diff --git a/qurator/dinglehopper/flexible_character_accuracy.py b/qurator/dinglehopper/flexible_character_accuracy.py index 7b29684..7865dd1 100644 --- a/qurator/dinglehopper/flexible_character_accuracy.py +++ b/qurator/dinglehopper/flexible_character_accuracy.py @@ -270,7 +270,9 @@ def score_edit_distance(match: Match) -> int: return match.dist.delete + match.dist.insert + 2 * match.dist.replace -def calculate_penalty(gt: "Part", ocr: "Part", match: Match, coef: Coefficients) -> float: +def calculate_penalty( + gt: "Part", ocr: "Part", match: Match, coef: Coefficients +) -> float: """Calculate the penalty for a given match. For details and discussion see Section 3 in doi:10.1016/j.patrec.2020.02.003. @@ -325,6 +327,8 @@ def character_accuracy(edits: Distance) -> float: if not chars and not errors: # comparison of empty strings is considered a full match score = 1.0 + elif not chars: + score = -errors else: score = 1.0 - errors / chars return score @@ -349,25 +353,25 @@ def initialize_lines(text: str) -> List["Part"]: return lines -def combine_lines(matches: List[Match]) -> Tuple[str, str]: - """Combines the matches to aligned texts. - - TODO: just hacked, needs tests and refinement. Also missing insert/delete marking. +def split_matches(matches: List[Match]) -> Tuple[List[str], List[str], List[List]]: + """Extracts text segments and editing operations in separate lists. :param matches: List of match objects. - :return: the aligned ground truth and ocr as texts. + :return: List of ground truth segments, ocr segments and editing operations. 
""" - matches.sort(key=lambda x: x.gt.line + x.gt.start / 10000) + matches = sorted(matches, key=lambda x: x.gt.line + x.gt.start / 10000) line = 0 - gt, ocr = "", "" + gt, ocr, ops = [], [], [] for match in matches: if match.gt.line > line: - gt += "\n" - ocr += "\n" - line += 1 - gt += match.gt.text - ocr += match.ocr.text - return gt, ocr + gt.append("\n") + ocr.append("\n") + ops.append([]) + line = match.gt.line + gt.append(match.gt.text) + ocr.append(match.ocr.text) + ops.append(match.ops) + return gt, ocr, ops class Part(PartVersionSpecific): diff --git a/qurator/dinglehopper/ocrd-tool.json b/qurator/dinglehopper/ocrd-tool.json index 1e2b9b0..f8d480e 100644 --- a/qurator/dinglehopper/ocrd-tool.json +++ b/qurator/dinglehopper/ocrd-tool.json @@ -19,9 +19,10 @@ ], "parameters": { "metrics": { - "type": "boolean", - "default": true, - "description": "Enable/disable metrics and green/red" + "type": "string", + "enum": ["", "cer", "wer", "fca", "cer,wer", "cer,fca", "wer,fca", "cer,wer,fca"], + "default": "cer,wer", + "description": "Enable different metrics like cer, wer and fca." }, "textequiv_level": { "type": "string", diff --git a/qurator/dinglehopper/templates/report.html.j2 b/qurator/dinglehopper/templates/report.html.j2 index 0c2f464..a194a5a 100644 --- a/qurator/dinglehopper/templates/report.html.j2 +++ b/qurator/dinglehopper/templates/report.html.j2 @@ -40,16 +40,31 @@ {% if metrics %}

Metrics

-

CER: {{ cer|round(4) }}

-

WER: {{ wer|round(4) }}

+ {% if cer %} +

CER: {{ cer|round(4) }}

+ {% endif %} + {% if wer %} +

WER: {{ wer|round(4) }}

+ {% endif %} + {% if fca %} +

FCA: {{ fca|round(4) }}

+ {% endif %} {% endif %} +{% if char_diff_report %}

Character differences

{{ char_diff_report }} +{% endif %} +{% if word_diff_report %}

Word differences

{{ word_diff_report }} +{% endif %} +{% if fca_diff_report %} +

Flexible character accuracy differences

+{{ fca_diff_report }} +{% endif %} diff --git a/qurator/dinglehopper/templates/report.json.j2 b/qurator/dinglehopper/templates/report.json.j2 index 0e8af03..b59fbba 100644 --- a/qurator/dinglehopper/templates/report.json.j2 +++ b/qurator/dinglehopper/templates/report.json.j2 @@ -1,10 +1,11 @@ { - "gt": "{{ gt }}", - "ocr": "{{ ocr }}", {% if metrics %} - "cer": {{ cer|json_float }}, - "wer": {{ wer|json_float }}, + {% if cer %}"cer": {{ cer|json_float }},{% endif %} + {% if wer %}"wer": {{ wer|json_float }},{% endif %} + {% if fca %}"fca": {{ fca|json_float }},{% endif %} + {% if n_characters %}"n_characters": {{ n_characters }},{% endif %} + {% if n_words %}"n_words": {{ n_words }},{% endif %} {% endif %} - "n_characters": {{ n_characters }}, - "n_words": {{ n_words }} + "gt": "{{ gt }}", + "ocr": "{{ ocr }}" } diff --git a/qurator/dinglehopper/tests/test_flexible_character_accuracy.py b/qurator/dinglehopper/tests/test_flexible_character_accuracy.py index dfcb1f7..2f6d702 100644 --- a/qurator/dinglehopper/tests/test_flexible_character_accuracy.py +++ b/qurator/dinglehopper/tests/test_flexible_character_accuracy.py @@ -117,13 +117,13 @@ def test_flexible_character_accuracy_simple(gt, ocr, first_line_score, all_line_ ), ( "Config II", - '1 hav\nnospecial\ntalents. Alberto\n' + "1 hav\nnospecial\ntalents. 
Alberto\n" 'I am one Emstein\npassionate\ncuriousity."', ), ( "Config III", - 'Alberto\nEmstein\n' - '1 hav\nnospecial\ntalents.\n' + "Alberto\nEmstein\n" + "1 hav\nnospecial\ntalents.\n" 'I am one\npassionate\ncuriousity."', ), ], @@ -323,6 +323,8 @@ def test_character_accuracy_matches(matches, expected_dist): (Distance(), 1), (Distance(match=1), 1), (Distance(replace=1), 0), + (Distance(delete=1), 0), + (Distance(insert=1), -1), (Distance(match=1, insert=1), 0), (Distance(match=1, insert=2), 1 - 2 / 1), (Distance(match=2, insert=1), 0.5), @@ -377,9 +379,42 @@ def test_initialize_lines(): assert lines == [line3, line1, line2] -@pytest.mark.xfail -def test_combine_lines(): - assert False +@pytest.mark.parametrize( + "matches,expected_gt,expected_ocr,expected_ops", + [ + ([], [], [], []), + ( + [Match(gt=Part(text="aaa"), ocr=Part(text="aaa"), dist=Distance(), ops=[])], + ["aaa"], + ["aaa"], + [[]], + ), + ( + [ + Match( + gt=Part(text="aaa", line=1), + ocr=Part(text="aaa"), + dist=Distance(), + ops=[], + ), + Match( + gt=Part(text="bbb", line=2), + ocr=Part(text="bbc"), + dist=Distance(), + ops=[["replace", 2]], + ), + ], + ["\n", "aaa", "\n", "bbb"], + ["\n", "aaa", "\n", "bbc"], + [[], [], [], [["replace", 2]]], + ), + ], +) +def test_split_matches(matches, expected_gt, expected_ocr, expected_ops): + gt_segments, ocr_segments, ops = split_matches(matches) + assert gt_segments == expected_gt + assert ocr_segments == expected_ocr + assert ops == expected_ops @pytest.mark.parametrize( diff --git a/qurator/dinglehopper/tests/test_integ_cli_valid_json.py b/qurator/dinglehopper/tests/test_integ_cli_valid_json.py index 9d52329..bcd30b3 100644 --- a/qurator/dinglehopper/tests/test_integ_cli_valid_json.py +++ b/qurator/dinglehopper/tests/test_integ_cli_valid_json.py @@ -1,4 +1,5 @@ import json +from itertools import combinations import pytest from .util import working_directory @@ -7,9 +8,19 @@ @pytest.mark.integration -def test_cli_json(tmp_path): 
+@pytest.mark.parametrize( + "metrics", + [ + *(("",), ("cer",), ("wer",), ("fca",)), + *combinations(("cer", "wer", "fca"), 2), + ("cer", "wer", "fca"), + ], +) +def test_cli_json(metrics, tmp_path): """Test that the cli/process() yields a loadable JSON report""" + expected_values = {"cer": 0.2, "wer": 1.0, "fca": 0.8} + with working_directory(str(tmp_path)): with open("gt.txt", "w") as gtf: gtf.write("AAAAA") @@ -18,12 +29,18 @@ def test_cli_json(tmp_path): with open("gt.txt", "r") as gtf: print(gtf.read()) - process("gt.txt", "ocr.txt", "report") + + process("gt.txt", "ocr.txt", "report", metrics=",".join(metrics)) + with open("report.json", "r") as jsonf: print(jsonf.read()) with open("report.json", "r") as jsonf: j = json.load(jsonf) - assert j["cer"] == pytest.approx(0.2) + for metric, expected_value in expected_values.items(): + if metric in metrics: + assert j[metric] == pytest.approx(expected_values[metric]) + else: + assert metric not in j.keys() @pytest.mark.integration @@ -36,7 +53,8 @@ def test_cli_json_cer_is_infinity(tmp_path): with open("ocr.txt", "w") as ocrf: ocrf.write("Not important") - process("gt.txt", "ocr.txt", "report") + process("gt.txt", "ocr.txt", "report", metrics="cer,wer,fca") with open("report.json", "r") as jsonf: j = json.load(jsonf) assert j["cer"] == pytest.approx(float("inf")) + assert j["fca"] == pytest.approx(-13) diff --git a/qurator/dinglehopper/tests/test_integ_flexible_character_accuracy.py b/qurator/dinglehopper/tests/test_integ_flexible_character_accuracy.py new file mode 100644 index 0000000..abde26c --- /dev/null +++ b/qurator/dinglehopper/tests/test_integ_flexible_character_accuracy.py @@ -0,0 +1,50 @@ +import os + +import pytest +from lxml import etree as ET + +from .. import distance, page_text +from .. 
import flexible_character_accuracy, split_matches + +data_dir = os.path.join( + os.path.dirname(os.path.abspath(__file__)), "data", "table-order" +) + + +@pytest.mark.parametrize("file", ["table-order-0002.xml", "table-no-reading-order.xml"]) +@pytest.mark.integration +def test_fac_ignoring_reading_order(file): + expected = "1\n2\n3\n4\n5\n6\n7\n8\n9" + + gt = page_text(ET.parse(os.path.join(data_dir, "table-order-0001.xml"))) + assert gt == expected + + ocr = page_text(ET.parse(os.path.join(data_dir, file))) + assert distance(gt, ocr) > 0 + + fac, matches = flexible_character_accuracy(gt, ocr) + assert fac == pytest.approx(1.0) + + gt_segments, ocr_segments, ops = split_matches(matches) + assert not any(ops) + assert "".join(gt_segments) == expected + assert "".join(ocr_segments) == expected + + +@pytest.mark.parametrize( + "file,expected_text", + [ + ("table-order-0001.xml", "1\n2\n3\n4\n5\n6\n7\n8\n9"), + ("table-order-0002.xml", "1\n4\n7\n2\n5\n8\n3\n6\n9"), + ("table-no-reading-order.xml", "5\n6\n7\n8\n9\n1\n2\n3\n4"), + ("table-unordered.xml", "5\n6\n7\n8\n9\n1\n2\n3\n4"), + ], +) +@pytest.mark.integration +def test_reading_order_settings(file, expected_text): + if "table-unordered.xml" == file: + with pytest.raises(NotImplementedError): + page_text(ET.parse(os.path.join(data_dir, file))) + else: + ocr = page_text(ET.parse(os.path.join(data_dir, file))) + assert ocr == expected_text From 750ad00d1b52cabc71fa7180fa04a7911ff38cf2 Mon Sep 17 00:00:00 2001 From: Benjamin Rosemann Date: Wed, 11 Nov 2020 17:21:56 +0100 Subject: [PATCH 08/21] Add tooltips to fca report --- qurator/dinglehopper/__init__.py | 6 ++- qurator/dinglehopper/cli.py | 41 +++++++++++++------ .../flexible_character_accuracy.py | 24 +++++++++-- .../tests/test_flexible_character_accuracy.py | 31 +++++++++++++- 4 files changed, 85 insertions(+), 17 deletions(-) diff --git a/qurator/dinglehopper/__init__.py b/qurator/dinglehopper/__init__.py index fd309dc..dc45a8f 100644 --- 
a/qurator/dinglehopper/__init__.py +++ b/qurator/dinglehopper/__init__.py @@ -3,4 +3,8 @@ from .character_error_rate import * from .word_error_rate import * from .align import * -from .flexible_character_accuracy import flexible_character_accuracy, split_matches +from .flexible_character_accuracy import ( + flexible_character_accuracy, + split_matches, + Match, +) diff --git a/qurator/dinglehopper/cli.py b/qurator/dinglehopper/cli.py index b717618..46fc0b0 100644 --- a/qurator/dinglehopper/cli.py +++ b/qurator/dinglehopper/cli.py @@ -14,7 +14,7 @@ from .config import Config -def gen_diff_report(gt_in, ocr_in, css_prefix, joiner, none, ops=None): +def gen_diff_report(gt_in, ocr_in, css_prefix, joiner, none, matches=None): gtx = "" ocrx = "" @@ -42,7 +42,27 @@ def format_thing(t, css_classes=None, id_=None): else: return "{html_t}".format(html_t=html_t) - if isinstance(gt_in, ExtractedText): + ops, ocr_ids = None, None + if matches: + gt_things, ocr_things, ops = split_matches(matches) + # we have to reconstruct the order of the ocr because we mixed it for fca + ocr_lines = [match.ocr for match in matches] + ocr_lines_sorted = sorted(ocr_lines, key=lambda x: x.line + x.start / 10000) + + ocr_line_region_id = {} + pos = 0 + for ocr_line in ocr_lines_sorted: + if ocr_line.line not in ocr_line_region_id.keys(): + ocr_line_region_id[ocr_line.line] = ocr_in.segment_id_for_pos(pos) + pos += ocr_line.length + + ocr_ids = {None: None} + pos = 0 + for ocr_line in ocr_lines: + for _ in ocr_line.text: + ocr_ids[pos] = ocr_line_region_id[ocr_line.line] + pos += 1 + elif isinstance(gt_in, ExtractedText): if not isinstance(ocr_in, ExtractedText): raise TypeError() # XXX splitting should be done in ExtractedText @@ -61,10 +81,13 @@ def format_thing(t, css_classes=None, id_=None): if g != o: css_classes = "{css_prefix}diff{k} diff".format(css_prefix=css_prefix, k=k) if isinstance(gt_in, ExtractedText): - gt_id = gt_in.segment_id_for_pos(g_pos) if g is not None else None - ocr_id = 
ocr_in.segment_id_for_pos(o_pos) if o is not None else None # Deletions and inserts only produce one id + None, UI must # support this, i.e. display for the one id produced + gt_id = gt_in.segment_id_for_pos(g_pos) if g else None + if ocr_ids: + ocr_id = ocr_ids[o_pos] + else: + ocr_id = ocr_in.segment_id_for_pos(o_pos) if o else None gtx += joiner + format_thing(g, css_classes, gt_id) ocrx += joiner + format_thing(o, css_classes, ocr_id) @@ -111,15 +134,9 @@ def process(gt, ocr, report_prefix, *, metrics="cer,wer", textequiv_level="regio gt_words, ocr_words, css_prefix="w", joiner=" ", none="⋯" ) if "fca" in metrics: - fca, fca_matches = flexible_character_accuracy(gt_text.text, ocr_text.text) - fca_gt_segments, fca_ocr_segments, ops = split_matches(fca_matches) + fca, fca_matches = flexible_character_accuracy(gt_text, ocr_text) fca_diff_report = gen_diff_report( - fca_gt_segments, - fca_ocr_segments, - css_prefix="c", - joiner="", - none="·", - ops=ops, + gt_text, ocr_text, css_prefix="c", joiner="", none="·", matches=fca_matches ) def json_float(value): diff --git a/qurator/dinglehopper/flexible_character_accuracy.py b/qurator/dinglehopper/flexible_character_accuracy.py index 7865dd1..349384c 100644 --- a/qurator/dinglehopper/flexible_character_accuracy.py +++ b/qurator/dinglehopper/flexible_character_accuracy.py @@ -17,7 +17,9 @@ from itertools import product, takewhile from typing import List, Tuple, Optional -from . import editops +from multimethod import multimethod + +from . import editops, ExtractedText if sys.version_info.minor == 5: from .flexible_character_accuracy_ds_35 import ( @@ -35,6 +37,22 @@ ) +@multimethod +def flexible_character_accuracy( + gt: ExtractedText, ocr: ExtractedText +) -> Tuple[float, List[Match]]: + """Calculate the flexible character accuracy. + + Reference: contains steps 1-7 of the flexible character accuracy algorithm. + + :param gt: The ground truth text. + :param ocr: The text to compare the ground truth with. 
+ :return: Score between 0 and 1 and match objects. + """ + return flexible_character_accuracy(gt.text, ocr.text) + + +@multimethod def flexible_character_accuracy(gt: str, ocr: str) -> Tuple[float, List[Match]]: """Calculate the flexible character accuracy. @@ -359,7 +377,7 @@ def split_matches(matches: List[Match]) -> Tuple[List[str], List[str], List[List :param matches: List of match objects. :return: List of ground truth segments, ocr segments and editing operations. """ - matches = sorted(matches, key=lambda x: x.gt.line + x.gt.start / 10000) + matches = sorted(matches, key=lambda m: m.gt.line + m.gt.start / 10000) line = 0 gt, ocr, ops = [], [], [] for match in matches: @@ -410,4 +428,4 @@ def substring(self, rel_start: int = 0, rel_end: int = None) -> "Part": """ text = self.text[rel_start:rel_end] start = self.start + rel_start - return Part(text=text, line=self.line, start=start) + return Part(**{**self._asdict(), "text": text, "start": start}) diff --git a/qurator/dinglehopper/tests/test_flexible_character_accuracy.py b/qurator/dinglehopper/tests/test_flexible_character_accuracy.py index 2f6d702..3ade597 100644 --- a/qurator/dinglehopper/tests/test_flexible_character_accuracy.py +++ b/qurator/dinglehopper/tests/test_flexible_character_accuracy.py @@ -10,6 +10,7 @@ """ import pytest +from lxml import etree as ET from ..flexible_character_accuracy import * @@ -101,11 +102,39 @@ def extended_case_to_text(gt, ocr): @pytest.mark.parametrize(CASE_ARGS, [*SIMPLE_CASES, *COMPLEX_CASES]) -def test_flexible_character_accuracy_simple(gt, ocr, first_line_score, all_line_score): +def test_flexible_character_accuracy_str(gt, ocr, first_line_score, all_line_score): score, _ = flexible_character_accuracy(gt, ocr) assert score == pytest.approx(all_line_score) +@pytest.mark.parametrize(CASE_ARGS, [*SIMPLE_CASES, *COMPLEX_CASES]) +def test_flexible_character_accuracy_xml(gt, ocr, first_line_score, all_line_score): + def get_extracted_text(text: str): + xml = '' + ns = 
"http://schema.primaresearch.org/PAGE/gts/pagecontent/2018-07-15" + + textline_tmpl = ( + '{1}' + "" + ) + xml_tmpl = '{0}{2}' + + textlines = [ + textline_tmpl.format(i, line) for i, line in enumerate(text.splitlines()) + ] + xml_text = xml_tmpl.format(xml, ns, "".join(textlines)) + root = ET.fromstring(xml_text) + extracted_text = ExtractedText.from_text_segment( + root, {"page": ns}, textequiv_level="line" + ) + return extracted_text + + gt_text = get_extracted_text(gt) + ocr_text = get_extracted_text(ocr) + score, _ = flexible_character_accuracy(gt_text, ocr_text) + assert score == pytest.approx(all_line_score) + + @pytest.mark.parametrize( "config,ocr", [ From 1bc7ef6c8b06daa169f7e4e61043b5fc060f7d7e Mon Sep 17 00:00:00 2001 From: Benjamin Rosemann Date: Thu, 12 Nov 2020 16:23:04 +0100 Subject: [PATCH 09/21] Correct report for fca As the fca implementation already knows the editing operations for each segment we use a different sequence alignment method. --- qurator/dinglehopper/align.py | 8 ++++++++ qurator/dinglehopper/cli.py | 8 +++++--- 2 files changed, 13 insertions(+), 3 deletions(-) diff --git a/qurator/dinglehopper/align.py b/qurator/dinglehopper/align.py index ede75f4..08bb3f5 100644 --- a/qurator/dinglehopper/align.py +++ b/qurator/dinglehopper/align.py @@ -8,6 +8,14 @@ def align(t1, t2): return seq_align(s1, s2) +def seq_align_linewise(s1, s2, ops): + """Align two lists of lines linewise.""" + assert len(s1) == len(s2) + assert len(s2) == len(ops) + for l1, l2, line_ops in zip(s1, s2, ops): + yield from seq_align(l1, l2, ops=line_ops) + + def seq_align(s1, s2, ops=None): """Align general sequences.""" s1 = list(s1) diff --git a/qurator/dinglehopper/cli.py b/qurator/dinglehopper/cli.py index 46fc0b0..9a2a837 100644 --- a/qurator/dinglehopper/cli.py +++ b/qurator/dinglehopper/cli.py @@ -8,7 +8,7 @@ from .character_error_rate import character_error_rate_n from .flexible_character_accuracy import flexible_character_accuracy, split_matches from 
.word_error_rate import word_error_rate_n, words_normalized -from .align import seq_align +from .align import seq_align, seq_align_linewise from .extracted_text import ExtractedText from .ocr_files import extract from .config import Config @@ -43,7 +43,9 @@ def format_thing(t, css_classes=None, id_=None): return "{html_t}".format(html_t=html_t) ops, ocr_ids = None, None + seq_align_fun = seq_align if matches: + seq_align_fun = seq_align_linewise gt_things, ocr_things, ops = split_matches(matches) # we have to reconstruct the order of the ocr because we mixed it for fca ocr_lines = [match.ocr for match in matches] @@ -74,7 +76,7 @@ def format_thing(t, css_classes=None, id_=None): g_pos = 0 o_pos = 0 - for k, (g, o) in enumerate(seq_align(gt_things, ocr_things, ops=ops)): + for k, (g, o) in enumerate(seq_align_fun(gt_things, ocr_things, ops=ops)): css_classes = None gt_id = None ocr_id = None @@ -85,7 +87,7 @@ def format_thing(t, css_classes=None, id_=None): # support this, i.e. display for the one id produced gt_id = gt_in.segment_id_for_pos(g_pos) if g else None if ocr_ids: - ocr_id = ocr_ids[o_pos] + ocr_id = ocr_ids.get(o_pos, None) else: ocr_id = ocr_in.segment_id_for_pos(o_pos) if o else None From cac437afbf623bd2bae193df8e1c885d1bdda0fb Mon Sep 17 00:00:00 2001 From: Benjamin Rosemann Date: Thu, 12 Nov 2020 18:38:16 +0100 Subject: [PATCH 10/21] Evaluate some performance issues --- .../flexible_character_accuracy.py | 26 ++++++------ .../tests/test_flexible_character_accuracy.py | 3 +- .../test_integ_flexible_character_accuracy.py | 42 ++++++++++++++++++- 3 files changed, 55 insertions(+), 16 deletions(-) diff --git a/qurator/dinglehopper/flexible_character_accuracy.py b/qurator/dinglehopper/flexible_character_accuracy.py index 349384c..ed72764 100644 --- a/qurator/dinglehopper/flexible_character_accuracy.py +++ b/qurator/dinglehopper/flexible_character_accuracy.py @@ -148,13 +148,8 @@ def match_longest_gt_lines( # Step 4 of the flexible character accuracy 
algorithm. # Remove on full match or split. - if best_match and best_gt: - splitted = remove_or_split(best_gt, best_match.gt, gt_lines) - if splitted: - # according to the paper the match is not put back, we deviate... - gt_lines.append(best_match.gt) - best_match = None - if best_match and best_ocr: + if best_match: + remove_or_split(best_gt, best_match.gt, gt_lines) remove_or_split(best_ocr, best_match.ocr, ocr_lines) return best_match @@ -230,13 +225,9 @@ def match_lines(gt_line: "Part", ocr_line: "Part") -> Optional[Match]: for j in range(0, max(1, -1 * length_diff + 1)) ] - # add full line and empty line match - gt_parts = [*gt_parts, (0, gt_line), (0, gt_line)] - ocr_parts = [ - *ocr_parts, - (0, ocr_line), - (0, Part(text="", line=gt_line.line, start=gt_line.start)), - ] + # add full line + gt_parts = [*gt_parts, (0, gt_line)] + ocr_parts = [*ocr_parts, (0, ocr_line)] for i, gt_part in gt_parts: for j, ocr_part in ocr_parts: @@ -246,6 +237,7 @@ def match_lines(gt_line: "Part", ocr_line: "Part") -> Optional[Match]: min_edit_dist = edit_dist best_match = match best_i, best_j = i, j + # elongate at the end for handling deletes if best_match and (best_match.dist.delete or best_match.dist.replace): part_length = best_match.gt.length additional_length = best_match.dist.delete + best_match.dist.replace @@ -258,6 +250,12 @@ def match_lines(gt_line: "Part", ocr_line: "Part") -> Optional[Match]: if edit_dist < min_edit_dist: min_edit_dist = edit_dist best_match = match + # is delete a better option? 
+ match = distance(gt_line, Part(text="", line=ocr_line.line, start=ocr_line.start)) + edit_dist = score_edit_distance(match) + if edit_dist < min_edit_dist: + best_match = match + return best_match diff --git a/qurator/dinglehopper/tests/test_flexible_character_accuracy.py b/qurator/dinglehopper/tests/test_flexible_character_accuracy.py index 3ade597..9529c87 100644 --- a/qurator/dinglehopper/tests/test_flexible_character_accuracy.py +++ b/qurator/dinglehopper/tests/test_flexible_character_accuracy.py @@ -29,7 +29,7 @@ ] COMPLEX_CASES = [ - ("accc", "a\nbb\nccc", 0, 1 - 2 / 4), + ("accc", "a\nbb\nccc", 1, 1 - 2 / 4), ("aaa\nbbb\nccc", "bbb", 1, 1 - 6 / 9), ] @@ -135,6 +135,7 @@ def get_extracted_text(text: str): assert score == pytest.approx(all_line_score) +@pytest.mark.xfail(reason="Need to adapt performance details.") @pytest.mark.parametrize( "config,ocr", [ diff --git a/qurator/dinglehopper/tests/test_integ_flexible_character_accuracy.py b/qurator/dinglehopper/tests/test_integ_flexible_character_accuracy.py index abde26c..4327680 100644 --- a/qurator/dinglehopper/tests/test_integ_flexible_character_accuracy.py +++ b/qurator/dinglehopper/tests/test_integ_flexible_character_accuracy.py @@ -3,7 +3,7 @@ import pytest from lxml import etree as ET -from .. import distance, page_text +from .. import distance, page_text, extract from .. 
import flexible_character_accuracy, split_matches data_dir = os.path.join( @@ -48,3 +48,43 @@ def test_reading_order_settings(file, expected_text): else: ocr = page_text(ET.parse(os.path.join(data_dir, file))) assert ocr == expected_text + + +@pytest.mark.skip(reason="Need to check performance first.") +@pytest.mark.integration +@pytest.mark.parametrize( + "gt,ocr,expected", + [ + ( + "brochrnx_73075507X/00000139.gt.page.xml", + "brochrnx_73075507X/00000139.ocrd-tess.ocr.page.xml", + 0.93, + ), + ( + "actevedef_718448162/OCR-D-GT-PAGE/00000024.page.xml", + "actevedef_718448162/OCR-D-OCR-TESS/OCR-D-OCR-TESS_0001.xml", + 0.96, + ), + ( + "actevedef_718448162/OCR-D-GT-PAGE/00000024.page.xml", + "actevedef_718448162/OCR-D-OCR-CALAMARI/OCR-D-OCR-CALAMARI_0001.xml", + 0.97, + ), + ( + "lorem-ipsum/lorem-ipsum-scan.gt.page.xml", + "lorem-ipsum/lorem-ipsum-scan.ocr.tesseract.alto.xml", + 1.0, + ), + ( + "lorem-ipsum/lorem-ipsum-scan-bad.gt.page.xml", + "lorem-ipsum/lorem-ipsum-scan-bad.ocr.tesseract.alto.xml", + 0.98, + ), + ], +) +def test_ocr_files(gt, ocr, expected): + data_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), "data") + gt_et = extract(os.path.join(data_dir, gt)) + ocr_et = extract(os.path.join(data_dir, ocr)) + score, _ = flexible_character_accuracy(gt_et, ocr_et) + assert score == pytest.approx(expected, abs=0.01) From fd6f57a263dcf3aed6cfdc1cdc716149bf8f7a95 Mon Sep 17 00:00:00 2001 From: Benjamin Rosemann Date: Fri, 13 Nov 2020 08:54:21 +0100 Subject: [PATCH 11/21] Fix broken build on Python 3.5 --- qurator/dinglehopper/flexible_character_accuracy_ds_35.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/qurator/dinglehopper/flexible_character_accuracy_ds_35.py b/qurator/dinglehopper/flexible_character_accuracy_ds_35.py index 61b924e..17384ac 100644 --- a/qurator/dinglehopper/flexible_character_accuracy_ds_35.py +++ b/qurator/dinglehopper/flexible_character_accuracy_ds_35.py @@ -24,6 +24,13 @@ def __eq__(self, other): def 
__hash__(self): return hash(self.text) ^ hash(self.line) ^ hash(self.start) + def _asdict(self) -> Dict: + return { + "text": self.text, + "line": self.line, + "start": self.start, + } + class Distance: def __init__( From c9219cbacd58c858191944a0811d82f947cea0bf Mon Sep 17 00:00:00 2001 From: Benjamin Rosemann Date: Fri, 13 Nov 2020 09:01:33 +0100 Subject: [PATCH 12/21] Make sure that 0 cer and wer are reported --- qurator/dinglehopper/templates/report.html.j2 | 6 +++--- qurator/dinglehopper/templates/report.json.j2 | 10 +++++----- .../tests/test_integ_cli_valid_json.py | 15 +++++++++++++++ 3 files changed, 23 insertions(+), 8 deletions(-) diff --git a/qurator/dinglehopper/templates/report.html.j2 b/qurator/dinglehopper/templates/report.html.j2 index a194a5a..be764db 100644 --- a/qurator/dinglehopper/templates/report.html.j2 +++ b/qurator/dinglehopper/templates/report.html.j2 @@ -40,13 +40,13 @@ {% if metrics %}

Metrics

- {% if cer %} + {% if cer is not none %}

CER: {{ cer|round(4) }}

{% endif %} - {% if wer %} + {% if wer is not none %}

WER: {{ wer|round(4) }}

{% endif %} - {% if fca %} + {% if fca is not none %}

FCA: {{ fca|round(4) }}

{% endif %} {% endif %} diff --git a/qurator/dinglehopper/templates/report.json.j2 b/qurator/dinglehopper/templates/report.json.j2 index b59fbba..161d342 100644 --- a/qurator/dinglehopper/templates/report.json.j2 +++ b/qurator/dinglehopper/templates/report.json.j2 @@ -1,10 +1,10 @@ { {% if metrics %} - {% if cer %}"cer": {{ cer|json_float }},{% endif %} - {% if wer %}"wer": {{ wer|json_float }},{% endif %} - {% if fca %}"fca": {{ fca|json_float }},{% endif %} - {% if n_characters %}"n_characters": {{ n_characters }},{% endif %} - {% if n_words %}"n_words": {{ n_words }},{% endif %} + {% if cer is not none %}"cer": {{ cer|json_float }},{% endif %} + {% if wer is not none %}"wer": {{ wer|json_float }},{% endif %} + {% if fca is not none %}"fca": {{ fca|json_float }},{% endif %} + {% if n_characters is not none %}"n_characters": {{ n_characters }},{% endif %} + {% if n_words is not none %}"n_words": {{ n_words }},{% endif %} {% endif %} "gt": "{{ gt }}", "ocr": "{{ ocr }}" diff --git a/qurator/dinglehopper/tests/test_integ_cli_valid_json.py b/qurator/dinglehopper/tests/test_integ_cli_valid_json.py index bcd30b3..1092a92 100644 --- a/qurator/dinglehopper/tests/test_integ_cli_valid_json.py +++ b/qurator/dinglehopper/tests/test_integ_cli_valid_json.py @@ -58,3 +58,18 @@ def test_cli_json_cer_is_infinity(tmp_path): j = json.load(jsonf) assert j["cer"] == pytest.approx(float("inf")) assert j["fca"] == pytest.approx(-13) + + +def test_cli_json_cer_0_in_report(tmp_path): + """Test that the cli/process() yields a loadable JSON report when CER == 0""" + + with working_directory(str(tmp_path)): + with open("gt.txt", "w") as gtf: + gtf.write("Lorem Ipsum") + + process("gt.txt", "gt.txt", "report", metrics="cer,wer,fca") + with open("report.json", "r") as jsonf: + j = json.load(jsonf) + assert j["cer"] == pytest.approx(0) + assert j["wer"] == pytest.approx(0) + assert j["fca"] == pytest.approx(1) From 0ef7810dd0967ba86a94b559e125fbe6fda664db Mon Sep 17 00:00:00 2001 From: 
Benjamin Rosemann Date: Fri, 13 Nov 2020 11:45:55 +0100 Subject: [PATCH 13/21] Reduce number of splits for short (one char) elements --- qurator/dinglehopper/flexible_character_accuracy.py | 10 ++++++---- .../tests/test_flexible_character_accuracy.py | 4 +++- 2 files changed, 9 insertions(+), 5 deletions(-) diff --git a/qurator/dinglehopper/flexible_character_accuracy.py b/qurator/dinglehopper/flexible_character_accuracy.py index ed72764..884bf1b 100644 --- a/qurator/dinglehopper/flexible_character_accuracy.py +++ b/qurator/dinglehopper/flexible_character_accuracy.py @@ -145,9 +145,11 @@ def match_longest_gt_lines( score = 0 if not match else character_accuracy(match.dist) if score > best_score: best_score, best_match, best_gt, best_ocr = score, match, gt_line, ocr_line + # early breaking: we only need one perfect fit + if best_score >= 1: + break # Step 4 of the flexible character accuracy algorithm. - # Remove on full match or split. if best_match: remove_or_split(best_gt, best_match.gt, gt_lines) remove_or_split(best_ocr, best_match.ocr, ocr_lines) @@ -168,7 +170,7 @@ def match_gt_line( """ min_penalty = float("inf") best_match, best_ocr = None, None - for ocr_line in [*ocr_lines]: + for ocr_line in ocr_lines: match = match_lines(gt_line, ocr_line) if match: penalty = calculate_penalty(gt_line, ocr_line, match, coef) @@ -233,7 +235,7 @@ def match_lines(gt_line: "Part", ocr_line: "Part") -> Optional[Match]: for j, ocr_part in ocr_parts: match = distance(gt_part, ocr_part) edit_dist = score_edit_distance(match) - if edit_dist < min_edit_dist: + if edit_dist < min_edit_dist and match.dist.replace < min_length: min_edit_dist = edit_dist best_match = match best_i, best_j = i, j @@ -247,7 +249,7 @@ def match_lines(gt_line: "Part", ocr_line: "Part") -> Optional[Match]: ocr_line.substring(rel_start=best_j, rel_end=best_j + k), ) edit_dist = score_edit_distance(match) - if edit_dist < min_edit_dist: + if edit_dist < min_edit_dist and match.dist.replace < min_length: 
min_edit_dist = edit_dist best_match = match # is delete a better option? diff --git a/qurator/dinglehopper/tests/test_flexible_character_accuracy.py b/qurator/dinglehopper/tests/test_flexible_character_accuracy.py index 9529c87..6f30b71 100644 --- a/qurator/dinglehopper/tests/test_flexible_character_accuracy.py +++ b/qurator/dinglehopper/tests/test_flexible_character_accuracy.py @@ -26,6 +26,7 @@ ("bbb", "aaa\nbbb\nccc", 1, 1 - 6 / 3), ("a", "a\nbb\nccc", 1, 1 - 5 / 1), ("bb", "a\nbb\nccc", 1, 1 - 4 / 2), + ("abcd", "ab\ne", 1, 1 - 3 / 4), ] COMPLEX_CASES = [ @@ -135,7 +136,6 @@ def get_extracted_text(text: str): assert score == pytest.approx(all_line_score) -@pytest.mark.xfail(reason="Need to adapt performance details.") @pytest.mark.parametrize( "config,ocr", [ @@ -273,6 +273,8 @@ def test_remove_or_split(original, match, expected_lines): [ *SIMPLE_EDITS, (Part(text="a"), Part(text="b"), Distance(delete=1)), + (Part(text="ab"), Part(text="c"), Distance(delete=2)), + (Part(text="abc"), Part(text="d"), Distance(delete=3)), (Part(text="aaa"), Part(text="bbb"), Distance(delete=3)), (Part(text="aaabbbaaa"), Part(text="bbb"), Distance(match=3)), (Part(text="bbb"), Part(text="aaabbbaaa"), Distance(match=3)), From b24d8d5664edfe5c576fdd462a140b662905a70b Mon Sep 17 00:00:00 2001 From: Benjamin Rosemann Date: Fri, 13 Nov 2020 15:33:06 +0100 Subject: [PATCH 14/21] Performance increases Temporarily switch to the c-implementation of python-levenshtein for editops calculation. Also added some variables, caching and type changes for performance gains. 
--- .../flexible_character_accuracy.py | 46 +++++++++++++------ .../tests/test_flexible_character_accuracy.py | 6 +-- .../test_integ_flexible_character_accuracy.py | 1 - requirements.txt | 1 + 4 files changed, 37 insertions(+), 17 deletions(-) diff --git a/qurator/dinglehopper/flexible_character_accuracy.py b/qurator/dinglehopper/flexible_character_accuracy.py index 884bf1b..607cd39 100644 --- a/qurator/dinglehopper/flexible_character_accuracy.py +++ b/qurator/dinglehopper/flexible_character_accuracy.py @@ -17,9 +17,10 @@ from itertools import product, takewhile from typing import List, Tuple, Optional +from Levenshtein import editops from multimethod import multimethod -from . import editops, ExtractedText +from . import ExtractedText if sys.version_info.minor == 5: from .flexible_character_accuracy_ds_35 import ( @@ -170,10 +171,21 @@ def match_gt_line( """ min_penalty = float("inf") best_match, best_ocr = None, None + gt_line_length = gt_line.length + gt_line_start = gt_line.start for ocr_line in ocr_lines: match = match_lines(gt_line, ocr_line) if match: - penalty = calculate_penalty(gt_line, ocr_line, match, coef) + penalty = calculate_penalty( + gt_line_length, + ocr_line.length, + gt_line_start, + ocr_line.start, + match.gt.start, + match.ocr.start, + match.dist, + coef, + ) if penalty < min_penalty: min_penalty, best_match, best_ocr = penalty, match, ocr_line return best_match, best_ocr @@ -234,7 +246,7 @@ def match_lines(gt_line: "Part", ocr_line: "Part") -> Optional[Match]: for i, gt_part in gt_parts: for j, ocr_part in ocr_parts: match = distance(gt_part, ocr_part) - edit_dist = score_edit_distance(match) + edit_dist = score_edit_distance(match.dist) if edit_dist < min_edit_dist and match.dist.replace < min_length: min_edit_dist = edit_dist best_match = match @@ -248,13 +260,13 @@ def match_lines(gt_line: "Part", ocr_line: "Part") -> Optional[Match]: gt_line.substring(rel_start=best_i, rel_end=best_i + k), ocr_line.substring(rel_start=best_j, 
rel_end=best_j + k), ) - edit_dist = score_edit_distance(match) + edit_dist = score_edit_distance(match.dist) if edit_dist < min_edit_dist and match.dist.replace < min_length: min_edit_dist = edit_dist best_match = match # is delete a better option? match = distance(gt_line, Part(text="", line=ocr_line.line, start=ocr_line.start)) - edit_dist = score_edit_distance(match) + edit_dist = score_edit_distance(match.dist) if edit_dist < min_edit_dist: best_match = match @@ -278,18 +290,26 @@ def distance(gt: "Part", ocr: "Part") -> Match: return Match(gt=gt, ocr=ocr, dist=Distance(**edits), ops=ops) -def score_edit_distance(match: Match) -> int: +def score_edit_distance(dist: Distance) -> int: """Calculate edit distance for a match. Formula: $deletes + inserts + 2 * replacements$ :return: Sum of deletes, inserts and replacements. """ - return match.dist.delete + match.dist.insert + 2 * match.dist.replace + return dist.delete + dist.insert + 2 * dist.replace +@lru_cache(1000000) def calculate_penalty( - gt: "Part", ocr: "Part", match: Match, coef: Coefficients + gt_length: int, + ocr_length: int, + gt_start: int, + ocr_start: int, + gt_match_start: int, + ocr_match_start: int, + dist: Distance, + coef: Coefficients, ) -> float: """Calculate the penalty for a given match. @@ -297,12 +317,12 @@ def calculate_penalty( :return: Penalty for the given match. 
""" - min_edit_dist = score_edit_distance(match) - length_diff = abs(gt.length - ocr.length) - substring_length = min(gt.length, ocr.length) + min_edit_dist = score_edit_distance(dist) + length_diff = abs(gt_length - ocr_length) + substring_length = min(gt_length, ocr_length) offset = 0.0 if length_diff > 1: - substring_pos = max(match.gt.start - gt.start, match.ocr.start - ocr.start) + substring_pos = max(gt_match_start - gt_start, ocr_match_start - ocr_start) offset = length_diff / 2 - abs(substring_pos - length_diff / 2) return ( min_edit_dist * coef.edit_dist @@ -428,4 +448,4 @@ def substring(self, rel_start: int = 0, rel_end: int = None) -> "Part": """ text = self.text[rel_start:rel_end] start = self.start + rel_start - return Part(**{**self._asdict(), "text": text, "start": start}) + return Part(line=self.line, text=text, start=start) diff --git a/qurator/dinglehopper/tests/test_flexible_character_accuracy.py b/qurator/dinglehopper/tests/test_flexible_character_accuracy.py index 6f30b71..2cbfdbd 100644 --- a/qurator/dinglehopper/tests/test_flexible_character_accuracy.py +++ b/qurator/dinglehopper/tests/test_flexible_character_accuracy.py @@ -70,9 +70,9 @@ (Part(text="a"), Part(text="a"), Distance(match=1)), (Part(text="aaa"), Part(text="aaa"), Distance(match=3)), ( - Part(text="abcd"), - Part(text="beed"), - Distance(match=2, replace=1, insert=1, delete=1), + Part(text="abbbbcd"), + Part(text="bbbbede"), + Distance(match=5, replace=1, insert=1, delete=1), ), ] diff --git a/qurator/dinglehopper/tests/test_integ_flexible_character_accuracy.py b/qurator/dinglehopper/tests/test_integ_flexible_character_accuracy.py index 4327680..dec0799 100644 --- a/qurator/dinglehopper/tests/test_integ_flexible_character_accuracy.py +++ b/qurator/dinglehopper/tests/test_integ_flexible_character_accuracy.py @@ -50,7 +50,6 @@ def test_reading_order_settings(file, expected_text): assert ocr == expected_text -@pytest.mark.skip(reason="Need to check performance first.") 
@pytest.mark.integration @pytest.mark.parametrize( "gt,ocr,expected", diff --git a/requirements.txt b/requirements.txt index 61499dc..1ebf4f4 100644 --- a/requirements.txt +++ b/requirements.txt @@ -9,3 +9,4 @@ ocrd >= 2.20.1 attrs multimethod == 1.3 # latest version to officially support Python 3.5 tqdm +python-levenshtein From 0dd5fc0ee59993aebb31447bc4a77cf085989f0c Mon Sep 17 00:00:00 2001 From: Benjamin Rosemann Date: Mon, 23 Nov 2020 09:18:22 +0100 Subject: [PATCH 15/21] Small corrections --- .../flexible_character_accuracy.py | 86 ++++++++++--------- .../tests/test_flexible_character_accuracy.py | 6 +- .../test_integ_flexible_character_accuracy.py | 10 ++- 3 files changed, 54 insertions(+), 48 deletions(-) diff --git a/qurator/dinglehopper/flexible_character_accuracy.py b/qurator/dinglehopper/flexible_character_accuracy.py index 607cd39..f44c114 100644 --- a/qurator/dinglehopper/flexible_character_accuracy.py +++ b/qurator/dinglehopper/flexible_character_accuracy.py @@ -46,8 +46,8 @@ def flexible_character_accuracy( Reference: contains steps 1-7 of the flexible character accuracy algorithm. - :param gt: The ground truth text. - :param ocr: The text to compare the ground truth with. + :param gt: The ground truth ExtractedText object. + :param ocr: The ExtractedText object to compare the ground truth with. :return: Score between 0 and 1 and match objects. """ return flexible_character_accuracy(gt.text, ocr.text) @@ -66,11 +66,11 @@ def flexible_character_accuracy(gt: str, ocr: str) -> Tuple[float, List[Match]]: best_score = -float("inf") best_matches = [] - # TODO: this should be configurable + # TODO: should this be configurable? combinations = product( range(15, 31, 5), range(0, 24, 3), range(0, 4, 1), range(0, 6, 1) ) - # TODO: place to parallelize the algorithm + # TODO: place to parallelize the algorithm? 
for (edit_dist, length_diff, offset, length) in combinations: coef = Coefficients( edit_dist=edit_dist, length_diff=length_diff, offset=offset, length=length @@ -89,7 +89,7 @@ def flexible_character_accuracy(gt: str, ocr: str) -> Tuple[float, List[Match]]: def match_with_coefficients(gt: str, ocr: str, coef: Coefficients) -> List[Match]: - """Match ground truth with ocr and considers a given set of coefficients. + """Match ground truth with ocr and consider a given set of coefficients. Reference: contains steps 1 - 6 of the flexible character accuracy algorithm. @@ -128,7 +128,8 @@ def match_longest_gt_lines( """Find the best match for the longest line(s) in ground truth. The longest lines in ground truth are matched against lines in ocr to find the - best matching pair. This pair is then either considered a match on full line + best matching pair. This pair is then either considered a match on a full line + or the line(s) is splitted and the non matching parts are added back to the list. Reference: contains steps 3 and 4 of the flexible character accuracy algorithm. @@ -139,11 +140,12 @@ def match_longest_gt_lines( return best_match # Step 3 of the flexible character accuracy algorithm (variation). - # Instead of the longest line we take all longest lines with equal length. - length = min(gt_lines[0].length, ocr_lines[0].length) - for gt_line in takewhile(lambda line: line.length >= length, gt_lines): + # We do not only take the longest line from ground truth but decide on a length + # threshold and take all lines from ground truth bigger than the threshold. 
+ length_threshold = min(gt_lines[0].length, ocr_lines[0].length) - 1 + for gt_line in takewhile(lambda line: line.length > length_threshold, gt_lines): match, ocr_line = match_gt_line(gt_line, ocr_lines, coef) - score = 0 if not match else character_accuracy(match.dist) + score = -float("inf") if not match else character_accuracy(match.dist) if score > best_score: best_score, best_match, best_gt, best_ocr = score, match, gt_line, ocr_line # early breaking: we only need one perfect fit @@ -191,34 +193,17 @@ def match_gt_line( return best_match, best_ocr -def remove_or_split(original: "Part", match: "Part", lines: List["Part"]) -> bool: - """Removes the matched line or splits it into parts. - - Reference: contains step 4 of the flexible character accuracy algorithm. - - :return: True if line was splitted. - """ - splitted = False - del lines[lines.index(original)] - if match.length < original.length: - lines.extend(original.split(match)) - # sorting for ocr is not mentioned in the paper, but is used as tie breaking =) - lines.sort(key=lambda x: x.length, reverse=True) - splitted = True - return splitted - - -@lru_cache(maxsize=1000000) +@lru_cache(maxsize=10000) def match_lines(gt_line: "Part", ocr_line: "Part") -> Optional[Match]: - """Matches two lines searching for a local alignment. + """Matches two lines searching for a naive local alignment. The shorter line is moved along the longer line until the editing distance is minimized. - Reference: see figure 2 in the paper. + Reference: see figure 2 in the doi:10.1016/j.patrec.2020.02.003. TODO: make distance function configurable? - TODO: rethink @lru_cache + TODO: use @cache annotation in Python 3.9? :return: Match object if one is found. """ @@ -273,14 +258,14 @@ def match_lines(gt_line: "Part", ocr_line: "Part") -> Optional[Match]: return best_match -@lru_cache(maxsize=1000000) +@lru_cache(maxsize=10000) def distance(gt: "Part", ocr: "Part") -> Match: """Calculate the editing distance between the two lines. 
Using the already available `editops()` function with the Levenshtein distance. - TODO: replace with @cache annotation in Python 3.9 - TODO: rethink @lru_cache + TODO: use @cache annotation in Python 3.9? + TODO: wait for qurator-spk/dinglehopper#48 for efficient editops. :return: Match object containing the lines and the editing operations. """ @@ -300,7 +285,7 @@ def score_edit_distance(dist: Distance) -> int: return dist.delete + dist.insert + 2 * dist.replace -@lru_cache(1000000) +@lru_cache(10000) def calculate_penalty( gt_length: int, ocr_length: int, @@ -336,7 +321,6 @@ def character_accuracy_for_matches(matches: List[Match]) -> float: """Character accuracy of a full text represented by a list of matches. See other `character_accuracy` for details. - """ agg = reduce( lambda acc, match: acc + Counter(match.dist._asdict()), matches, Counter() @@ -355,7 +339,7 @@ def character_accuracy(edits: Distance) -> float: Errors are replacements, deletes and inserts. - Note that is is possible to have more errors than characters in which case the + Note that it is possible to have more errors than characters in which case the character accuracy turns negative. Comparing two empty strings (having no edits) results in a character accuracy of 1. @@ -391,10 +375,30 @@ def initialize_lines(text: str) -> List["Part"]: return lines -def split_matches(matches: List[Match]) -> Tuple[List[str], List[str], List[List]]: +def remove_or_split(original: "Part", match: "Part", lines: List["Part"]) -> bool: + """Removes the matched line or splits it into parts. + + Reference: contains step 4 of the flexible character accuracy algorithm. + + :return: True if line was splitted. 
+ """ + splitted = False + del lines[lines.index(original)] + if match.length < original.length: + lines.extend(original.split(match)) + # sorting for ocr is not mentioned in the paper, but is used as tie breaking =) + lines.sort(key=lambda x: x.length, reverse=True) + splitted = True + return splitted + + +def split_matches( + matches: List[Match], linesep="\n" +) -> Tuple[List[str], List[str], List[List]]: """Extracts text segments and editing operations in separate lists. :param matches: List of match objects. + :param linesep: Character(s) or line separation. :return: List of ground truth segments, ocr segments and editing operations. """ matches = sorted(matches, key=lambda m: m.gt.line + m.gt.start / 10000) @@ -402,9 +406,9 @@ def split_matches(matches: List[Match]) -> Tuple[List[str], List[str], List[List gt, ocr, ops = [], [], [] for match in matches: if match.gt.line > line: - gt.append("\n") - ocr.append("\n") - ops.append([]) + gt.append(linesep) + ocr.append(linesep) + ops.extend([[]] * len(linesep)) line = match.gt.line gt.append(match.gt.text) ocr.append(match.ocr.text) diff --git a/qurator/dinglehopper/tests/test_flexible_character_accuracy.py b/qurator/dinglehopper/tests/test_flexible_character_accuracy.py index 2cbfdbd..ad62798 100644 --- a/qurator/dinglehopper/tests/test_flexible_character_accuracy.py +++ b/qurator/dinglehopper/tests/test_flexible_character_accuracy.py @@ -80,7 +80,7 @@ def extended_case_to_text(gt, ocr): """Generate sentence from reading order encoding. - See figure 4 in 10.1016/j.patrec.2020.02.003 + See figure 4 in 10.1016/j.patrec.2020.02.003. 
""" sentence = ( "Eight", @@ -159,7 +159,7 @@ def get_extracted_text(text: str): ], ) def test_flexible_character_accuracy(config, ocr): - """Tests from figure 3 in the paper.""" + """Tests from figure 3 in the 10.1016/j.patrec.2020.02.003.""" gt = ( '"I have\nno special\ntalent.\n' 'I am only\npassionately\ncurious."\n' @@ -199,7 +199,7 @@ def test_flexible_character_accuracy(config, ocr): def test_flexible_character_accuracy_extended( gt, ocr, first_line_score, all_line_score ): - """Tests from figure 4 in the paper.""" + """Tests from figure 4 in the 10.1016/j.patrec.2020.02.003.""" gt_sentence, ocr_sentence = extended_case_to_text(gt, ocr) result, _ = flexible_character_accuracy(gt_sentence, ocr_sentence) assert result == pytest.approx(all_line_score, abs=0.001) diff --git a/qurator/dinglehopper/tests/test_integ_flexible_character_accuracy.py b/qurator/dinglehopper/tests/test_integ_flexible_character_accuracy.py index dec0799..3b3ecba 100644 --- a/qurator/dinglehopper/tests/test_integ_flexible_character_accuracy.py +++ b/qurator/dinglehopper/tests/test_integ_flexible_character_accuracy.py @@ -6,14 +6,13 @@ from .. import distance, page_text, extract from .. 
import flexible_character_accuracy, split_matches -data_dir = os.path.join( - os.path.dirname(os.path.abspath(__file__)), "data", "table-order" -) - @pytest.mark.parametrize("file", ["table-order-0002.xml", "table-no-reading-order.xml"]) @pytest.mark.integration def test_fac_ignoring_reading_order(file): + data_dir = os.path.join( + os.path.dirname(os.path.abspath(__file__)), "data", "table-order" + ) expected = "1\n2\n3\n4\n5\n6\n7\n8\n9" gt = page_text(ET.parse(os.path.join(data_dir, "table-order-0001.xml"))) @@ -42,6 +41,9 @@ def test_fac_ignoring_reading_order(file): ) @pytest.mark.integration def test_reading_order_settings(file, expected_text): + data_dir = os.path.join( + os.path.dirname(os.path.abspath(__file__)), "data", "table-order" + ) if "table-unordered.xml" == file: with pytest.raises(NotImplementedError): page_text(ET.parse(os.path.join(data_dir, file))) From 84d34f5b2671938cf2d17db998497bd21bebc9fc Mon Sep 17 00:00:00 2001 From: Benjamin Rosemann Date: Tue, 24 Nov 2020 17:10:18 +0100 Subject: [PATCH 16/21] Fix annoying logging exceptions and encoding errors. 
--- qurator/dinglehopper/extracted_text.py | 11 ++++++----- qurator/dinglehopper/ocr_files.py | 2 +- qurator/dinglehopper/tests/extracted_text_test.py | 3 +++ 3 files changed, 10 insertions(+), 6 deletions(-) diff --git a/qurator/dinglehopper/extracted_text.py b/qurator/dinglehopper/extracted_text.py index 9703b6b..c779836 100644 --- a/qurator/dinglehopper/extracted_text.py +++ b/qurator/dinglehopper/extracted_text.py @@ -1,4 +1,5 @@ import enum +import logging import re import unicodedata from contextlib import suppress @@ -8,7 +9,8 @@ import attr import numpy as np from lxml import etree as ET -from ocrd_utils import getLogger + +LOG = logging.getLogger("processor.OcrdDinglehopperEvaluate") class Normalization(enum.Enum): @@ -239,7 +241,6 @@ def get_textequiv_unicode(text_segment, nsmap) -> str: def get_first_textequiv(textequivs, segment_id): """Get the first TextEquiv based on index or conf order if index is not present.""" - log = getLogger("processor.OcrdDinglehopperEvaluate") if len(textequivs) == 1: return textequivs[0] @@ -248,20 +249,20 @@ def get_first_textequiv(textequivs, segment_id): nan_mask = np.isnan(indices) if np.any(~nan_mask): if np.any(nan_mask): - log.warning("TextEquiv without index in %s.", segment_id) + LOG.warning("TextEquiv without index in %s.", segment_id) index = np.nanargmin(indices) else: # try ordering by conf confidences = np.array([get_attr(te, "conf") for te in textequivs], dtype=float) if np.any(~np.isnan(confidences)): - log.info( + LOG.info( "No index attributes, use 'conf' attribute to sort TextEquiv in %s.", segment_id, ) index = np.nanargmax(confidences) else: # fallback to first entry in case of neither index or conf present - log.warning("No index attributes, use first TextEquiv in %s.", segment_id) + LOG.warning("No index attributes, use first TextEquiv in %s.", segment_id) index = 0 return textequivs[index] diff --git a/qurator/dinglehopper/ocr_files.py b/qurator/dinglehopper/ocr_files.py index 57ebd3f..6f2dd40 100644 
--- a/qurator/dinglehopper/ocr_files.py +++ b/qurator/dinglehopper/ocr_files.py @@ -125,7 +125,7 @@ def page_text(tree, *, textequiv_level="region"): def plain_extract(filename): - with open(filename, "r") as f: + with open(filename, "r", encoding="utf8") as f: return ExtractedText( None, [ diff --git a/qurator/dinglehopper/tests/extracted_text_test.py b/qurator/dinglehopper/tests/extracted_text_test.py index 8a81587..c39b3a3 100644 --- a/qurator/dinglehopper/tests/extracted_text_test.py +++ b/qurator/dinglehopper/tests/extracted_text_test.py @@ -4,6 +4,7 @@ import pytest from lxml import etree as ET +from ocrd_utils import getLogger from uniseg.graphemecluster import grapheme_clusters from .. import seq_align, ExtractedText @@ -117,6 +118,7 @@ def test_align(): ) def test_textequiv(attributes, expected_index, expected_log, caplog): """Test that extracting text from a PAGE TextEquiv is working without index attr.""" + getLogger("processor.OcrdDinglehopperEvaluate") caplog.set_level(logging.INFO) xml = '' ns = "http://schema.primaresearch.org/PAGE/gts/pagecontent/2018-07-15" @@ -134,6 +136,7 @@ def test_textequiv(attributes, expected_index, expected_log, caplog): result = ExtractedText.from_text_segment( root, {"page": ns}, textequiv_level="line" ).text + if expected_index is None: assert not result else: From c4f75d526495cd39b0a89fb277ad1473edcc9f68 Mon Sep 17 00:00:00 2001 From: Benjamin Rosemann Date: Tue, 24 Nov 2020 17:10:59 +0100 Subject: [PATCH 17/21] Increase cache size for bad OCR results. 
--- .../dinglehopper/flexible_character_accuracy.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/qurator/dinglehopper/flexible_character_accuracy.py b/qurator/dinglehopper/flexible_character_accuracy.py index f44c114..241ef4a 100644 --- a/qurator/dinglehopper/flexible_character_accuracy.py +++ b/qurator/dinglehopper/flexible_character_accuracy.py @@ -64,7 +64,7 @@ def flexible_character_accuracy(gt: str, ocr: str) -> Tuple[float, List[Match]]: :return: Score between 0 and 1 and match objects. """ - best_score = -float("inf") + best_score = -sys.maxsize best_matches = [] # TODO: should this be configurable? combinations = product( @@ -135,7 +135,7 @@ def match_longest_gt_lines( :return: Possible match object. """ - best_score, best_match, best_gt, best_ocr = -float("inf"), None, None, None + best_score, best_match, best_gt, best_ocr = -sys.maxsize, None, None, None if not ocr_lines: return best_match @@ -145,7 +145,7 @@ def match_longest_gt_lines( length_threshold = min(gt_lines[0].length, ocr_lines[0].length) - 1 for gt_line in takewhile(lambda line: line.length > length_threshold, gt_lines): match, ocr_line = match_gt_line(gt_line, ocr_lines, coef) - score = -float("inf") if not match else character_accuracy(match.dist) + score = -sys.maxsize if not match else character_accuracy(match.dist) if score > best_score: best_score, best_match, best_gt, best_ocr = score, match, gt_line, ocr_line # early breaking: we only need one perfect fit @@ -171,7 +171,7 @@ def match_gt_line( :return: Match object and the matched ocr line. """ - min_penalty = float("inf") + min_penalty = sys.maxsize best_match, best_ocr = None, None gt_line_length = gt_line.length gt_line_start = gt_line.start @@ -193,7 +193,7 @@ def match_gt_line( return best_match, best_ocr -@lru_cache(maxsize=10000) +@lru_cache(maxsize=100000) def match_lines(gt_line: "Part", ocr_line: "Part") -> Optional[Match]: """Matches two lines searching for a naive local alignment. 
@@ -213,7 +213,7 @@ def match_lines(gt_line: "Part", ocr_line: "Part") -> Optional[Match]: if min_length == 0: return best_match length_diff = gt_line.length - ocr_line.length - min_edit_dist = float("inf") + min_edit_dist = sys.maxsize gt_parts = [ (i, gt_line.substring(rel_start=i, rel_end=i + min_length)) @@ -258,7 +258,7 @@ def match_lines(gt_line: "Part", ocr_line: "Part") -> Optional[Match]: return best_match -@lru_cache(maxsize=10000) +@lru_cache(maxsize=100000) def distance(gt: "Part", ocr: "Part") -> Match: """Calculate the editing distance between the two lines. @@ -285,7 +285,7 @@ def score_edit_distance(dist: Distance) -> int: return dist.delete + dist.insert + 2 * dist.replace -@lru_cache(10000) +@lru_cache(100000) def calculate_penalty( gt_length: int, ocr_length: int, From b9259b9d01eae9b34c7d214326a0c078a4a44e72 Mon Sep 17 00:00:00 2001 From: Benjamin Rosemann Date: Thu, 26 Nov 2020 09:58:40 +0100 Subject: [PATCH 18/21] Add multiprocessing to flexible_character_accuracy --- .../flexible_character_accuracy.py | 68 +++++++++---------- .../tests/test_flexible_character_accuracy.py | 10 +-- 2 files changed, 39 insertions(+), 39 deletions(-) diff --git a/qurator/dinglehopper/flexible_character_accuracy.py b/qurator/dinglehopper/flexible_character_accuracy.py index 241ef4a..4ace63c 100644 --- a/qurator/dinglehopper/flexible_character_accuracy.py +++ b/qurator/dinglehopper/flexible_character_accuracy.py @@ -13,12 +13,12 @@ import sys from collections import Counter -from functools import lru_cache, reduce +from functools import lru_cache, reduce, partial from itertools import product, takewhile -from typing import List, Tuple, Optional +from multiprocessing import cpu_count, get_context +from typing import List, Tuple, Optional, Union from Levenshtein import editops -from multimethod import multimethod from . 
import ExtractedText @@ -38,57 +38,57 @@ ) -@multimethod def flexible_character_accuracy( - gt: ExtractedText, ocr: ExtractedText + gt: Union[str, ExtractedText], + ocr: Union[str, ExtractedText], + n_cpu: int = cpu_count(), ) -> Tuple[float, List[Match]]: """Calculate the flexible character accuracy. Reference: contains steps 1-7 of the flexible character accuracy algorithm. - :param gt: The ground truth ExtractedText object. - :param ocr: The ExtractedText object to compare the ground truth with. - :return: Score between 0 and 1 and match objects. - """ - return flexible_character_accuracy(gt.text, ocr.text) - - -@multimethod -def flexible_character_accuracy(gt: str, ocr: str) -> Tuple[float, List[Match]]: - """Calculate the flexible character accuracy. - - Reference: contains steps 1-7 of the flexible character accuracy algorithm. - :param gt: The ground truth text. :param ocr: The text to compare the ground truth with. + :param n_cpu: numbers of cpus to use for multiprocessing. :return: Score between 0 and 1 and match objects. """ + if isinstance(gt, ExtractedText): + gt = gt.text + if isinstance(ocr, ExtractedText): + ocr = ocr.text + best_score = -sys.maxsize best_matches = [] # TODO: should this be configurable? - combinations = product( - range(15, 31, 5), range(0, 24, 3), range(0, 4, 1), range(0, 6, 1) - ) - # TODO: place to parallelize the algorithm? - for (edit_dist, length_diff, offset, length) in combinations: - coef = Coefficients( + coeffs = ( + Coefficients( edit_dist=edit_dist, length_diff=length_diff, offset=offset, length=length ) + for edit_dist, length_diff, offset, length in product( + range(15, 31, 5), range(0, 24, 3), range(0, 4, 1), range(0, 6, 1) + ) + ) + with get_context("spawn").Pool(processes=n_cpu) as pool: # Steps 1 - 6 of the flexible character accuracy algorithm. - matches = match_with_coefficients(gt, ocr, coef) - # Step 7 of the flexible character accuracy algorithm. 
- score = character_accuracy_for_matches(matches) - if score > best_score: - best_score = score - best_matches = matches - # early breaking: we only need one perfect fit - if best_score >= 1: - break + # We only use multiprocessing if we have more than 2 cpus available. + # Otherwise the overhead for creating processes and filling caches is too big. + map_fun = partial(pool.imap_unordered, chunksize=10) if n_cpu > 2 else map + for matches in map_fun( + partial(match_with_coefficients, gt=gt, ocr=ocr), coeffs + ): + # Step 7 of the flexible character accuracy algorithm. + score = character_accuracy_for_matches(matches) + if score > best_score: + best_score = score + best_matches = matches + # early breaking: we only need one perfect fit + if best_score >= 1: + break return best_score, best_matches -def match_with_coefficients(gt: str, ocr: str, coef: Coefficients) -> List[Match]: +def match_with_coefficients(coef: Coefficients, gt: str, ocr: str) -> List[Match]: """Match ground truth with ocr and consider a given set of coefficients. Reference: contains steps 1 - 6 of the flexible character accuracy algorithm. 
diff --git a/qurator/dinglehopper/tests/test_flexible_character_accuracy.py b/qurator/dinglehopper/tests/test_flexible_character_accuracy.py index ad62798..6ef316b 100644 --- a/qurator/dinglehopper/tests/test_flexible_character_accuracy.py +++ b/qurator/dinglehopper/tests/test_flexible_character_accuracy.py @@ -104,7 +104,7 @@ def extended_case_to_text(gt, ocr): @pytest.mark.parametrize(CASE_ARGS, [*SIMPLE_CASES, *COMPLEX_CASES]) def test_flexible_character_accuracy_str(gt, ocr, first_line_score, all_line_score): - score, _ = flexible_character_accuracy(gt, ocr) + score, _ = flexible_character_accuracy(gt, ocr, 1) assert score == pytest.approx(all_line_score) @@ -132,7 +132,7 @@ def get_extracted_text(text: str): gt_text = get_extracted_text(gt) ocr_text = get_extracted_text(ocr) - score, _ = flexible_character_accuracy(gt_text, ocr_text) + score, _ = flexible_character_accuracy(gt_text, ocr_text, 1) assert score == pytest.approx(all_line_score) @@ -186,7 +186,7 @@ def test_flexible_character_accuracy(config, ocr): ) expected_score = character_accuracy(expected_dist) - result, matches = flexible_character_accuracy(gt, ocr) + result, matches = flexible_character_accuracy(gt, ocr, 1) agg = reduce( lambda acc, match: acc + Counter(match.dist._asdict()), matches, Counter() ) @@ -201,7 +201,7 @@ def test_flexible_character_accuracy_extended( ): """Tests from figure 4 in the 10.1016/j.patrec.2020.02.003.""" gt_sentence, ocr_sentence = extended_case_to_text(gt, ocr) - result, _ = flexible_character_accuracy(gt_sentence, ocr_sentence) + result, _ = flexible_character_accuracy(gt_sentence, ocr_sentence, 1) assert result == pytest.approx(all_line_score, abs=0.001) @@ -210,7 +210,7 @@ def test_match_with_coefficients(gt, ocr, first_line_score, all_line_score): coef = Coefficients() if not isinstance(gt, str): gt, ocr = extended_case_to_text(gt, ocr) - matches = match_with_coefficients(gt, ocr, coef) + matches = match_with_coefficients(coef, gt, ocr) score = 
character_accuracy_for_matches(matches) assert score == pytest.approx(all_line_score, abs=0.001) From 9e64c4f0d0bc6f398f8d69560808e7b818ed563c Mon Sep 17 00:00:00 2001 From: Benjamin Rosemann Date: Fri, 27 Nov 2020 11:31:25 +0100 Subject: [PATCH 19/21] Remove obsolete test --- .../test_integ_flexible_character_accuracy.py | 22 ------------------- 1 file changed, 22 deletions(-) diff --git a/qurator/dinglehopper/tests/test_integ_flexible_character_accuracy.py b/qurator/dinglehopper/tests/test_integ_flexible_character_accuracy.py index 3b3ecba..f7299bd 100644 --- a/qurator/dinglehopper/tests/test_integ_flexible_character_accuracy.py +++ b/qurator/dinglehopper/tests/test_integ_flexible_character_accuracy.py @@ -30,28 +30,6 @@ def test_fac_ignoring_reading_order(file): assert "".join(ocr_segments) == expected -@pytest.mark.parametrize( - "file,expected_text", - [ - ("table-order-0001.xml", "1\n2\n3\n4\n5\n6\n7\n8\n9"), - ("table-order-0002.xml", "1\n4\n7\n2\n5\n8\n3\n6\n9"), - ("table-no-reading-order.xml", "5\n6\n7\n8\n9\n1\n2\n3\n4"), - ("table-unordered.xml", "5\n6\n7\n8\n9\n1\n2\n3\n4"), - ], -) -@pytest.mark.integration -def test_reading_order_settings(file, expected_text): - data_dir = os.path.join( - os.path.dirname(os.path.abspath(__file__)), "data", "table-order" - ) - if "table-unordered.xml" == file: - with pytest.raises(NotImplementedError): - page_text(ET.parse(os.path.join(data_dir, file))) - else: - ocr = page_text(ET.parse(os.path.join(data_dir, file))) - assert ocr == expected_text - - @pytest.mark.integration @pytest.mark.parametrize( "gt,ocr,expected", From 85b784f9a1578b8a8af162c6b753937c6daded0f Mon Sep 17 00:00:00 2001 From: Benjamin Rosemann Date: Tue, 16 Feb 2021 11:23:37 +0100 Subject: [PATCH 20/21] Fix problem with json encoding --- qurator/dinglehopper/cli.py | 7 ++++++- qurator/dinglehopper/templates/report.json.j2 | 4 ++-- 2 files changed, 8 insertions(+), 3 deletions(-) diff --git a/qurator/dinglehopper/cli.py b/qurator/dinglehopper/cli.py 
index 9a2a837..c9b347f 100644 --- a/qurator/dinglehopper/cli.py +++ b/qurator/dinglehopper/cli.py @@ -1,3 +1,4 @@ +import json import os import click @@ -55,7 +56,10 @@ def format_thing(t, css_classes=None, id_=None): pos = 0 for ocr_line in ocr_lines_sorted: if ocr_line.line not in ocr_line_region_id.keys(): - ocr_line_region_id[ocr_line.line] = ocr_in.segment_id_for_pos(pos) + try: + ocr_line_region_id[ocr_line.line] = ocr_in.segment_id_for_pos(pos) + except AssertionError: + pass pos += ocr_line.length ocr_ids = {None: None} @@ -159,6 +163,7 @@ def json_float(value): ) ) env.filters["json_float"] = json_float + env.filters["json_dumps"] = json.dumps for report_suffix in (".html", ".json"): template_fn = "report" + report_suffix + ".j2" diff --git a/qurator/dinglehopper/templates/report.json.j2 b/qurator/dinglehopper/templates/report.json.j2 index 161d342..a632590 100644 --- a/qurator/dinglehopper/templates/report.json.j2 +++ b/qurator/dinglehopper/templates/report.json.j2 @@ -6,6 +6,6 @@ {% if n_characters is not none %}"n_characters": {{ n_characters }},{% endif %} {% if n_words is not none %}"n_words": {{ n_words }},{% endif %} {% endif %} - "gt": "{{ gt }}", - "ocr": "{{ ocr }}" + "gt": {{ gt|json_dumps }}, + "ocr": {{ ocr|json_dumps }} } From 675a096dfe52ea5ba248323f596f1c5152148dca Mon Sep 17 00:00:00 2001 From: Benjamin Rosemann Date: Wed, 19 May 2021 15:02:49 +0200 Subject: [PATCH 21/21] Remove restrictions on numpy --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 1ebf4f4..99172c1 100644 --- a/requirements.txt +++ b/requirements.txt @@ -2,7 +2,7 @@ click jinja2 lxml uniseg -numpy<1.19.0 +numpy colorama MarkupSafe ocrd >= 2.20.1