qurator-spk · b2m · Nov 9, 2020 · Nov 10, 2020 · Nov 10, 2020 · Nov 10, 2020
diff --git a/README.md b/README.md
@@ -35,19 +35,23 @@ Usage: dinglehopper [OPTIONS] GT OCR [REPORT_PREFIX]
   their text and falls back to plain text if no ALTO or PAGE is detected.
 
   The files GT and OCR are usually a ground truth document and the result of
-  an OCR software, but you may use dinglehopper to compare two OCR results.
-  In that case, use --no-metrics to disable the then meaningless metrics and
-  also change the color scheme from green/red to blue.
+  an OCR software, but you may use dinglehopper to compare two OCR results. In
+  that case, use --metrics='' to disable the then meaningless metrics and also
+  change the color scheme from green/red to blue.
 
   The comparison report will be written to $REPORT_PREFIX.{html,json}, where
-  $REPORT_PREFIX defaults to "report". The reports include the character
-  error rate (CER) and the word error rate (WER).
+  $REPORT_PREFIX defaults to "report". Depending on your configuration the
+  reports include the character error rate (CER), the word error rate (WER)
+  and the flexible character accuracy (FCA).
+
+  The metrics can be chosen via a comma separated combination of their acronyms
+  like "--metrics=cer,wer,fca".
 
   By default, the text of PAGE files is extracted on 'region' level. You may
   use "--textequiv-level line" to extract from the level of TextLine tags.
 
 Options:
-  --metrics / --no-metrics  Enable/disable metrics and green/red
+  --metrics TEXT            Enable different metrics like cer, wer and fca.
   --textequiv-level LEVEL   PAGE TextEquiv level to extract text from
   --progress                Show progress bar
   --help                    Show this message and exit.
@@ -80,12 +84,12 @@ The OCR-D processor has these parameters:
 
 | Parameter                 | Meaning                                                             |
 | ------------------------- | ------------------------------------------------------------------- |
-| `-P metrics false`        | Disable metrics and the green-red color scheme (default: enabled)   |
+| `-P metrics cer,wer`      | Enable character error rate and word error rate (default)           |
 | `-P textequiv_level line` | (PAGE) Extract text from TextLine level (default: TextRegion level) |
 
 For example:
 ~~~
-ocrd-dinglehopper -I ABBYY-FULLTEXT,OCR-D-OCR-CALAMARI -O OCR-D-OCR-COMPARE-ABBYY-CALAMARI -P metrics false
+ocrd-dinglehopper -I ABBYY-FULLTEXT,OCR-D-OCR-CALAMARI -O OCR-D-OCR-COMPARE-ABBYY-CALAMARI -P metrics cer,wer
 ~~~
 
 Developer information

diff --git a/qurator/dinglehopper/__init__.py b/qurator/dinglehopper/__init__.py
@@ -3,3 +3,8 @@
 from .character_error_rate import *
 from .word_error_rate import *
 from .align import *
+from .flexible_character_accuracy import (
+    flexible_character_accuracy,
+    split_matches,
+    Match,
+)
diff --git a/qurator/dinglehopper/align.py b/qurator/dinglehopper/align.py
@@ -8,11 +8,20 @@ def align(t1, t2):
     return seq_align(s1, s2)
 
 
-def seq_align(s1, s2):
+def seq_align_linewise(s1, s2, ops):
+    """Align two lists of lines linewise."""
+    assert len(s1) == len(s2)  # the lines are paired up positionally by zip() below
+    assert len(s2) == len(ops)  # each line pair needs its own entry in ops
+    for l1, l2, line_ops in zip(s1, s2, ops):
+        yield from seq_align(l1, l2, ops=line_ops)  # falsy line_ops: seq_align computes them
+
+
+def seq_align(s1, s2, ops=None):
     """Align general sequences."""
     s1 = list(s1)
     s2 = list(s2)
-    ops = seq_editops(s1, s2)
+    if not ops:
+        ops = seq_editops(s1, s2)
     i = 0
     j = 0
 

diff --git a/qurator/dinglehopper/cli.py b/qurator/dinglehopper/cli.py
@@ -1,3 +1,4 @@
+import json
 import os
 
 import click
@@ -6,14 +7,15 @@
 from uniseg.graphemecluster import grapheme_clusters
 
 from .character_error_rate import character_error_rate_n
+from .flexible_character_accuracy import flexible_character_accuracy, split_matches
 from .word_error_rate import word_error_rate_n, words_normalized
-from .align import seq_align
+from .align import seq_align, seq_align_linewise
 from .extracted_text import ExtractedText
 from .ocr_files import extract
 from .config import Config
 
 
-def gen_diff_report(gt_in, ocr_in, css_prefix, joiner, none):
+def gen_diff_report(gt_in, ocr_in, css_prefix, joiner, none, matches=None):
     gtx = ""
     ocrx = ""
 
@@ -41,7 +43,32 @@ def format_thing(t, css_classes=None, id_=None):
         else:
             return "{html_t}".format(html_t=html_t)
 
-    if isinstance(gt_in, ExtractedText):
+    ops, ocr_ids = None, None
+    seq_align_fun = seq_align
+    if matches:
+        seq_align_fun = seq_align_linewise
+        gt_things, ocr_things, ops = split_matches(matches)
+        # Restore the original order of the OCR lines, which the FCA matching mixed up.
+        ocr_lines = [match.ocr for match in matches]
+        ocr_lines_sorted = sorted(ocr_lines, key=lambda x: x.line + x.start / 10000)
+
+        ocr_line_region_id = {}
+        pos = 0
+        for ocr_line in ocr_lines_sorted:
+            if ocr_line.line not in ocr_line_region_id.keys():
+                try:
+                    ocr_line_region_id[ocr_line.line] = ocr_in.segment_id_for_pos(pos)
+                except AssertionError:
+                    pass
+            pos += ocr_line.length
+
+        ocr_ids = {None: None}
+        pos = 0
+        for ocr_line in ocr_lines:
+            for _ in ocr_line.text:
+                ocr_ids[pos] = ocr_line_region_id[ocr_line.line]
+                pos += 1
+    elif isinstance(gt_in, ExtractedText):
         if not isinstance(ocr_in, ExtractedText):
             raise TypeError()
         # XXX splitting should be done in ExtractedText
@@ -53,17 +80,20 @@ def format_thing(t, css_classes=None, id_=None):
 
     g_pos = 0
     o_pos = 0
-    for k, (g, o) in enumerate(seq_align(gt_things, ocr_things)):
+    for k, (g, o) in enumerate(seq_align_fun(gt_things, ocr_things, ops=ops)):
         css_classes = None
         gt_id = None
         ocr_id = None
         if g != o:
             css_classes = "{css_prefix}diff{k} diff".format(css_prefix=css_prefix, k=k)
             if isinstance(gt_in, ExtractedText):
-                gt_id = gt_in.segment_id_for_pos(g_pos) if g is not None else None
-                ocr_id = ocr_in.segment_id_for_pos(o_pos) if o is not None else None
                 # Deletions and inserts only produce one id + None, UI must
                 # support this, i.e. display for the one id produced
+                gt_id = gt_in.segment_id_for_pos(g_pos) if g else None
+                if ocr_ids:
+                    ocr_id = ocr_ids.get(o_pos, None)
+                else:
+                    ocr_id = ocr_in.segment_id_for_pos(o_pos) if o else None
 
         gtx += joiner + format_thing(g, css_classes, gt_id)
         ocrx += joiner + format_thing(o, css_classes, ocr_id)
@@ -83,28 +113,37 @@ def format_thing(t, css_classes=None, id_=None):
     )
 
 
-def process(gt, ocr, report_prefix, *, metrics=True, textequiv_level="region"):
+def process(gt, ocr, report_prefix, *, metrics="cer,wer", textequiv_level="region"):
     """Check OCR result against GT.
 
-    The @click decorators change the signature of the decorated functions, so we keep this undecorated version and use
-    Click on a wrapper.
+    The @click decorators change the signature of the decorated functions,
+    so we keep this undecorated version and use Click on a wrapper.
     """
+    cer, char_diff_report, n_characters = None, None, None
+    wer, word_diff_report, n_words = None, None, None
+    fca, fca_diff_report = None, None
 
     gt_text = extract(gt, textequiv_level=textequiv_level)
     ocr_text = extract(ocr, textequiv_level=textequiv_level)
 
-    cer, n_characters = character_error_rate_n(gt_text, ocr_text)
-    wer, n_words = word_error_rate_n(gt_text, ocr_text)
-
-    char_diff_report = gen_diff_report(
-        gt_text, ocr_text, css_prefix="c", joiner="", none="·"
-    )
+    if "cer" in metrics or not metrics:
+        cer, n_characters = character_error_rate_n(gt_text, ocr_text)
+        char_diff_report = gen_diff_report(
+            gt_text, ocr_text, css_prefix="c", joiner="", none="·"
+        )
 
-    gt_words = words_normalized(gt_text)
-    ocr_words = words_normalized(ocr_text)
-    word_diff_report = gen_diff_report(
-        gt_words, ocr_words, css_prefix="w", joiner=" ", none="⋯"
-    )
+    if "wer" in metrics:
+        gt_words = words_normalized(gt_text)
+        ocr_words = words_normalized(ocr_text)
+        wer, n_words = word_error_rate_n(gt_text, ocr_text)
+        word_diff_report = gen_diff_report(
+            gt_words, ocr_words, css_prefix="w", joiner=" ", none="⋯"
+        )
+    if "fca" in metrics:
+        fca, fca_matches = flexible_character_accuracy(gt_text, ocr_text)
+        fca_diff_report = gen_diff_report(
+            gt_text, ocr_text, css_prefix="c", joiner="", none="·", matches=fca_matches
+        )
 
     def json_float(value):
         """Convert a float value to an JSON float.
@@ -124,6 +163,7 @@ def json_float(value):
         )
     )
     env.filters["json_float"] = json_float
+    env.filters["json_dumps"] = json.dumps
 
     for report_suffix in (".html", ".json"):
         template_fn = "report" + report_suffix + ".j2"
@@ -137,8 +177,10 @@ def json_float(value):
             n_characters=n_characters,
             wer=wer,
             n_words=n_words,
+            fca=fca,
             char_diff_report=char_diff_report,
             word_diff_report=word_diff_report,
+            fca_diff_report=fca_diff_report,
             metrics=metrics,
         ).dump(out_fn)
 
@@ -148,7 +190,9 @@ def json_float(value):
 @click.argument("ocr", type=click.Path(exists=True))
 @click.argument("report_prefix", type=click.Path(), default="report")
 @click.option(
-    "--metrics/--no-metrics", default=True, help="Enable/disable metrics and green/red"
+    "--metrics",
+    default="cer,wer",
+    help="Enable different metrics like cer, wer and fca.",
 )
 @click.option(
     "--textequiv-level",
@@ -166,12 +210,16 @@ def main(gt, ocr, report_prefix, metrics, textequiv_level, progress):
 
     The files GT and OCR are usually a ground truth document and the result of
     an OCR software, but you may use dinglehopper to compare two OCR results. In
-    that case, use --no-metrics to disable the then meaningless metrics and also
+    that case, use --metrics='' to disable the then meaningless metrics and also
     change the color scheme from green/red to blue.
 
     The comparison report will be written to $REPORT_PREFIX.{html,json}, where
-    $REPORT_PREFIX defaults to "report". The reports include the character error
-    rate (CER) and the word error rate (WER).
+    $REPORT_PREFIX defaults to "report". Depending on your configuration the
+    reports include the character error rate (CER), the word error rate (WER)
+    and the flexible character accuracy (FCA).
+
+    The metrics can be chosen via a comma separated combination of their acronyms
+    like "--metrics=cer,wer,fca".
 
     By default, the text of PAGE files is extracted on 'region' level. You may
     use "--textequiv-level line" to extract from the level of TextLine tags.

diff --git a/qurator/dinglehopper/extracted_text.py b/qurator/dinglehopper/extracted_text.py
@@ -1,4 +1,5 @@
 import enum
+import logging
 import re
 import unicodedata
 from contextlib import suppress
@@ -8,7 +9,8 @@
 import attr
 import numpy as np
 from lxml import etree as ET
-from ocrd_utils import getLogger
+
+LOG = logging.getLogger("processor.OcrdDinglehopperEvaluate")
 
 
 class Normalization(enum.Enum):
@@ -239,7 +241,6 @@ def get_textequiv_unicode(text_segment, nsmap) -> str:
 
 def get_first_textequiv(textequivs, segment_id):
     """Get the first TextEquiv based on index or conf order if index is not present."""
-    log = getLogger("processor.OcrdDinglehopperEvaluate")
     if len(textequivs) == 1:
         return textequivs[0]
 
@@ -248,20 +249,20 @@ def get_first_textequiv(textequivs, segment_id):
     nan_mask = np.isnan(indices)
     if np.any(~nan_mask):
         if np.any(nan_mask):
-            log.warning("TextEquiv without index in %s.", segment_id)
+            LOG.warning("TextEquiv without index in %s.", segment_id)
         index = np.nanargmin(indices)
     else:
         # try ordering by conf
         confidences = np.array([get_attr(te, "conf") for te in textequivs], dtype=float)
         if np.any(~np.isnan(confidences)):
-            log.info(
+            LOG.info(
                 "No index attributes, use 'conf' attribute to sort TextEquiv in %s.",
                 segment_id,
             )
             index = np.nanargmax(confidences)
         else:
             # fallback to first entry in case of neither index or conf present
-            log.warning("No index attributes, use first TextEquiv in %s.", segment_id)
+            LOG.warning("No index attributes, use first TextEquiv in %s.", segment_id)
             index = 0
     return textequivs[index]