Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add flexible character accuracy #47

Closed
wants to merge 21 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
20 changes: 12 additions & 8 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -35,19 +35,23 @@ Usage: dinglehopper [OPTIONS] GT OCR [REPORT_PREFIX]
their text and falls back to plain text if no ALTO or PAGE is detected.

The files GT and OCR are usually a ground truth document and the result of
an OCR software, but you may use dinglehopper to compare two OCR results.
In that case, use --no-metrics to disable the then meaningless metrics and
also change the color scheme from green/red to blue.
an OCR software, but you may use dinglehopper to compare two OCR results. In
that case, use --metrics='' to disable the then meaningless metrics and also
change the color scheme from green/red to blue.

The comparison report will be written to $REPORT_PREFIX.{html,json}, where
$REPORT_PREFIX defaults to "report". The reports include the character
error rate (CER) and the word error rate (WER).
$REPORT_PREFIX defaults to "report". Depending on your configuration the
reports include the character error rate (CER), the word error rate (WER)
and the flexible character accuracy (FCA).

The metrics can be chosen via a comma separated combination of their acronyms
like "--metrics=cer,wer,fca".

By default, the text of PAGE files is extracted on 'region' level. You may
use "--textequiv-level line" to extract from the level of TextLine tags.

Options:
--metrics / --no-metrics Enable/disable metrics and green/red
--metrics Enable different metrics like cer, wer and fca.
--textequiv-level LEVEL PAGE TextEquiv level to extract text from
--progress Show progress bar
--help Show this message and exit.
Expand Down Expand Up @@ -80,12 +84,12 @@ The OCR-D processor has these parameters:

| Parameter | Meaning |
| ------------------------- | ------------------------------------------------------------------- |
| `-P metrics false` | Disable metrics and the green-red color scheme (default: enabled) |
| `-P metrics cer,wer` | Enable character error rate and word error rate (default) |
| `-P textequiv_level line` | (PAGE) Extract text from TextLine level (default: TextRegion level) |

For example:
~~~
ocrd-dinglehopper -I ABBYY-FULLTEXT,OCR-D-OCR-CALAMARI -O OCR-D-OCR-COMPARE-ABBYY-CALAMARI -P metrics false
ocrd-dinglehopper -I ABBYY-FULLTEXT,OCR-D-OCR-CALAMARI -O OCR-D-OCR-COMPARE-ABBYY-CALAMARI -P metrics cer,wer
~~~

Developer information
Expand Down
5 changes: 5 additions & 0 deletions qurator/dinglehopper/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,3 +3,8 @@
from .character_error_rate import *
from .word_error_rate import *
from .align import *
from .flexible_character_accuracy import (
flexible_character_accuracy,
split_matches,
Match,
)
13 changes: 11 additions & 2 deletions qurator/dinglehopper/align.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,11 +8,20 @@ def align(t1, t2):
return seq_align(s1, s2)


def seq_align(s1, s2):
def seq_align_linewise(s1, s2, ops):
    """Align two lists of lines linewise.

    The i-th element of ``s1`` is aligned with the i-th element of ``s2`` by
    delegating to ``seq_align``; the i-th element of ``ops`` is passed through
    as that call's ``ops`` argument. The results of all per-line alignments
    are yielded one after another as a single flat stream.

    This is a generator, so the length check below runs when iteration
    starts, not when the function is called.

    :param s1: sized sequence of lines (first side of the comparison)
    :param s2: sized sequence of lines (second side), same length as ``s1``
    :param ops: sized sequence with one entry per line pair, same length as
        ``s1`` and ``s2``
    :raises AssertionError: if the three inputs do not have the same length
    """
    # Raise explicitly instead of using ``assert``: assert statements are
    # stripped under ``python -O``, and ``zip`` would then silently truncate
    # to the shortest input and yield an incomplete alignment. The exception
    # type stays AssertionError so existing callers see the same error.
    if not len(s1) == len(s2) == len(ops):
        raise AssertionError(
            "s1, s2 and ops must have the same length, got {}, {} and {}".format(
                len(s1), len(s2), len(ops)
            )
        )
    for l1, l2, line_ops in zip(s1, s2, ops):
        yield from seq_align(l1, l2, ops=line_ops)


def seq_align(s1, s2, ops=None):
"""Align general sequences."""
s1 = list(s1)
s2 = list(s2)
ops = seq_editops(s1, s2)
if not ops:
ops = seq_editops(s1, s2)
i = 0
j = 0

Expand Down
96 changes: 72 additions & 24 deletions qurator/dinglehopper/cli.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import json
import os

import click
Expand All @@ -6,14 +7,15 @@
from uniseg.graphemecluster import grapheme_clusters

from .character_error_rate import character_error_rate_n
from .flexible_character_accuracy import flexible_character_accuracy, split_matches
from .word_error_rate import word_error_rate_n, words_normalized
from .align import seq_align
from .align import seq_align, seq_align_linewise
from .extracted_text import ExtractedText
from .ocr_files import extract
from .config import Config


def gen_diff_report(gt_in, ocr_in, css_prefix, joiner, none):
def gen_diff_report(gt_in, ocr_in, css_prefix, joiner, none, matches=None):
gtx = ""
ocrx = ""

Expand Down Expand Up @@ -41,7 +43,32 @@ def format_thing(t, css_classes=None, id_=None):
else:
return "{html_t}".format(html_t=html_t)

if isinstance(gt_in, ExtractedText):
ops, ocr_ids = None, None
seq_align_fun = seq_align
if matches:
seq_align_fun = seq_align_linewise
gt_things, ocr_things, ops = split_matches(matches)
# we have to reconstruct the order of the ocr because we mixed it for fca
ocr_lines = [match.ocr for match in matches]
ocr_lines_sorted = sorted(ocr_lines, key=lambda x: x.line + x.start / 10000)

ocr_line_region_id = {}
pos = 0
for ocr_line in ocr_lines_sorted:
if ocr_line.line not in ocr_line_region_id.keys():
try:
ocr_line_region_id[ocr_line.line] = ocr_in.segment_id_for_pos(pos)
except AssertionError:
pass
pos += ocr_line.length

ocr_ids = {None: None}
pos = 0
for ocr_line in ocr_lines:
for _ in ocr_line.text:
ocr_ids[pos] = ocr_line_region_id[ocr_line.line]
pos += 1
elif isinstance(gt_in, ExtractedText):
if not isinstance(ocr_in, ExtractedText):
raise TypeError()
# XXX splitting should be done in ExtractedText
Expand All @@ -53,17 +80,20 @@ def format_thing(t, css_classes=None, id_=None):

g_pos = 0
o_pos = 0
for k, (g, o) in enumerate(seq_align(gt_things, ocr_things)):
for k, (g, o) in enumerate(seq_align_fun(gt_things, ocr_things, ops=ops)):
css_classes = None
gt_id = None
ocr_id = None
if g != o:
css_classes = "{css_prefix}diff{k} diff".format(css_prefix=css_prefix, k=k)
if isinstance(gt_in, ExtractedText):
gt_id = gt_in.segment_id_for_pos(g_pos) if g is not None else None
ocr_id = ocr_in.segment_id_for_pos(o_pos) if o is not None else None
# Deletions and inserts only produce one id + None, UI must
# support this, i.e. display for the one id produced
gt_id = gt_in.segment_id_for_pos(g_pos) if g else None
if ocr_ids:
ocr_id = ocr_ids.get(o_pos, None)
else:
ocr_id = ocr_in.segment_id_for_pos(o_pos) if o else None

gtx += joiner + format_thing(g, css_classes, gt_id)
ocrx += joiner + format_thing(o, css_classes, ocr_id)
Expand All @@ -83,28 +113,37 @@ def format_thing(t, css_classes=None, id_=None):
)


def process(gt, ocr, report_prefix, *, metrics=True, textequiv_level="region"):
def process(gt, ocr, report_prefix, *, metrics="cer,wer", textequiv_level="region"):
"""Check OCR result against GT.

The @click decorators change the signature of the decorated functions, so we keep this undecorated version and use
Click on a wrapper.
The @click decorators change the signature of the decorated functions,
so we keep this undecorated version and use Click on a wrapper.
"""
cer, char_diff_report, n_characters = None, None, None
wer, word_diff_report, n_words = None, None, None
fca, fca_diff_report = None, None

gt_text = extract(gt, textequiv_level=textequiv_level)
ocr_text = extract(ocr, textequiv_level=textequiv_level)

cer, n_characters = character_error_rate_n(gt_text, ocr_text)
wer, n_words = word_error_rate_n(gt_text, ocr_text)

char_diff_report = gen_diff_report(
gt_text, ocr_text, css_prefix="c", joiner="", none="·"
)
if "cer" in metrics or not metrics:
cer, n_characters = character_error_rate_n(gt_text, ocr_text)
char_diff_report = gen_diff_report(
gt_text, ocr_text, css_prefix="c", joiner="", none="·"
)

gt_words = words_normalized(gt_text)
ocr_words = words_normalized(ocr_text)
word_diff_report = gen_diff_report(
gt_words, ocr_words, css_prefix="w", joiner=" ", none="⋯"
)
if "wer" in metrics:
gt_words = words_normalized(gt_text)
ocr_words = words_normalized(ocr_text)
wer, n_words = word_error_rate_n(gt_text, ocr_text)
word_diff_report = gen_diff_report(
gt_words, ocr_words, css_prefix="w", joiner=" ", none="⋯"
)
if "fca" in metrics:
fca, fca_matches = flexible_character_accuracy(gt_text, ocr_text)
fca_diff_report = gen_diff_report(
gt_text, ocr_text, css_prefix="c", joiner="", none="·", matches=fca_matches
)

def json_float(value):
"""Convert a float value to an JSON float.
Expand All @@ -124,6 +163,7 @@ def json_float(value):
)
)
env.filters["json_float"] = json_float
env.filters["json_dumps"] = json.dumps

for report_suffix in (".html", ".json"):
template_fn = "report" + report_suffix + ".j2"
Expand All @@ -137,8 +177,10 @@ def json_float(value):
n_characters=n_characters,
wer=wer,
n_words=n_words,
fca=fca,
char_diff_report=char_diff_report,
word_diff_report=word_diff_report,
fca_diff_report=fca_diff_report,
metrics=metrics,
).dump(out_fn)

Expand All @@ -148,7 +190,9 @@ def json_float(value):
@click.argument("ocr", type=click.Path(exists=True))
@click.argument("report_prefix", type=click.Path(), default="report")
@click.option(
"--metrics/--no-metrics", default=True, help="Enable/disable metrics and green/red"
"--metrics",
default="cer,wer",
help="Enable different metrics like cer, wer and fca.",
)
@click.option(
"--textequiv-level",
Expand All @@ -166,12 +210,16 @@ def main(gt, ocr, report_prefix, metrics, textequiv_level, progress):

The files GT and OCR are usually a ground truth document and the result of
an OCR software, but you may use dinglehopper to compare two OCR results. In
that case, use --no-metrics to disable the then meaningless metrics and also
that case, use --metrics='' to disable the then meaningless metrics and also
change the color scheme from green/red to blue.

The comparison report will be written to $REPORT_PREFIX.{html,json}, where
$REPORT_PREFIX defaults to "report". The reports include the character error
rate (CER) and the word error rate (WER).
$REPORT_PREFIX defaults to "report". Depending on your configuration the
reports include the character error rate (CER), the word error rate (WER)
and the flexible character accuracy (FCA).

The metrics can be chosen via a comma separated combination of their acronyms
like "--metrics=cer,wer,fca".

By default, the text of PAGE files is extracted on 'region' level. You may
use "--textequiv-level line" to extract from the level of TextLine tags.
Expand Down
11 changes: 6 additions & 5 deletions qurator/dinglehopper/extracted_text.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import enum
import logging
import re
import unicodedata
from contextlib import suppress
Expand All @@ -8,7 +9,8 @@
import attr
import numpy as np
from lxml import etree as ET
from ocrd_utils import getLogger

LOG = logging.getLogger("processor.OcrdDinglehopperEvaluate")


class Normalization(enum.Enum):
Expand Down Expand Up @@ -239,7 +241,6 @@ def get_textequiv_unicode(text_segment, nsmap) -> str:

def get_first_textequiv(textequivs, segment_id):
"""Get the first TextEquiv based on index or conf order if index is not present."""
log = getLogger("processor.OcrdDinglehopperEvaluate")
if len(textequivs) == 1:
return textequivs[0]

Expand All @@ -248,20 +249,20 @@ def get_first_textequiv(textequivs, segment_id):
nan_mask = np.isnan(indices)
if np.any(~nan_mask):
if np.any(nan_mask):
log.warning("TextEquiv without index in %s.", segment_id)
LOG.warning("TextEquiv without index in %s.", segment_id)
index = np.nanargmin(indices)
else:
# try ordering by conf
confidences = np.array([get_attr(te, "conf") for te in textequivs], dtype=float)
if np.any(~np.isnan(confidences)):
log.info(
LOG.info(
"No index attributes, use 'conf' attribute to sort TextEquiv in %s.",
segment_id,
)
index = np.nanargmax(confidences)
else:
# fallback to first entry in case neither index nor conf is present
log.warning("No index attributes, use first TextEquiv in %s.", segment_id)
LOG.warning("No index attributes, use first TextEquiv in %s.", segment_id)
index = 0
return textequivs[index]

Expand Down
Loading