Merge pull request #35 from bertsky/overwrite-lines

improve consistency and quality of results
OCR-D · May 20, 2019 · 2e5778d · 2e5778d
2 parents 8bd69a6 + 56e8f1d
commit 2e5778d
Show file tree

Hide file tree

Showing 16 changed files with 156 additions and 46 deletions.
diff --git a/.travis.yml b/.travis.yml
@@ -13,7 +13,8 @@ env:
     matrix:
         # - TESSERACT=3.04.01-1
         - TESSERACT=3.05.02-3
-        - TESSERACT=4.0.0-beta.4-2
+        # - TESSERACT=4.0.0-beta.4-2
+        - TESSERACT=4.0.0-1
 
 before_install:
     # - export TESSERACT_INSTALL=$HOME/.tesseract

diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -6,6 +6,20 @@ Versioned according to [Semantic Versioning](http://semver.org/).
 
 ## Unreleased
 
+## [0.2.2] - 2019-05-16
+
+Changed:
+
+  * Add simple page cropping processor crop
+  * Respect border cropping in segment_word
+  * Add parameter overwrite_words in recognize
+  * Make higher TextEquivs consistent after recognize
+
+Fixed:
+
+  * Remove invalid @externalRef from MetadataItem
+  * Retain pageId in output (i.e. link to structMap)
+
 ## [0.2.1] - 2019-02-28
 
 Fixed:

diff --git a/Makefile b/Makefile
@@ -39,17 +39,22 @@ help:
 # (lib*-dev merely for building tesserocr with pip)
 # (tesseract-ocr: Ubuntu 18.04 now ships 4.0.0,
 #  but some beta, not the final release,
-#  on which tesserocr 2.4.0 depends,
-#  but we can instead downgrade to 2.3.1;
-#  in the future, however, we might require
-#  a tesseract build from git)
+#  on which tesserocr 2.4.0 depends;
+#  this downloads a prebuilt tesseract 4.0.0
+#  release from GitHub and installs it
+#  system-wide - intended for Dockerfile and
+#  Travis CI, not recommended for live systems!)
 deps-ubuntu:
 	sudo apt-get install -y \
 		libxml2-utils \
 		libimage-exiftool-perl \
 		libtesseract-dev \
 		libleptonica-dev \
-		tesseract-ocr tesseract-ocr-eng
+		tesseract-ocr-eng \
+		tesseract-ocr \
+		wget
+	wget -O - https://github.com/nijel/tesseract-ocr-build/releases/download/4.0.0-1/tesseract.tar.xz | tar -xJf -
+	sudo cp -rt /usr .tesseract/*
 
 # Install python deps via pip
 deps:

diff --git a/ocrd_tesserocr/__init__.py b/ocrd_tesserocr/__init__.py
@@ -1,4 +1,13 @@
+import locale
+# circumvent tesseract-ocr issue #1670
+# (this cannot be done on the command line,
+# because Click requires a UTF-8 locale
+# in Python 3):
+# pylint: disable=wrong-import-position
+locale.setlocale(locale.LC_ALL, 'C.UTF-8')
+
 from .recognize import TesserocrRecognize
+from .segment_word import TesserocrSegmentWord
 from .segment_line import TesserocrSegmentLine
 from .segment_region import TesserocrSegmentRegion
 from .crop import TesserocrCrop
diff --git a/ocrd_tesserocr/crop.py b/ocrd_tesserocr/crop.py
@@ -76,6 +76,7 @@ def process(self):
                 self.workspace.add_file(
                     ID=ID,
                     file_grp=self.output_file_grp,
+                    pageId=input_file.pageId,
                     mimetype=MIMETYPE_PAGE,
                     local_filename='%s/%s' % (self.output_file_grp, ID),
                     content=to_xml(pcgts).encode('utf-8'),

diff --git a/ocrd_tesserocr/ocrd-tool.json b/ocrd_tesserocr/ocrd-tool.json
@@ -24,6 +24,11 @@
           "default": "line",
           "description": "PAGE XML hierarchy level to add the TextEquiv results to (requires existing layout annotation up to one level above that)"
         },
+        "overwrite_words": {
+          "type": "boolean",
+          "default": false,
+          "description": "remove existing layout and text annotation below the TextLine level (regardless of textequiv_level)"
+        },
         "model": {
           "type": "string",
           "description": "tessdata model to apply (an ISO 639-3 language specification or some other basename, e.g. deu-frak or Fraktur)"

diff --git a/ocrd_tesserocr/recognize.py b/ocrd_tesserocr/recognize.py
@@ -1,25 +1,23 @@
 from __future__ import absolute_import
+import math
 
-import locale
+from tesserocr import (
+    RIL, PSM,
+    PyTessBaseAPI, get_languages,
+    Orientation, TextlineOrder, WritingDirection)
 
-# pylint: disable=wrong-import-position
-locale.setlocale(locale.LC_ALL, 'C') # circumvent tesseract-ocr issue 1670 (which cannot be done on command line because Click requires an UTF-8 locale in Python 3)
-
-from tesserocr import RIL, PSM, PyTessBaseAPI, get_languages
-
-from ocrd_utils import getLogger, concat_padded, xywh_from_points, points_from_x0y0x1y1, MIMETYPE_PAGE
-from ocrd_modelfactory import page_from_file
+from ocrd_utils import (
+    getLogger, concat_padded,
+    polygon_from_points, xywh_from_points, points_from_x0y0x1y1,
+    MIMETYPE_PAGE)
 from ocrd_models.ocrd_page import (
     CoordsType,
-    GlyphType,
-    LabelType,
-    LabelsType,
+    GlyphType, WordType,
+    LabelType, LabelsType,
     MetadataItemType,
-    TextEquivType,
-    TextStyleType,
-
-    to_xml
-)
+    TextEquivType, TextStyleType,
+    to_xml)
+from ocrd_modelfactory import page_from_file
 from ocrd import Processor
 from .config import TESSDATA_PREFIX, OCRD_TOOL
 
@@ -37,10 +35,20 @@ def __init__(self, *args, **kwargs):
         super(TesserocrRecognize, self).__init__(*args, **kwargs)
 
     def process(self):
+        """Perform OCR recognition with Tesseract on the workspace.
+        
+        Open and deserialise PAGE input files and their respective images, 
+        then iterate over the element hierarchy down to the requested
+        `textequiv_level`. If `overwrite_words` is enabled and any layout
+        annotation below the line level already exists, then remove it
+        (regardless of `textequiv_level`).
+        Set up Tesseract to recognise each segment's image rectangle with
+        the appropriate mode and `model`. Create new elements below the line
+        level if necessary. Put text results and confidence values into new
+        TextEquiv at `textequiv_level`, and make the higher levels consistent
+        with that (by concatenation joined by whitespace). Produce new output
+        files by serialising the resulting hierarchy.
         """
-        Performs the (text) recognition.
-        """
-        # print(self.parameter)
         log.debug("TESSDATA: %s, installed tesseract models: %s", *get_languages())
         maxlevel = self.parameter['textequiv_level']
         model = get_languages()[1][-1] # last installed model
@@ -108,7 +116,9 @@ def process(self):
                     MetadataItemType(type_="processingStep",
                                      name=OCRD_TOOL['tools']['ocrd-tesserocr-recognize']['steps'][0],
                                      value='ocrd-tesserocr-recognize',
-                                     Labels=[LabelsType(externalRef="parameters",
+                                     # FIXME: externalRef is invalid by pagecontent.xsd, but ocrd does not reflect this
+                                     # what we want here is `externalModel="ocrd-tool" externalId="parameters"`
+                                     Labels=[LabelsType(#externalRef="parameters",
                                                         Label=[LabelType(type_=name,
                                                                          value=self.parameter[name])
                                                                for name in self.parameter.keys()])]))
@@ -117,10 +127,12 @@ def process(self):
                 if not regions:
                     log.warning("Page contains no text regions")
                 self._process_regions(regions, maxlevel, tessapi)
+                page_update_higher_textequiv_levels(maxlevel, pcgts)
                 ID = concat_padded(self.output_file_grp, n)
                 self.workspace.add_file(
                     ID=ID,
                     file_grp=self.output_file_grp,
+                    pageId=input_file.pageId,
                     mimetype=MIMETYPE_PAGE,
                     local_filename='%s/%s' % (self.output_file_grp, ID),
                     content=to_xml(pcgts),
@@ -167,6 +179,8 @@ def _process_regions(self, regions, maxlevel, tessapi):
 
     def _process_lines(self, textlines, maxlevel, tessapi):
         for line in textlines:
+            if self.parameter['overwrite_words']:
+                line.set_Word([])
             log.debug("Recognizing text in line '%s'", line.id)
             line_xywh = xywh_from_points(line.get_Coords().points)
             #  log.debug("xywh: %s", line_xywh)
@@ -198,7 +212,7 @@ def _process_words_in_line(self, line, maxlevel, result_it):
                 log.error("No iterator at '%s'", line.id)
                 break
             if result_it.Empty(RIL.WORD):
-                log.debug("No word here")
+                log.warning("No word in line '%s'", line.id)
                 break
             word_id = '%s_word%04d' % (line.id, word_no)
             log.debug("Recognizing text in word '%s'", word_id)
@@ -313,3 +327,36 @@ def _process_glyphs_in_word(self, word, result_it):
                 break
             else:
                 result_it.Next(RIL.SYMBOL)
+
+def page_update_higher_textequiv_levels(level, pcgts):
+    '''Update the TextEquivs of all PAGE-XML hierarchy levels above `level` for consistency.
+
+    Starting at the hierarchy level chosen for processing (`level`: 'region', 'line', 'word', else glyph),
+    join the first TextEquiv of each child element (glyphs by u'', words by u' ', lines by newline; u'' if none)
+    into a single new TextEquiv on its parent, replacing the parent's old ones. Modifies `pcgts` in place.
+    '''
+    regions = pcgts.get_Page().get_TextRegion()
+    if level != 'region':  # at 'region' level there is nothing above to update
+        for region in regions:
+            lines = region.get_TextLine()
+            if level != 'line':  # at 'line' level the line TextEquivs are left untouched
+                for line in lines:
+                    words = line.get_Word()
+                    if level != 'word':  # at 'word' level the word TextEquivs are left untouched
+                        for word in words:
+                            glyphs = word.get_Glyph()
+                            word_unicode = u''.join(glyph.get_TextEquiv()[0].Unicode
+                                                    if glyph.get_TextEquiv()
+                                                    else u'' for glyph in glyphs)
+                            word.set_TextEquiv(
+                                [TextEquivType(Unicode=word_unicode)]) # replaces all old word TextEquivs (text only)
+                    line_unicode = u' '.join(word.get_TextEquiv()[0].Unicode
+                                             if word.get_TextEquiv()
+                                             else u'' for word in words)
+                    line.set_TextEquiv(
+                        [TextEquivType(Unicode=line_unicode)]) # replaces all old line TextEquivs (text only)
+            region_unicode = u'\n'.join(line.get_TextEquiv()[0].Unicode
+                                        if line.get_TextEquiv()
+                                        else u'' for line in lines)
+            region.set_TextEquiv(
+                [TextEquivType(Unicode=region_unicode)]) # replaces all old region TextEquivs (text only)
diff --git a/ocrd_tesserocr/segment_line.py b/ocrd_tesserocr/segment_line.py
@@ -46,6 +46,7 @@ def process(self):
                 self.workspace.add_file(
                     ID=ID,
                     file_grp=self.output_file_grp,
+                    pageId=input_file.pageId,
                     mimetype=MIMETYPE_PAGE,
                     local_filename='%s/%s' % (self.output_file_grp, ID),
                     content=to_xml(pcgts).encode('utf-8'),

diff --git a/ocrd_tesserocr/segment_region.py b/ocrd_tesserocr/segment_region.py
@@ -66,6 +66,7 @@ def process(self):
                 self.workspace.add_file(
                     ID=ID,
                     file_grp=self.output_file_grp,
+                    pageId=input_file.pageId,
                     mimetype=MIMETYPE_PAGE,
                     local_filename='%s/%s' % (self.output_file_grp, ID),
                     content=to_xml(pcgts).encode('utf-8'),

diff --git a/ocrd_tesserocr/segment_word.py b/ocrd_tesserocr/segment_word.py
@@ -47,6 +47,7 @@ def process(self):
                 self.workspace.add_file(
                     ID=ID,
                     file_grp=self.output_file_grp,
+                    pageId=input_file.pageId,
                     local_filename='%s/%s' % (self.output_file_grp, ID),
                     mimetype=MIMETYPE_PAGE,
                     content=to_xml(pcgts).encode('utf-8'),

diff --git a/requirements.txt b/requirements.txt
@@ -1,3 +1,3 @@
 ocrd >= 1.0.0b5
 click
-tesserocr == 2.3.1
+tesserocr >= 2.3.1
diff --git a/setup.py b/setup.py
@@ -14,7 +14,7 @@
 
 setup(
     name='ocrd_tesserocr',
-    version='0.2.1',
+    version='0.2.2',
     description='Tesserocr bindings',
     long_description=codecs.open('README.rst', encoding='utf-8').read(),
     author='Konstantin Baierer',

diff --git a/test/test_recognize.py b/test/test_recognize.py
@@ -4,13 +4,14 @@
 from test.base import TestCase, main, assets, skip
 
 from ocrd.resolver import Resolver
-from ocrd_tesserocr.segment_word import TesserocrSegmentWord
-from ocrd_tesserocr.segment_line import TesserocrSegmentLine
-from ocrd_tesserocr.segment_region import TesserocrSegmentRegion
-from ocrd_tesserocr.recognize import TesserocrRecognize
+from ocrd_tesserocr import TesserocrSegmentWord
+from ocrd_tesserocr import TesserocrSegmentLine
+from ocrd_tesserocr import TesserocrSegmentRegion
+from ocrd_tesserocr import TesserocrRecognize
 
 #METS_HEROLD_SMALL = assets.url_of('SBB0000F29300010000/data/mets_one_file.xml')
-METS_HEROLD_SMALL = assets.url_of('kant_aufklaerung_1784/data/mets.xml')
+# as long as issue #96 remains unresolved, we cannot use workspaces that have local relative files:
+METS_HEROLD_SMALL = assets.url_of('kant_aufklaerung_1784-binarized/data/mets.xml')
 
 WORKSPACE_DIR = '/tmp/pyocrd-test-recognizer'
 

diff --git a/test/test_segment_line.py b/test/test_segment_line.py
@@ -4,8 +4,8 @@
 from test.base import TestCase, main, assets
 
 from ocrd.resolver import Resolver
-from ocrd_tesserocr.segment_region import TesserocrSegmentRegion
-from ocrd_tesserocr.segment_line import TesserocrSegmentLine
+from ocrd_tesserocr import TesserocrSegmentRegion
+from ocrd_tesserocr import TesserocrSegmentLine
 
 METS_HEROLD_SMALL = assets.url_of('SBB0000F29300010000/data/mets_one_file.xml')
 
@@ -21,9 +21,17 @@ def setUp(self):
     def runTest(self):
         resolver = Resolver()
         workspace = resolver.workspace_from_url(METS_HEROLD_SMALL, dst_dir=WORKSPACE_DIR)
-        TesserocrSegmentRegion(workspace, input_file_grp="INPUT", output_file_grp="OCR-D-SEG-BLOCK").process()
+        TesserocrSegmentRegion(
+            workspace,
+            input_file_grp="OCR-D-IMG",
+            output_file_grp="OCR-D-SEG-BLOCK"
+        ).process()
         #  workspace.save_mets()
-        TesserocrSegmentLine(workspace, input_file_grp="OCR-D-SEG-BLOCK", output_file_grp="OCR-D-SEG-LINE").process()
+        TesserocrSegmentLine(
+            workspace,
+            input_file_grp="OCR-D-SEG-BLOCK",
+            output_file_grp="OCR-D-SEG-LINE"
+        ).process()
         workspace.save_mets()
 
 if __name__ == '__main__':

diff --git a/test/test_segment_region.py b/test/test_segment_region.py
@@ -4,7 +4,7 @@
 from test.base import TestCase, main, assets
 
 from ocrd.resolver import Resolver
-from ocrd_tesserocr.segment_region import TesserocrSegmentRegion
+from ocrd_tesserocr import TesserocrSegmentRegion
 
 METS_HEROLD_SMALL = assets.url_of('SBB0000F29300010000/data/mets_one_file.xml')
 
@@ -20,7 +20,11 @@ def setUp(self):
     def runTest(self):
         resolver = Resolver()
         workspace = resolver.workspace_from_url(METS_HEROLD_SMALL, dst_dir=WORKSPACE_DIR)
-        TesserocrSegmentRegion(workspace, input_file_grp="INPUT", output_file_grp="OCR-D-SEG-BLOCK").process()
+        TesserocrSegmentRegion(
+            workspace,
+            input_file_grp="OCR-D-IMG",
+            output_file_grp="OCR-D-SEG-BLOCK"
+        ).process()
         workspace.save_mets()
 
 if __name__ == '__main__':

diff --git a/test/test_segment_word.py b/test/test_segment_word.py
@@ -4,9 +4,9 @@
 from test.base import TestCase, main, assets
 
 from ocrd import Resolver
-from ocrd_tesserocr.segment_region import TesserocrSegmentRegion
-from ocrd_tesserocr.segment_line import TesserocrSegmentLine
-from ocrd_tesserocr.segment_word import TesserocrSegmentWord
+from ocrd_tesserocr import TesserocrSegmentRegion
+from ocrd_tesserocr import TesserocrSegmentLine
+from ocrd_tesserocr import TesserocrSegmentWord
 
 #METS_HEROLD_SMALL = assets.url_of('SBB0000F29300010000/mets_one_file.xml')
 METS_HEROLD_SMALL = assets.url_of('kant_aufklaerung_1784-binarized/data/mets.xml')
@@ -23,9 +23,21 @@ def setUp(self):
     def runTest(self):
         resolver = Resolver()
         workspace = resolver.workspace_from_url(METS_HEROLD_SMALL, dst_dir=WORKSPACE_DIR)
-        TesserocrSegmentRegion(workspace, input_file_grp="OCR-D-IMG", output_file_grp="OCR-D-SEG-BLOCK").process()
-        TesserocrSegmentLine(workspace, input_file_grp="OCR-D-SEG-BLOCK", output_file_grp="OCR-D-SEG-LINE").process()
-        TesserocrSegmentWord(workspace, input_file_grp="OCR-D-SEG-LINE", output_file_grp="OCR-D-SEG-WORD").process()
+        TesserocrSegmentRegion(
+            workspace,
+            input_file_grp="OCR-D-IMG",
+            output_file_grp="OCR-D-SEG-BLOCK"
+        ).process()
+        TesserocrSegmentLine(
+            workspace,
+            input_file_grp="OCR-D-SEG-BLOCK",
+            output_file_grp="OCR-D-SEG-LINE"
+        ).process()
+        TesserocrSegmentWord(
+            workspace,
+            input_file_grp="OCR-D-SEG-LINE",
+            output_file_grp="OCR-D-SEG-WORD"
+        ).process()
         workspace.save_mets()
 
 if __name__ == '__main__':