Skip to content

Commit

Permalink
Merge pull request #35 from bertsky/overwrite-lines
Browse files Browse the repository at this point in the history
improve consistency and quality of results
  • Loading branch information
bertsky authored May 20, 2019
2 parents 8bd69a6 + 56e8f1d commit 2e5778d
Show file tree
Hide file tree
Showing 16 changed files with 156 additions and 46 deletions.
3 changes: 2 additions & 1 deletion .travis.yml
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,8 @@ env:
matrix:
# - TESSERACT=3.04.01-1
- TESSERACT=3.05.02-3
- TESSERACT=4.0.0-beta.4-2
# - TESSERACT=4.0.0-beta.4-2
- TESSERACT=4.0.0-1

before_install:
# - export TESSERACT_INSTALL=$HOME/.tesseract
Expand Down
14 changes: 14 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,20 @@ Versioned according to [Semantic Versioning](http://semver.org/).

## Unreleased

## [0.2.2] - 2019-05-16

Changed:

* Add simple page cropping processor crop
* Respect border cropping in segment_word
* Add parameter overwrite_words in recognize
* Make higher TextEquivs consistent after recognize

Fixed:

* Remove invalid @externalRef from MetadataItem
* Retain pageId in output (i.e. link to structMap)

## [0.2.1] - 2019-02-28

Fixed:
Expand Down
15 changes: 10 additions & 5 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -39,17 +39,22 @@ help:
# (lib*-dev merely for building tesserocr with pip)
# (tesseract-ocr: Ubuntu 18.04 now ships 4.0.0,
# but some beta, not the final release,
# on which tesserocr 2.4.0 depends,
# but we can instead downgrade to 2.3.1;
# in the future, however, we might require
# a tesseract build from git)
# on which tesserocr 2.4.0 depends;
# this downloads a tesseract build from git
# and installs it system-wide -
# intended for dockerfile and travis,
# not recommended for live systems!)
deps-ubuntu:
sudo apt-get install -y \
libxml2-utils \
libimage-exiftool-perl \
libtesseract-dev \
libleptonica-dev \
tesseract-ocr tesseract-ocr-eng
tesseract-ocr-eng \
tesseract-ocr \
wget
wget -O - https://github.com/nijel/tesseract-ocr-build/releases/download/4.0.0-1/tesseract.tar.xz | tar -xJf -
sudo cp -rt /usr .tesseract/*

# Install python deps via pip
deps:
Expand Down
9 changes: 9 additions & 0 deletions ocrd_tesserocr/__init__.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,13 @@
import locale
# Circumvent tesseract-ocr issue #1670 by forcing a C locale before
# tesserocr gets imported by the submodules below.
# (This cannot be done on the command line, because Click requires
# a UTF-8 locale in Python 3.)
# pylint: disable=wrong-import-position
try:
    locale.setlocale(locale.LC_ALL, 'C.UTF-8')
except locale.Error:
    # 'C.UTF-8' is not installed on every platform; the plain 'C' locale
    # always exists, so fall back to it instead of making the whole
    # package unimportable.
    locale.setlocale(locale.LC_ALL, 'C')

from .recognize import TesserocrRecognize
from .segment_word import TesserocrSegmentWord
from .segment_line import TesserocrSegmentLine
from .segment_region import TesserocrSegmentRegion
from .crop import TesserocrCrop
1 change: 1 addition & 0 deletions ocrd_tesserocr/crop.py
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,7 @@ def process(self):
self.workspace.add_file(
ID=ID,
file_grp=self.output_file_grp,
pageId=input_file.pageId,
mimetype=MIMETYPE_PAGE,
local_filename='%s/%s' % (self.output_file_grp, ID),
content=to_xml(pcgts).encode('utf-8'),
Expand Down
5 changes: 5 additions & 0 deletions ocrd_tesserocr/ocrd-tool.json
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,11 @@
"default": "line",
"description": "PAGE XML hierarchy level to add the TextEquiv results to (requires existing layout annotation up to one level above that)"
},
"overwrite_words": {
"type": "boolean",
"default": false,
"description": "remove existing layout and text annotation below the TextLine level (regardless of textequiv_level)"
},
"model": {
"type": "string",
"description": "tessdata model to apply (an ISO 639-3 language specification or some other basename, e.g. deu-frak or Fraktur)"
Expand Down
89 changes: 68 additions & 21 deletions ocrd_tesserocr/recognize.py
Original file line number Diff line number Diff line change
@@ -1,25 +1,23 @@
from __future__ import absolute_import
import math

import locale
from tesserocr import (
RIL, PSM,
PyTessBaseAPI, get_languages,
Orientation, TextlineOrder, WritingDirection)

# pylint: disable=wrong-import-position
locale.setlocale(locale.LC_ALL, 'C') # circumvent tesseract-ocr issue 1670 (which cannot be done on command line because Click requires an UTF-8 locale in Python 3)

from tesserocr import RIL, PSM, PyTessBaseAPI, get_languages

from ocrd_utils import getLogger, concat_padded, xywh_from_points, points_from_x0y0x1y1, MIMETYPE_PAGE
from ocrd_modelfactory import page_from_file
from ocrd_utils import (
getLogger, concat_padded,
polygon_from_points, xywh_from_points, points_from_x0y0x1y1,
MIMETYPE_PAGE)
from ocrd_models.ocrd_page import (
CoordsType,
GlyphType,
LabelType,
LabelsType,
GlyphType, WordType,
LabelType, LabelsType,
MetadataItemType,
TextEquivType,
TextStyleType,

to_xml
)
TextEquivType, TextStyleType,
to_xml)
from ocrd_modelfactory import page_from_file
from ocrd import Processor
from .config import TESSDATA_PREFIX, OCRD_TOOL

Expand All @@ -37,10 +35,20 @@ def __init__(self, *args, **kwargs):
super(TesserocrRecognize, self).__init__(*args, **kwargs)

def process(self):
"""Perform OCR recognition with Tesseract on the workspace.
Open and deserialise PAGE input files and their respective images,
then iterate over the element hierarchy down to the requested
`textequiv_level`. If `overwrite_words` is enabled and any layout
annotation below the line level already exists, then remove it
(regardless of `textequiv_level`).
Set up Tesseract to recognise each segment's image rectangle with
the appropriate mode and `model`. Create new elements below the line
level if necessary. Put text results and confidence values into new
TextEquiv at `textequiv_level`, and make the higher levels consistent
with that (by concatenation joined by whitespace). Produce new output
files by serialising the resulting hierarchy.
"""
Performs the (text) recognition.
"""
# print(self.parameter)
log.debug("TESSDATA: %s, installed tesseract models: %s", *get_languages())
maxlevel = self.parameter['textequiv_level']
model = get_languages()[1][-1] # last installed model
Expand Down Expand Up @@ -108,7 +116,9 @@ def process(self):
MetadataItemType(type_="processingStep",
name=OCRD_TOOL['tools']['ocrd-tesserocr-recognize']['steps'][0],
value='ocrd-tesserocr-recognize',
Labels=[LabelsType(externalRef="parameters",
# FIXME: externalRef is invalid by pagecontent.xsd, but ocrd does not reflect this
# what we want here is `externalModel="ocrd-tool" externalId="parameters"`
Labels=[LabelsType(#externalRef="parameters",
Label=[LabelType(type_=name,
value=self.parameter[name])
for name in self.parameter.keys()])]))
Expand All @@ -117,10 +127,12 @@ def process(self):
if not regions:
log.warning("Page contains no text regions")
self._process_regions(regions, maxlevel, tessapi)
page_update_higher_textequiv_levels(maxlevel, pcgts)
ID = concat_padded(self.output_file_grp, n)
self.workspace.add_file(
ID=ID,
file_grp=self.output_file_grp,
pageId=input_file.pageId,
mimetype=MIMETYPE_PAGE,
local_filename='%s/%s' % (self.output_file_grp, ID),
content=to_xml(pcgts),
Expand Down Expand Up @@ -167,6 +179,8 @@ def _process_regions(self, regions, maxlevel, tessapi):

def _process_lines(self, textlines, maxlevel, tessapi):
for line in textlines:
if self.parameter['overwrite_words']:
line.set_Word([])
log.debug("Recognizing text in line '%s'", line.id)
line_xywh = xywh_from_points(line.get_Coords().points)
# log.debug("xywh: %s", line_xywh)
Expand Down Expand Up @@ -198,7 +212,7 @@ def _process_words_in_line(self, line, maxlevel, result_it):
log.error("No iterator at '%s'", line.id)
break
if result_it.Empty(RIL.WORD):
log.debug("No word here")
log.warning("No word in line '%s'", line.id)
break
word_id = '%s_word%04d' % (line.id, word_no)
log.debug("Recognizing text in word '%s'", word_id)
Expand Down Expand Up @@ -313,3 +327,36 @@ def _process_glyphs_in_word(self, word, result_it):
break
else:
result_it.Next(RIL.SYMBOL)

def _first_unicode(segment):
    '''Return the text of the first TextEquiv of `segment`, or u'' if there is none.'''
    textequivs = segment.get_TextEquiv()
    if not textequivs:
        return u''
    # NOTE(review): Unicode is presumably None on a TextEquiv that was
    # created without text; str.join() would raise TypeError on None,
    # so treat that case like a missing TextEquiv.
    return textequivs[0].Unicode or u''


def _set_joined_textequiv(segment, children, separator):
    '''Replace all TextEquivs of `segment` by one joining the first texts of `children`.'''
    text = separator.join(_first_unicode(child) for child in children)
    # Setting a fresh single-element list removes any old TextEquivs.
    segment.set_TextEquiv([TextEquivType(Unicode=text)])


def page_update_higher_textequiv_levels(level, pcgts):
    '''Update the TextEquivs of all PAGE-XML hierarchy levels above `level` for consistency.

    Starting with the hierarchy level chosen for processing, join the
    first TextEquiv of every element into a single new TextEquiv on its
    parent, replacing whatever TextEquivs the parent had before:

    - glyphs are concatenated without separator into their word,
    - words are joined by a space into their line,
    - lines are joined by a newline into their region.

    Elements without any TextEquiv contribute an empty string.

    Arguments:
        level: the level that was processed; the code distinguishes
            'region', 'line' and 'word', and treats any other value as
            the glyph level. With 'region', nothing is changed.
        pcgts: the PAGE document object; its text regions are modified
            in place.

    Returns None.
    '''
    regions = pcgts.get_Page().get_TextRegion()
    if level == 'region':
        return  # nothing above the region level to update
    for region in regions:
        lines = region.get_TextLine()
        if level != 'line':
            for line in lines:
                words = line.get_Word()
                if level != 'word':
                    for word in words:
                        _set_joined_textequiv(word, word.get_Glyph(), u'')
                _set_joined_textequiv(line, words, u' ')
        _set_joined_textequiv(region, lines, u'\n')
1 change: 1 addition & 0 deletions ocrd_tesserocr/segment_line.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,7 @@ def process(self):
self.workspace.add_file(
ID=ID,
file_grp=self.output_file_grp,
pageId=input_file.pageId,
mimetype=MIMETYPE_PAGE,
local_filename='%s/%s' % (self.output_file_grp, ID),
content=to_xml(pcgts).encode('utf-8'),
Expand Down
1 change: 1 addition & 0 deletions ocrd_tesserocr/segment_region.py
Original file line number Diff line number Diff line change
Expand Up @@ -66,6 +66,7 @@ def process(self):
self.workspace.add_file(
ID=ID,
file_grp=self.output_file_grp,
pageId=input_file.pageId,
mimetype=MIMETYPE_PAGE,
local_filename='%s/%s' % (self.output_file_grp, ID),
content=to_xml(pcgts).encode('utf-8'),
Expand Down
1 change: 1 addition & 0 deletions ocrd_tesserocr/segment_word.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,7 @@ def process(self):
self.workspace.add_file(
ID=ID,
file_grp=self.output_file_grp,
pageId=input_file.pageId,
local_filename='%s/%s' % (self.output_file_grp, ID),
mimetype=MIMETYPE_PAGE,
content=to_xml(pcgts).encode('utf-8'),
Expand Down
2 changes: 1 addition & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
ocrd >= 1.0.0b5
click
tesserocr == 2.3.1
tesserocr >= 2.3.1
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@

setup(
name='ocrd_tesserocr',
version='0.2.1',
version='0.2.2',
description='Tesserocr bindings',
long_description=codecs.open('README.rst', encoding='utf-8').read(),
author='Konstantin Baierer',
Expand Down
11 changes: 6 additions & 5 deletions test/test_recognize.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,13 +4,14 @@
from test.base import TestCase, main, assets, skip

from ocrd.resolver import Resolver
from ocrd_tesserocr.segment_word import TesserocrSegmentWord
from ocrd_tesserocr.segment_line import TesserocrSegmentLine
from ocrd_tesserocr.segment_region import TesserocrSegmentRegion
from ocrd_tesserocr.recognize import TesserocrRecognize
from ocrd_tesserocr import TesserocrSegmentWord
from ocrd_tesserocr import TesserocrSegmentLine
from ocrd_tesserocr import TesserocrSegmentRegion
from ocrd_tesserocr import TesserocrRecognize

#METS_HEROLD_SMALL = assets.url_of('SBB0000F29300010000/data/mets_one_file.xml')
METS_HEROLD_SMALL = assets.url_of('kant_aufklaerung_1784/data/mets.xml')
# as long as #96 remains, we cannot use workspaces which have local relative files:
METS_HEROLD_SMALL = assets.url_of('kant_aufklaerung_1784-binarized/data/mets.xml')

WORKSPACE_DIR = '/tmp/pyocrd-test-recognizer'

Expand Down
16 changes: 12 additions & 4 deletions test/test_segment_line.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,8 @@
from test.base import TestCase, main, assets

from ocrd.resolver import Resolver
from ocrd_tesserocr.segment_region import TesserocrSegmentRegion
from ocrd_tesserocr.segment_line import TesserocrSegmentLine
from ocrd_tesserocr import TesserocrSegmentRegion
from ocrd_tesserocr import TesserocrSegmentLine

METS_HEROLD_SMALL = assets.url_of('SBB0000F29300010000/data/mets_one_file.xml')

Expand All @@ -21,9 +21,17 @@ def setUp(self):
def runTest(self):
resolver = Resolver()
workspace = resolver.workspace_from_url(METS_HEROLD_SMALL, dst_dir=WORKSPACE_DIR)
TesserocrSegmentRegion(workspace, input_file_grp="INPUT", output_file_grp="OCR-D-SEG-BLOCK").process()
TesserocrSegmentRegion(
workspace,
input_file_grp="OCR-D-IMG",
output_file_grp="OCR-D-SEG-BLOCK"
).process()
# workspace.save_mets()
TesserocrSegmentLine(workspace, input_file_grp="OCR-D-SEG-BLOCK", output_file_grp="OCR-D-SEG-LINE").process()
TesserocrSegmentLine(
workspace,
input_file_grp="OCR-D-SEG-BLOCK",
output_file_grp="OCR-D-SEG-LINE"
).process()
workspace.save_mets()

if __name__ == '__main__':
Expand Down
8 changes: 6 additions & 2 deletions test/test_segment_region.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
from test.base import TestCase, main, assets

from ocrd.resolver import Resolver
from ocrd_tesserocr.segment_region import TesserocrSegmentRegion
from ocrd_tesserocr import TesserocrSegmentRegion

METS_HEROLD_SMALL = assets.url_of('SBB0000F29300010000/data/mets_one_file.xml')

Expand All @@ -20,7 +20,11 @@ def setUp(self):
def runTest(self):
resolver = Resolver()
workspace = resolver.workspace_from_url(METS_HEROLD_SMALL, dst_dir=WORKSPACE_DIR)
TesserocrSegmentRegion(workspace, input_file_grp="INPUT", output_file_grp="OCR-D-SEG-BLOCK").process()
TesserocrSegmentRegion(
workspace,
input_file_grp="OCR-D-IMG",
output_file_grp="OCR-D-SEG-BLOCK"
).process()
workspace.save_mets()

if __name__ == '__main__':
Expand Down
24 changes: 18 additions & 6 deletions test/test_segment_word.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,9 +4,9 @@
from test.base import TestCase, main, assets

from ocrd import Resolver
from ocrd_tesserocr.segment_region import TesserocrSegmentRegion
from ocrd_tesserocr.segment_line import TesserocrSegmentLine
from ocrd_tesserocr.segment_word import TesserocrSegmentWord
from ocrd_tesserocr import TesserocrSegmentRegion
from ocrd_tesserocr import TesserocrSegmentLine
from ocrd_tesserocr import TesserocrSegmentWord

#METS_HEROLD_SMALL = assets.url_of('SBB0000F29300010000/mets_one_file.xml')
METS_HEROLD_SMALL = assets.url_of('kant_aufklaerung_1784-binarized/data/mets.xml')
Expand All @@ -23,9 +23,21 @@ def setUp(self):
def runTest(self):
resolver = Resolver()
workspace = resolver.workspace_from_url(METS_HEROLD_SMALL, dst_dir=WORKSPACE_DIR)
TesserocrSegmentRegion(workspace, input_file_grp="OCR-D-IMG", output_file_grp="OCR-D-SEG-BLOCK").process()
TesserocrSegmentLine(workspace, input_file_grp="OCR-D-SEG-BLOCK", output_file_grp="OCR-D-SEG-LINE").process()
TesserocrSegmentWord(workspace, input_file_grp="OCR-D-SEG-LINE", output_file_grp="OCR-D-SEG-WORD").process()
TesserocrSegmentRegion(
workspace,
input_file_grp="OCR-D-IMG",
output_file_grp="OCR-D-SEG-BLOCK"
).process()
TesserocrSegmentLine(
workspace,
input_file_grp="OCR-D-SEG-BLOCK",
output_file_grp="OCR-D-SEG-LINE"
).process()
TesserocrSegmentWord(
workspace,
input_file_grp="OCR-D-SEG-LINE",
output_file_grp="OCR-D-SEG-WORD"
).process()
workspace.save_mets()

if __name__ == '__main__':
Expand Down

0 comments on commit 2e5778d

Please sign in to comment.