wikimedia · stweil · Sep 22, 2023 · stweil · Sep 22, 2023 · samwilson
diff --git a/i18n/en.json b/i18n/en.json
@@ -32,6 +32,7 @@
     "report-issue": "Report an issue",
     "langs-placeholder": "Leave blank for automatic language detection.",
     "langs-param-error": "The following {{PLURAL:$1|language is|languages are}} not supported by the OCR engine: $2",
+    "normalize-ocr-text": "Normalize the text from OCR",
     "tesseract-options": "Tesseract options",
     "tesseract-psm-label": "Page segmentation method",
     "tesseract-psm-help": "Try \"Sparse text\" for better multi-column support.",

diff --git a/i18n/qqq.json b/i18n/qqq.json
@@ -39,6 +39,7 @@
 	"report-issue": "Link text in the footer for the issue-reporting link.",
 	"langs-placeholder": "Placeholder text for the language input field.",
 	"langs-param-error": "Error message displayed when invalid language(s) are submitted.\n\nParameters:\n* $1 – number of invalid languages\n* $2 - the list of invalid languages\n\nOCR is a common abbreviation in English for \"Optical Characters Recognition\".",
+	"normalize-ocr-text": "Label for the checkbox that enables normalization of the OCR text (replacing the long s and some other historic characters with their modern equivalents).",
 	"tesseract-options": "Heading for Tesseract-specific options.",
 	"tesseract-psm-label": "Form label for the Tesseract page segmentation mode.",
 	"tesseract-psm-help": "Help text for the Tesseract page segmentation mode option. 'Sparse text' refers to options, see messages:\n* {{msg-wm|Wikimedia-ocr-tesseract-psm-11}} and\n* {{msg-wm|Wikimedia-ocr-tesseract-psm-12}}.",

diff --git a/src/Controller/OcrController.php b/src/Controller/OcrController.php
@@ -60,6 +60,7 @@ class OcrController extends AbstractController {
 		'image' => '',
 		'engine' => self::DEFAULT_ENGINE,
 		'langs' => [],
+		'normalize' => false,
 		'psm' => TesseractEngine::DEFAULT_PSM,
 		'crop' => [],
 		'line_id' => TranskribusEngine::DEFAULT_LINEID,
@@ -112,6 +113,7 @@ private function setup(): void {
 		}
 		static::$params['langs'] = $this->getLangs( $this->request );
 		static::$params['image_hosts'] = $this->engine->getImageHosts();
+		static::$params['normalize'] = $this->request->query->getBoolean( 'normalize' );
 		$crop = $this->request->query->get( 'crop' );
 		if ( !is_array( $crop ) ) {
 			$crop = [];
@@ -228,6 +230,12 @@ public function homeAction(): Response {
 	 * @OA\JsonContent(type="array", @OA\Items(type="string"))
 	 * )
 	 * @OA\Parameter(
+	 *     name="normalize",
+	 *     in="query",
+	 *     description="Normalize OCR text.",
+	 * @OA\Schema(type="boolean")
+	 * )
+	 * @OA\Parameter(
 	 *     name="psm",
 	 *     in="query",
 	 *     description="The Page Segmentation Mode for Tesseract.",
@@ -365,6 +373,9 @@ private function getResult( string $invalidLangsMode ): EngineResult {
 		if ( !$result instanceof EngineResult ) {
 			throw new Exception( 'Incorrect (possibly cached) result: ' . var_export( $result, true ) );
 		}
+		if ( static::$params['normalize'] ) {
+			$result->normalize();
+		}
 		return $result;
 	}
 }
diff --git a/src/Engine/EngineResult.php b/src/Engine/EngineResult.php
@@ -35,4 +35,19 @@ public function getText(): string {
 	public function getWarnings(): array {
 		return $this->warnings;
 	}
+
+	/**
+	 * Replace some historic characters in the text with modern ones, modifying this result in place.
+	 */
+	public function normalize(): void {
+		$this->text = strtr( $this->text, [
+			'ſ' => 's', // long s
+			'ꝛ' => 'r', // r rotunda
+			'ℳ' => 'M', // script capital M
+			'aͤ' => 'ä', // base letter followed by a combining small e
+			'oͤ' => 'ö',
+			'uͤ' => 'ü',
+			'⸗' => '-', // double oblique hyphen
+		] );
+	}
 }
diff --git a/templates/output.html.twig b/templates/output.html.twig
@@ -56,6 +56,10 @@
                 </select>
                 {% include '_transkribus_help.html.twig' with {engine: engine} %}
             </div>
+            <div class="form-group">
+                <input type="checkbox" id="normalize" name="normalize" value="1">
+                <label for="normalize">{{ msg('normalize-ocr-text') }}</label>
+            </div>
         </fieldset>
 
         {% include '_tesseract_options.html.twig' with {engine: engine} %}