Adding annotations

internetarchive · May 23, 2024 · 43c8939 · 43c8939
1 parent e3ef404
commit 43c8939
Show file tree

Hide file tree

Showing 3 changed files with 115 additions and 2 deletions.
diff --git a/iiify/app.py b/iiify/app.py
@@ -8,7 +8,7 @@
 from flask_caching import Cache
 from iiif2 import iiif, web
 from .resolver import ia_resolver, create_manifest, create_manifest3, getids, collection, \
-    purify_domain, cantaloupe_resolver, create_collection3, IsCollection
+    purify_domain, cantaloupe_resolver, create_collection3, IsCollection, create_annotations
 from .configs import options, cors, approot, cache_root, media_root, \
     cache_expr, version, image_server, cache_timeouts
 from urllib.parse import quote
@@ -191,6 +191,11 @@ def manifest3(identifier):
         raise excpt
         # abort(404)
 
+@app.route('/iiif/<version>/annotations/<identifier>/<fileName>/<canvas_no>.json')
+@cache.cached(timeout=cache_timeouts["long"], forced_update=cache_bust)
+def annnotations(version, identifier, fileName, canvas_no):
+    """Return the annotations built for one canvas of an item.
+
+    All four URL segments are forwarded unchanged to ``create_annotations``
+    and its result is returned through ``ldjsonify``.
+    """
+    # A ``domain`` query argument, when given, takes precedence over the
+    # request's own URL root.
+    requested_domain = request.args.get('domain', request.url_root)
+    annotation_page = create_annotations(
+        version,
+        identifier,
+        fileName,
+        canvas_no,
+        domain=purify_domain(requested_domain),
+    )
+    return ldjsonify(annotation_page)
 
 @app.route('/iiif/<identifier>/manifest.json')
 @cache.cached(timeout=cache_timeouts["long"], forced_update=cache_bust)

diff --git a/iiify/resolver.py b/iiify/resolver.py
@@ -9,6 +9,7 @@
 import json
 import math 
 import re
+import xml.etree.ElementTree as ET
 
 IMG_CTX = 'http://iiif.io/api/image/2/context.json'
 PRZ_CTX = 'http://iiif.io/api/presentation/2/context.json'
@@ -406,9 +407,12 @@ def create_manifest3(identifier, domain=None, page=None):
         # subprefix can be different from the identifier use the scandata filename to find the correct prefix
         # if not present fall back to identifier
         subprefix = identifier
+        djvuFile = ""
         for fileMd in metadata['files']:
             if fileMd['name'].endswith('_scandata.xml'):
                 subprefix = fileMd['name'].replace('_scandata.xml', '')
+            if fileMd['format'] == 'Djvu XML':    
+                djvuFile = fileMd['name']
 
         bookReaderURL = f"https://{metadata.get('server')}/BookReader/BookReaderJSIA.php?id={identifier}&itemPath={metadata.get('dir')}&server={metadata.get('server')}&format=jsonp&subPrefix={subprefix}"
 
@@ -469,7 +473,21 @@ def create_manifest3(identifier, domain=None, page=None):
             except:
                 pass
 
-
+        # Add annotations if djvu file is present
+        if djvuFile:
+            count = 1
+            for canvas in manifest.items:
+                if 'annotations' in canvas:
+                    annotations = canvas.annotations
+                else:
+                    annotations = []
+
+                annotations.append({
+                    "id": f"{domain}3/annotations/{identifier}/{djvuFile}/{count}.json",
+                    "type": "AnnotationPage"
+                })         
+                canvas.annotations = annotations
+                count += 1
     elif mediatype == 'image':
         (multiFile, format) = checkMultiItem(metadata)
         print (f"Checking multiFile {multiFile} {format}")
@@ -613,6 +631,44 @@ def create_manifest3(identifier, domain=None, page=None):
 
     return json.loads(manifest.jsonld())
 
+def create_annotations(version, identifier, fileName, canvas_no, domain=None):
+    """Build an AnnotationPage with one annotation per WORD on a page.
+
+    The file ``fileName`` of item ``identifier`` is downloaded from
+    ``ARCHIVE`` and parsed as XML. The ``canvas_no``-th ``OBJECT`` element
+    (1-based) is used as the page, and every ``WORD`` element below it
+    becomes a ``supplementing`` annotation whose body is the word text and
+    whose target fragment is the word's ``coords`` attribute.
+
+    Args:
+        version: Version path segment used in the AnnotationPage id.
+        identifier: Item identifier.
+        fileName: Name of the XML file inside the item.
+        canvas_no: 1-based page number, as an int or a numeric string.
+        domain: URL prefix used to build the AnnotationPage id.
+
+    Returns:
+        The AnnotationPage as a dict decoded from its JSON-LD serialisation.
+
+    Raises:
+        ValueError: If ``canvas_no`` is not a positive integer, if the file
+            cannot be fetched or parsed, or if it contains no such page.
+    """
+    page_no = int(canvas_no)  # ValueError for non-numeric input
+    if page_no < 1:
+        raise ValueError(f"Invalid canvas number {canvas_no}")
+    index = page_no - 1  # canvas ids are zero-based
+
+    annotationPage = AnnotationPage(id=f"{domain}{version}/annotations/{identifier}/{fileName}/{canvas_no}.json")
+    annotationPage.items = []
+    url = f"{ARCHIVE}/download/{identifier}/{fileName}"
+    try:
+        # Fetch the remote XML file; the timeout keeps a stalled download
+        # from blocking the request forever.
+        response = requests.get(url, timeout=30)
+        response.raise_for_status()  # Raise an error for bad status codes
+
+        # Parse the XML content
+        djvu = ET.fromstring(response.content)
+    except requests.exceptions.RequestException as e:
+        print(f"Error fetching the XML file: {e}")
+        raise ValueError(f"Failed to retrieve {url}") from e
+    except ET.ParseError as e:
+        print(f"Error parsing the XML content: {e}")
+        raise ValueError(f"Failed to process {url}") from e
+
+    # Use the validated integer in the XPath position predicate rather than
+    # the raw request value.
+    pages = djvu.findall(f".//OBJECT[{page_no}]")
+    if not pages:
+        raise ValueError(f"Page {canvas_no} not found in {url}")
+
+    # NOTE(review): ``coords`` is copied verbatim into the ``xywh`` fragment;
+    # confirm the attribute really is x,y,width,height and not corner points.
+    for count, word in enumerate(pages[0].findall(".//WORD"), start=1):
+        annotationPage.items.append({
+            "id": f"https://iiif.archive.org/iiif/{identifier}/canvas/{index}/anno/{count}",
+            "type": "Annotation",
+            "motivation": "supplementing",
+            "body": {
+                "type": "TextualBody",
+                "format": "text/plain",
+                "value": word.text
+            },
+            "target": f"https://iiif.archive.org/iiif/{identifier}${index}/canvas#xywh={word.attrib['coords']}"
+        })
+
+    return json.loads(annotationPage.jsonld())
+
+
 def coerce_list(value):
     if isinstance(value, list):
         return ". ".join(value)

diff --git a/tests/test_annotations.py b/tests/test_annotations.py
@@ -0,0 +1,52 @@
+import unittest
+from flask.testing import FlaskClient
+from iiify.app import app
+
+class TestAnnotations(unittest.TestCase):
+    """Checks the v3 manifest annotation links and the annotations endpoint."""
+
+    def setUp(self) -> None:
+        self.test_app = FlaskClient(app)
+
+    def test_v3_manifest_has_annotations(self):
+        """Every canvas of the manifest links to its own annotations URL."""
+        resp = self.test_app.get("/iiif/3/journalofexpedit00ford/manifest.json?recache=true")
+        self.assertEqual(resp.status_code, 200)
+        manifest = resp.json
+
+        # Annotation URLs are numbered from 1 in canvas order.
+        for count, canvas in enumerate(manifest['items'], start=1):
+            self.assertIn('annotations', canvas, f"Expected annotations in canvas {canvas['id']}")
+            annotations_url = f"https://localhost/iiif/3/annotations/journalofexpedit00ford/journalofexpedit00ford_djvu.xml/{count}.json"
+            found = any(anno['id'] == annotations_url for anno in canvas['annotations'])
+            self.assertTrue(found, f"Expected to find {annotations_url} in {canvas['annotations']}")
+
+    def test_v3_annotations(self):
+        """The first page yields a well-formed AnnotationPage of six words."""
+        resp = self.test_app.get("/iiif/3/annotations/journalofexpedit00ford/journalofexpedit00ford_djvu.xml/1.json?recache=true")
+        self.assertEqual(resp.status_code, 200)
+        annotations = resp.json
+
+        self.assertEqual(annotations['id'], "https://localhost/iiif/3/annotations/journalofexpedit00ford/journalofexpedit00ford_djvu.xml/1.json", "Unexpected id")
+        self.assertEqual(annotations['@context'], "http://iiif.io/api/presentation/3/context.json", "Unexpected context")
+        self.assertEqual(annotations['type'], "AnnotationPage", "Unexpected type, expected AnnotationPage")
+        annotationList = annotations['items']
+        self.assertEqual(len(annotationList), 6, "Unexpected number of annotations")
+
+        ids = set()
+        for position, anno in enumerate(annotationList):
+            self.assertNotIn(anno['id'], ids, f"Duplicate ID: {anno['id']}")
+            ids.add(anno['id'])
+            self.assertEqual(anno['type'], "Annotation", "Expected type of Annotation")
+            self.assertEqual(anno['motivation'], "supplementing", "Expected motivation of supplementing")
+            self.assertTrue("body" in anno and "target" in anno, f"Body or target missing from annotation {anno}")
+            self.assertEqual(anno['body']['type'], "TextualBody", "Expected body to be a TextualBody")
+            self.assertEqual(anno['body']['format'], "text/plain", "Expected format to be a text/plain")
+            self.assertEqual(anno['target'].split('#')[0], "https://iiif.archive.org/iiif/journalofexpedit00ford$0/canvas")
+            # Only the first word's exact region and text are pinned.
+            if position == 0:
+                self.assertEqual(anno['target'].split('#')[1], "xywh=592,1860,1052,1742")
+                self.assertEqual(anno['body']['value'], "JOURNAL ")