Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Adding annotations from DjVu file #72

Closed
wants to merge 12 commits into from
7 changes: 6 additions & 1 deletion iiify/app.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
from flask_caching import Cache
from iiif2 import iiif, web
from .resolver import ia_resolver, create_manifest, create_manifest3, getids, collection, \
purify_domain, cantaloupe_resolver, create_collection3, IsCollection
purify_domain, cantaloupe_resolver, create_collection3, IsCollection, create_annotations
from .configs import options, cors, approot, cache_root, media_root, \
cache_expr, version, image_server, cache_timeouts
from urllib.parse import quote
Expand Down Expand Up @@ -191,6 +191,11 @@ def manifest3(identifier):
raise excpt
# abort(404)

@app.route('/iiif/<version>/annotations/<identifier>/<fileName>/<canvas_no>.json')
@cache.cached(timeout=cache_timeouts["long"], forced_update=cache_bust)
def annnotations(version, identifier, fileName, canvas_no):
    """Serve the annotation page for a single canvas of an item.

    The URL segments are forwarded unchanged to ``create_annotations``. The
    domain comes from the ``domain`` query parameter, falling back to the
    request's URL root, and is passed through ``purify_domain`` first. The
    result is returned via ``ldjsonify``.
    """
    requested_domain = request.args.get('domain', request.url_root)
    annotation_page = create_annotations(
        version,
        identifier,
        fileName,
        canvas_no,
        domain=purify_domain(requested_domain),
    )
    return ldjsonify(annotation_page)

@app.route('/iiif/<identifier>/manifest.json')
@cache.cached(timeout=cache_timeouts["long"], forced_update=cache_bust)
Expand Down
107 changes: 105 additions & 2 deletions iiify/resolver.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
import json
import math
import re
import xml.etree.ElementTree as ET

IMG_CTX = 'http://iiif.io/api/image/2/context.json'
PRZ_CTX = 'http://iiif.io/api/presentation/2/context.json'
Expand Down Expand Up @@ -37,6 +38,28 @@ def getids(q, limit=1000, cursor=''):
}, allow_redirects=True, timeout=None)
return r.json()

def checkMultiItem(metadata):
    """Report whether an item holds several original files of one format.

    Files in ``metadata['files']`` whose ``source`` is ``"original"`` are
    tallied per ``format``. The first format (in order of first appearance)
    that occurs more than once and whose lower-cased name is listed in
    ``valid_filetypes`` is returned as ``(True, format)``; if there is no
    such format the result is ``(False, None)``.
    """
    # Maybe add call to book stack to see if that works first

    # Tally how many original files exist for each format.
    originals_per_format = {}
    for entry in metadata['files']:
        if entry['source'] != "original":
            continue
        fmt = entry['format']
        originals_per_format[fmt] = originals_per_format.get(fmt, 0) + 1

    # When several files share a type, report the first such format.
    # Will have to see if there are objects with multiple images and formats
    for fmt, total in originals_per_format.items():
        if total > 1 and fmt.lower() in valid_filetypes:
            return (True, fmt)

    return (False, None)


def to_mimetype(format):
formats = {
"VBR MP3": "audio/mp3",
Expand Down Expand Up @@ -363,6 +386,7 @@ def addMetadata(item, identifier, metadata, collection=False):
item.metadata = manifest_metadata



def create_manifest3(identifier, domain=None, page=None):
# Get item metadata
metadata = requests.get('%s/metadata/%s' % (ARCHIVE, identifier)).json()
Expand All @@ -383,9 +407,12 @@ def create_manifest3(identifier, domain=None, page=None):
# subprefix can be different from the identifier use the scandata filename to find the correct prefix
# if not present fall back to identifier
subprefix = identifier
djvuFile = ""
for fileMd in metadata['files']:
if fileMd['name'].endswith('_scandata.xml'):
subprefix = fileMd['name'].replace('_scandata.xml', '')
if fileMd['format'] == 'Djvu XML':
djvuFile = fileMd['name']

bookReaderURL = f"https://{metadata.get('server')}/BookReader/BookReaderJSIA.php?id={identifier}&itemPath={metadata.get('dir')}&server={metadata.get('server')}&format=jsonp&subPrefix={subprefix}"

Expand Down Expand Up @@ -446,9 +473,47 @@ def create_manifest3(identifier, domain=None, page=None):
except:
pass


# Add annotations if djvu file is present
if djvuFile:
count = 1
for canvas in manifest.items:
if 'annotations' in canvas:
annotations = canvas.annotations
else:
annotations = []

annotations.append({
"id": f"{domain}3/annotations/{identifier}/{djvuFile}/{count}.json",
"type": "AnnotationPage"
})
canvas.annotations = annotations
count += 1
elif mediatype == 'image':
singleImage(metadata, identifier, manifest, uri)
(multiFile, format) = checkMultiItem(metadata)
print (f"Checking multiFile {multiFile} {format}")
if multiFile:
# Create multi file manifest
pageCount = 0
for file in metadata['files']:
if file['source'] == "original" and file['format'] == format:
imgId = f"{identifier}/{file['name']}".replace('/','%2f')
imgURL = f"{IMG_SRV}/3/{imgId}"
pageCount += 1

try:
manifest.make_canvas_from_iiif(url=imgURL,
id=f"{URI_PRIFIX}/{identifier}${pageCount}/canvas",
label=f"{file['name']}",
anno_page_id=f"{uri}/annotationPage/1",
anno_id=f"{uri}/annotation/1")
except requests.exceptions.HTTPError as error:
print (f'Failed to get {imgURL}')
manifest.make_canvas(label=f"Failed to load {file['name']} from Image Server",
summary=f"Got {error}",
id=f"{URI_PRIFIX}/{identifier}/canvas",
height=1800, width=1200)
else:
singleImage(metadata, identifier, manifest, uri)
elif mediatype == 'audio' or mediatype == 'etree':
# sort the files into originals and derivatives, splitting the derivatives into buckets based on the original
originals = []
Expand Down Expand Up @@ -566,6 +631,44 @@ def create_manifest3(identifier, domain=None, page=None):

return json.loads(manifest.jsonld())

def create_annotations(version, identifier, fileName, canvas_no, domain=None):
    """Build an annotation page holding one annotation per word of a page.

    The XML file ``fileName`` is downloaded from the item's download
    directory under ``ARCHIVE``. The ``canvas_no``-th ``OBJECT`` element is
    selected and every ``WORD`` element below it becomes a ``supplementing``
    annotation with a plain-text body.

    Args:
        version: API version segment used in the annotation page id.
        identifier: Item identifier.
        fileName: Name of the XML file inside the item.
        canvas_no: 1-based canvas number (string or int).
        domain: Prefix for the annotation page id.

    Returns:
        The annotation page as parsed JSON (``json.loads`` of its JSON-LD).

    Raises:
        ValueError: If ``canvas_no`` is not an integer of at least 1, the
            file cannot be downloaded, or its content is not valid XML.
        IndexError: If the document has no ``OBJECT`` at that position.
    """
    annotationPage = AnnotationPage(id=f"{domain}{version}/annotations/{identifier}/{fileName}/{canvas_no}.json")
    annotationPage.items = []
    index = int(canvas_no) - 1
    if index < 0:
        # ElementTree rejects positions below 1 with a SyntaxError; report a
        # bad canvas number the same way int() reports a non-numeric one.
        raise ValueError(f"Canvas number must be 1 or greater, got {canvas_no}")

    url = f"{ARCHIVE}/download/{identifier}/{fileName}"
    try:
        # Fetch the remote XML file
        response = requests.get(url)
        response.raise_for_status()  # Raise an error for bad status codes

        # Parse the XML content
        djvu = ET.fromstring(response.content)
    except requests.exceptions.RequestException as e:
        print(f"Error fetching the XML file: {e}")
        raise ValueError(f"Failed to retrieve {url}") from e
    except ET.ParseError as e:
        print(f"Error parsing the XML content: {e}")
        raise ValueError(f"Failed to process {url}") from e

    # Use the validated integer, not the raw request string, in the XPath.
    page = djvu.findall(f".//OBJECT[{index + 1}]")[0]
    for count, word in enumerate(page.findall(".//WORD"), start=1):
        # NOTE(review): the coords attribute is used verbatim as the xywh
        # fragment - confirm the file stores x,y,width,height in that order.
        annotationPage.items.append({
            "id": f"https://iiif.archive.org/iiif/{identifier}/canvas/{index}/anno/{count}",
            "type": "Annotation",
            "motivation": "supplementing",
            "body": {
                "type": "TextualBody",
                "format": "text/plain",
                "value": word.text
            },
            "target": f"https://iiif.archive.org/iiif/{identifier}${index}/canvas#xywh={word.attrib['coords']}"
        })

    return json.loads(annotationPage.jsonld())

def coerce_list(value):
if isinstance(value, list):
return ". ".join(value)
Expand Down
52 changes: 52 additions & 0 deletions tests/test_annotations.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
import unittest
from flask.testing import FlaskClient
from iiify.app import app

class TestAnnotations(unittest.TestCase):
    """Checks for the v3 annotation links and the annotation page endpoint."""

    def setUp(self) -> None:
        self.test_app = FlaskClient(app)

    def test_v3_manifest_has_annotations(self):
        """Every canvas of the manifest must link to its own annotation page."""
        resp = self.test_app.get("/iiif/3/journalofexpedit00ford/manifest.json?recache=true")
        self.assertEqual(resp.status_code, 200)
        manifest = resp.json

        # Annotation page URLs are numbered from 1 in canvas order.
        for count, canvas in enumerate(manifest['items'], start=1):
            self.assertTrue('annotations' in canvas, f"Expected annotations in canvas {canvas['id']}")
            annotations_url = f"https://localhost/iiif/3/annotations/journalofexpedit00ford/journalofexpedit00ford_djvu.xml/{count}.json"
            found = any(anno['id'] == annotations_url for anno in canvas['annotations'])
            self.assertTrue(found, f"Expected to find {annotations_url} in {canvas['annotations']}")

    def test_v3_annotations(self):
        """The annotation page for canvas 1 has the expected shape and content."""
        resp = self.test_app.get("/iiif/3/annotations/journalofexpedit00ford/journalofexpedit00ford_djvu.xml/1.json?recache=true")
        self.assertEqual(resp.status_code, 200)
        annotations = resp.json

        self.assertEqual(annotations['id'], "https://localhost/iiif/3/annotations/journalofexpedit00ford/journalofexpedit00ford_djvu.xml/1.json", "Unexpected id")
        self.assertEqual(annotations['@context'], "http://iiif.io/api/presentation/3/context.json", "Unexpected context")
        self.assertEqual(annotations['type'], "AnnotationPage", "Unexpected type, expected AnnotationPage")
        annotationList = annotations['items']
        self.assertEqual(len(annotationList), 6, "Unexpected number of annotations")

        ids = set()
        for position, anno in enumerate(annotationList):
            self.assertTrue(anno['id'] not in ids, f"Duplicate ID: {anno['id']}")
            ids.add(anno['id'])
            self.assertEqual(anno['type'], "Annotation", "Expected type of Annotation")
            self.assertEqual(anno['motivation'], "supplementing", "Expected motivation of supplementing")
            self.assertTrue("body" in anno and "target" in anno, f"Body or target missing from annotation {anno}")
            self.assertEqual(anno['body']['type'], "TextualBody", "Expected body to be a TextualBody")
            self.assertEqual(anno['body']['format'], "text/plain", "Expected format to be a text/plain")
            self.assertEqual(anno['target'].split('#')[0], "https://iiif.archive.org/iiif/journalofexpedit00ford$0/canvas")
            # Only the first annotation has its region and text pinned.
            if position == 0:
                self.assertEqual(anno['target'].split('#')[1], "xywh=592,1860,1052,1742")
                self.assertEqual(anno['body']['value'], "JOURNAL ")
16 changes: 16 additions & 0 deletions tests/test_manifests.py
Original file line number Diff line number Diff line change
Expand Up @@ -144,6 +144,22 @@ def test_metadata_array(self):
manifest = resp.json
self.assertTrue(len(manifest['summary']['none']) > 1, f"Expected multiple summary values, but got {manifest['summary']['none']}")

def test_multi_file_image(self):
    """A multi-image item yields three canvases whose ids differ from the first."""
    resp = self.test_app.get("/iiif/3/arkivkopia.se-lms-G70-48.3/manifest.json")
    self.assertEqual(resp.status_code, 200)
    canvases = resp.json['items']
    self.assertEqual(len(canvases),3, f"Expected three canvases, but got {len(canvases)}")

    # Every later canvas is compared against the id of the first one.
    firstCanvasId = canvases[0]['id']
    for canvas in canvases[1:]:
        self.assertNotEqual(canvas['id'], firstCanvasId, 'Canvas Ids need to be unique')

def test_multi_file(self):
    """The manifest of a multi-file item has the expected number of canvases."""
    resp = self.test_app.get("/iiif/3/st-anthony-relics-01/manifest.json")
    self.assertEqual(resp.status_code, 200)
    manifest = resp.json
    # The failure message is derived from the asserted value so the two
    # cannot disagree (it previously said "five" while checking for 6).
    expected_canvases = 6
    self.assertEqual(
        len(manifest['items']),
        expected_canvases,
        f"Expected {expected_canvases} canvases, but got {len(manifest['items'])}",
    )


''' to test:
kaled_jalil (no derivatives)
Expand Down
Loading