Adding list of renderings and seeAlsos

internetarchive · May 23, 2024 · a1900d9 · a1900d9
1 parent 6238b9b
commit a1900d9
Show file tree

Hide file tree

Showing 4 changed files with 272 additions and 24 deletions.
diff --git a/iiify/configs/__init__.py b/iiify/configs/__init__.py
@@ -14,6 +14,7 @@
 import sys
 import types
 import configparser
+import json
 
 path = os.path.dirname(os.path.realpath(__file__))
 approot = os.path.abspath(os.path.join(path, os.pardir))
@@ -75,3 +76,6 @@ def getdef(self, section, option, default_value):
     "long": 432000,  # 5 days
     "longest": 2592000  # 30 days
 }
+
+# Static table keyed by an archive.org file "format" label; each entry names
+# the IIIF property it is published under ("rendering" or "seeAlso") together
+# with the "type" and MIME "format" to advertise for that file.
+# The encoding is explicit so decoding does not depend on the host locale.
+with open(os.path.join(path, 'links.json'), encoding='utf-8') as file:
+    LINKS = json.load(file)
diff --git a/iiify/configs/links.json b/iiify/configs/links.json
@@ -0,0 +1,137 @@
+{
+    "Animated GIF": {
+        "field": "rendering",
+        "type": "Image",
+        "format": "image/gif"
+    },
+    "Text PDF": {
+	    "field": "rendering",
+        "type": "Text",
+        "format": "application/pdf"
+    },
+    "Abbyy GZ": {
+        "field": "rendering",
+        "type": "Dataset",
+        "format": "application/gzip"
+    },
+    "Archive BitTorrent": {
+        "field": "rendering",
+        "type": "Dataset",
+        "format": "application/x-bittorrent"
+    },
+    "Grayscale PDF": {
+        "field": "rendering",
+        "type": "Text",
+        "format": "application/pdf"
+    },
+    "chOCR": {
+        "field": "rendering",
+        "type": "Text",
+        "format": "application/gzip"
+    },
+    "DjVuTXT": {
+        "field": "rendering",
+        "type": "Text",
+        "format": "text/plain"
+    },
+    "Djvu XML": {
+        "field": "rendering",
+        "type": "Dataset",
+        "format": "application/xml"
+    },
+    "hOCR": {
+        "field": "rendering",
+        "type": "Text",
+        "format": "text/html"
+    },
+    "Single Page Processed JP2 ZIP": {
+        "field": "rendering",
+        "type": "Image",
+        "format": "application/zip"
+    },
+    "OCR Search Text": {
+        "field": "rendering",
+        "type": "Text",
+        "format": "application/gzip"
+    },
+    "Single Page Original JP2 Tar": {
+        "field": "rendering",
+        "type": "Image",
+        "format": "application/x-tar"
+    },
+    "DjVu": {
+        "field": "rendering",
+        "type": "Image",
+        "format": "image/vnd.djvu"
+    },
+    "Cloth Cover Detection Log": {
+        "field": "seeAlso",
+        "type": "Text",
+        "format": "text/plain"
+    },
+    "Dublin Core": {
+        "field": "seeAlso",
+        "type": "Dataset",
+        "format": "application/xml"
+    },
+    "OCR Page Index": {
+        "field": "seeAlso",
+        "type": "Dataset",
+        "format": "application/json"
+    },
+    "MARC": {
+        "field": "seeAlso",
+        "type": "Dataset",
+        "format": "application/xml"
+    },
+    "MARC Binary": {
+        "field": "seeAlso",
+        "type": "Dataset",
+        "format": "application/marc"
+    },
+    "MARC Source": {
+        "field": "seeAlso",
+        "type": "Dataset",
+        "format": "application/xml"
+    },
+    "Page Numbers JSON": {
+        "field": "seeAlso",
+        "type": "Dataset",
+        "format": "application/json"
+    },
+    "Scandata": {
+        "field": "seeAlso",
+        "type": "Dataset",
+        "format": "application/xml"
+    },
+    "SubRip": {
+        "field": "rendering",
+        "type": "Text",
+        "format": "text/plain"
+    },
+    "Web Video Text Tracks": {
+        "field": "rendering",
+        "type": "Text",
+        "format": "text/vtt"
+    },
+    "Intermediate ASR JSON": {
+        "field": "rendering",
+        "type": "Text",
+        "format": "application/json"
+    },
+    "Whisper ASR JSON": {
+        "field": "rendering",
+        "type": "Text",
+        "format": "application/json"
+    },
+    "Storj Upload Log": {
+        "field": "seeAlso",
+        "type": "Text",
+        "format": "text/plain"
+    },
+    "Storj Upload Trigger": {
+        "field": "seeAlso",
+        "type": "Text",
+        "format": "text/plain"
+    }
+}
diff --git a/iiify/resolver.py b/iiify/resolver.py
@@ -3,17 +3,16 @@
 import os
 import requests
 from iiif2 import iiif, web
-from .configs import options, cors, approot, cache_root, media_root, apiurl
+from .configs import options, cors, approot, cache_root, media_root, apiurl, LINKS
 from iiif_prezi3 import Manifest, config, Annotation, AnnotationPage, Canvas, Manifest, ResourceItem, ServiceItem, Choice, Collection, ManifestRef, CollectionRef
 from urllib.parse import urlparse, parse_qs, quote
 import json
 import math 
 import re
-import mimetypes
 
 IMG_CTX = 'http://iiif.io/api/image/2/context.json'
 PRZ_CTX = 'http://iiif.io/api/presentation/2/context.json'
-ARCHIVE = 'http://archive.org'
+ARCHIVE = 'https://archive.org'
 IMG_SRV = 'https://iiif.archive.org/image/iiif'
 METADATA_FIELDS = ("title", "volume", "publisher", "subject", "date", "contributor", "creator")
 bookdata = 'http://%s/BookReader/BookReaderJSON.php'
@@ -387,8 +386,6 @@ def addMetadata(item, identifier, metadata, collection=False):
 
 def addSeeAlso(manifest, identifier, files):
 
-    mimetypes.add_type("application/gzip", ".gz")
-
     manifest.seeAlso = [
         {"id": f"{ARCHIVE}/metadata/{identifier}",
          "type": "Metadata",
@@ -407,36 +404,47 @@ def addSeeAlso(manifest, identifier, files):
     }
 
     for file in files:
-        if file['format'] in SEEALSO_TYPES:
+        if file['format'] in LINKS and LINKS[file['format']]['field'] == 'seeAlso':
+            seeAlso = LINKS[file['format']]
             manifest.seeAlso.append(
                 {"id": f"{ARCHIVE}/download/{identifier}/{file['name']}",
-                 "type": SEEALSO_TYPES[file["format"]],
+                 "type": seeAlso['type'],
                  "label": {"en": [file["format"]]},
-                 "format": mimetypes.types_map.get(f".{file['name'].rsplit('.', 1)[1]}", "application/octet-stream")
+                 "format": seeAlso['format']
                  })
 
 
 def addRendering(manifest, identifier, files):
+    """Fill ``manifest.rendering`` from the item's file list.
+
+    ``manifest.rendering`` is reset to an empty list, then one entry is
+    appended for every file whose ``format`` label is a key of LINKS with
+    ``field`` equal to ``rendering``. The entry's id is the file's
+    archive.org download URL, its label is the format name, and its type and
+    format are taken from the LINKS entry.
+    """
-    RENDERING_TYPES = {
-        "Item Tile": "Image",
-        "Text PDF": "PDF",
-        "Animated GIF": "Image",
-        "DjVuTXT": "Text",
-        "Generic Raw Book Zip": "Images",
-        "Single Page Processed JP2 Zip": "Images",
-    }
-
     manifest.rendering = []
 
     for file in files:
-        if file['format'] in RENDERING_TYPES:
+        if file['format'] in LINKS and LINKS[file['format']]['field'] == 'rendering':
+            rendering = LINKS[file['format']]
             manifest.rendering.append(
                 {"id": f"{ARCHIVE}/download/{identifier}/{file['name']}",
-                 "type": RENDERING_TYPES[file["format"]],
+                 "type": rendering['type'],
                  "label": {"en": [file["format"]]},
-                 "format": mimetypes.guess_type(file["name"])[0]
+                 "format": rendering['format']
                  })
 
+def addThumbnails(manifest, identifier, files):
+    """Attach the item's "Thumbnail"-format files to the manifest.
+
+    Each matching file becomes an Image entry whose id is its archive.org
+    download URL. The format is "image/png" when the file name ends in
+    ".png" and "image/jpeg" otherwise. ``manifest.thumbnail`` is assigned
+    only when at least one thumbnail file was found.
+    """
+    found = [
+        {
+            "id": f"{ARCHIVE}/download/{identifier}/{entry['name']}",
+            "type": "Image",
+            "format": "image/png" if entry['name'].endswith('.png') else "image/jpeg",
+        }
+        for entry in files
+        if entry['format'] == "Thumbnail"
+    ]
+
+    # Leave the manifest untouched when the item has no thumbnail files.
+    if found:
+        manifest.thumbnail = found
+
 def create_manifest3(identifier, domain=None, page=None):
     # Get item metadata
     metadata = requests.get('%s/metadata/%s' % (ARCHIVE, identifier)).json()
@@ -453,6 +461,7 @@ def create_manifest3(identifier, domain=None, page=None):
     addMetadata(manifest, identifier, metadata['metadata'])
     addSeeAlso(manifest, identifier, metadata['files'])
     addRendering(manifest, identifier, metadata['files'])
+    addThumbnails(manifest, identifier, metadata['files'])
 
     if mediatype == 'texts':
         # Get bookreader metadata (mostly for filenames and height / width of image)
@@ -743,10 +752,8 @@ def ia_resolver(identifier):
 
 def cantaloupe_resolver(identifier):
     """Resolves an existing Image Service identifier to what it should be with the new Cantaloupe setup"""
-    print("called with identifier:", identifier)
     leaf = None
     if "$" in identifier:
-        print("$ in identifier")
         identifier, leaf = identifier.split("$", 1)
 
     metadata = requests.get('%s/metadata/%s' % (ARCHIVE, identifier)).json()
@@ -756,8 +763,6 @@ def cantaloupe_resolver(identifier):
 
     mediatype = metadata['metadata']['mediatype'].lower()
     files = metadata['files']
-    print("mediatype:", mediatype)
-    print("leaf:", leaf)
     if mediatype == "image":
         # single image file - find the filename
 

diff --git a/tests/test_linking.py b/tests/test_linking.py
@@ -0,0 +1,102 @@
+import unittest
+from flask.testing import FlaskClient
+from iiify.app import app
+
+class TestLinking(unittest.TestCase):
+    """Check the rendering, seeAlso and thumbnail links of v3 manifests.
+
+    Every test fetches a manifest through the Flask test client (with
+    ``recache=true``) and compares the link ids with the expected
+    archive.org download URLs.
+    NOTE(review): the manifests are presumably built from live archive.org
+    metadata, so these tests likely need network access - confirm.
+    """
+
+    # Prefix shared by every expected link id.
+    DOWNLOAD = "https://archive.org/download"
+
+    def setUp(self) -> None:
+        self.test_app = FlaskClient(app)
+
+    def convertListToHash(self, items):
+        """Return ``items`` indexed by their first English label."""
+        return {item['label']['en'][0]: item for item in items}
+
+    def checkLink(self, map, field, name, value):
+        """Assert that the link labelled ``name`` is in ``map`` with id ``value``.
+
+        ``field`` is only used to name the manifest property in the failure
+        message.
+        """
+        self.assertIn(name, map, f"Expected to find {name} in {field}")
+        self.assertEqual(map[name]['id'], value, f"Expected {value} in {map[name]}")
+
+    def _getManifest(self, identifier):
+        """Request the v3 manifest for ``identifier`` and return its JSON body."""
+        resp = self.test_app.get(f"/iiif/3/{identifier}/manifest.json?recache=true")
+        self.assertEqual(resp.status_code, 200)
+        return resp.json
+
+    def _checkDownloads(self, links, field, identifier, suffixes):
+        """Check one download link per ``(label, suffix)`` pair.
+
+        The expected id is ``<DOWNLOAD>/<identifier>/<identifier><suffix>``.
+        Each label runs in its own subTest so that one missing link does not
+        hide the others.
+        """
+        for label, suffix in suffixes:
+            with self.subTest(field=field, label=label):
+                self.checkLink(links, field, label, f"{self.DOWNLOAD}/{identifier}/{identifier}{suffix}")
+
+    def test_v3_image_links(self):
+        identifier = "journalofexpedit00ford"
+        manifest = self._getManifest(identifier)
+
+        self.assertIn('rendering', manifest, "Expected rendering in Manifest")
+        renderingMap = self.convertListToHash(manifest['rendering'])
+        self._checkDownloads(renderingMap, "rendering", identifier, [
+            ("Animated GIF", ".gif"),
+            ("Text PDF", ".pdf"),
+            ("Abbyy GZ", "_abbyy.gz"),
+            ("Archive BitTorrent", "_archive.torrent"),
+            ("Grayscale PDF", "_bw.pdf"),
+            ("chOCR", "_chocr.html.gz"),
+            ("DjVuTXT", "_djvu.txt"),
+            ("Djvu XML", "_djvu.xml"),
+            ("hOCR", "_hocr.html"),
+            ("Single Page Processed JP2 ZIP", "_jp2.zip"),
+            ("OCR Search Text", "_hocr_searchtext.txt.gz"),
+            ("Single Page Original JP2 Tar", "_orig_jp2.tar"),
+            ("DjVu", ".djvu"),
+        ])
+
+        self.assertIn('seeAlso', manifest, "Expected seeAlso in Manifest")
+        seeAlsoMap = self.convertListToHash(manifest['seeAlso'])
+        self._checkDownloads(seeAlsoMap, "seeAlso", identifier, [
+            ("Cloth Cover Detection Log", "_cloth_detection.log"),
+            ("Dublin Core", "_dc.xml"),
+            ("OCR Page Index", "_hocr_pageindex.json.gz"),
+            ("MARC", "_marc.xml"),
+            ("MARC Binary", "_meta.mrc"),
+            ("MARC Source", "_metasource.xml"),
+            ("Page Numbers JSON", "_page_numbers.json"),
+            ("Scandata", "_scandata.xml"),
+        ])
+
+    def test_v3_video_links(self):
+        identifier = "DuckandC1951"
+        manifest = self._getManifest(identifier)
+
+        self.assertIn('rendering', manifest, "Expected rendering in Manifest")
+        # Fail with an assertion rather than a KeyError when seeAlso is absent.
+        self.assertIn('seeAlso', manifest, "Expected seeAlso in Manifest")
+        renderingMap = self.convertListToHash(manifest['rendering'])
+        seeAlsoMap = self.convertListToHash(manifest['seeAlso'])
+
+        unknownMessage = "Found Unknown in rendering or seeAlso where it shouldn't be."
+        self.assertNotIn("Unknown", renderingMap, unknownMessage)
+        self.assertNotIn("Unknown", seeAlsoMap, unknownMessage)
+
+        self._checkDownloads(renderingMap, "rendering", identifier, [
+            ("SubRip", ".asr.srt"),
+            ("Web Video Text Tracks", ".asr.vtt"),
+            ("Archive BitTorrent", "_archive.torrent"),
+            ("Intermediate ASR JSON", "_intermediate_asr.json"),
+            ("Whisper ASR JSON", "_whisper_asr.json"),
+        ])
+
+        self._checkDownloads(seeAlsoMap, "seeAlso", identifier, [
+            ("Storj Upload Log", ".storj-store.log"),
+            ("Storj Upload Trigger", ".storj-store.trigger"),
+        ])
+
+        # The item is expected to expose 19 files with the "Thumbnail" format.
+        self.assertIn('thumbnail', manifest, "Expected thumbnail in Manifest")
+        self.assertEqual(len(manifest['thumbnail']), 19, f"Expected 19 thumbnails: {manifest['thumbnail']}")