Merge branch 'spring-2024' into issue-48

internetarchive · May 23, 2024 · 37587f4 · 37587f4
2 parents 62f4107 + f814775
commit 37587f4
Show file tree

Hide file tree

Showing 8 changed files with 428 additions and 26 deletions.
diff --git a/README.md b/README.md
@@ -39,6 +39,11 @@ Unit tests are in the `tests` folder and can be run with:
 python -m unittest discover -s tests
 ```
 
+Run a single test:
+```
+python -m unittest tests.test_video.TestVideo.test_vtt_autogenerated
+```
+
 Retrieve large.jpg as 800px wide JPEG
 * http://127.0.0.1:8080/iiif/large.jpg/full/800,/0/default.jpg 
 

diff --git a/iiify/configs/__init__.py b/iiify/configs/__init__.py
@@ -14,6 +14,7 @@
 import sys
 import types
 import configparser
+import json
 
 path = os.path.dirname(os.path.realpath(__file__))
 approot = os.path.abspath(os.path.join(path, os.pardir))
@@ -75,3 +76,6 @@ def getdef(self, section, option, default_value):
     "long": 432000,  # 5 days
     "longest": 2592000  # 30 days
 }
+
+# Load links.json from this module's own directory (``path``) into LINKS.
+# Each key is a file "format" name; each value holds the "field", "type" and
+# "format" used when building a link for a file of that format.
+with open('%s/links.json' % path, 'r') as file:
+    LINKS = json.load(file)
diff --git a/iiify/configs/links.json b/iiify/configs/links.json
@@ -0,0 +1,137 @@
+{
+    "Animated GIF": {
+        "field": "rendering",
+        "type": "Image",
+        "format": "image/gif"
+    },
+    "Text PDF": {
+	    "field": "rendering",
+        "type": "Text",
+        "format": "application/pdf"
+    },
+    "Abbyy GZ": {
+        "field": "rendering",
+        "type": "Dataset",
+        "format": "application/gzip"
+    },
+    "Archive BitTorrent": {
+        "field": "rendering",
+        "type": "Dataset",
+        "format": "application/x-bittorrent"
+    },
+    "Grayscale PDF": {
+        "field": "rendering",
+        "type": "Text",
+        "format": "application/pdf"
+    },
+    "chOCR": {
+        "field": "rendering",
+        "type": "Text",
+        "format": "application/gzip"
+    },
+    "DjVuTXT": {
+        "field": "rendering",
+        "type": "Text",
+        "format": "text/plain"
+    },
+    "Djvu XML": {
+        "field": "rendering",
+        "type": "Dataset",
+        "format": "application/xml"
+    },
+    "hOCR": {
+        "field": "rendering",
+        "type": "Text",
+        "format": "text/html"
+    },
+    "Single Page Processed JP2 ZIP": {
+        "field": "rendering",
+        "type": "Image",
+        "format": "application/zip"
+    },
+    "OCR Search Text": {
+        "field": "rendering",
+        "type": "Text",
+        "format": "application/gzip"
+    },
+    "Single Page Original JP2 Tar": {
+        "field": "rendering",
+        "type": "Image",
+        "format": "application/x-tar"
+    },
+    "DjVu": {
+        "field": "rendering",
+        "type": "Image",
+        "format": "image/vnd.djvu"
+    },
+    "Cloth Cover Detection Log": {
+        "field": "seeAlso",
+        "type": "Text",
+        "format": "text/plain"
+    },
+    "Dublin Core": {
+        "field": "seeAlso",
+        "type": "Dataset",
+        "format": "application/xml"
+    },
+    "OCR Page Index": {
+        "field": "seeAlso",
+        "type": "Dataset",
+        "format": "application/json"
+    },
+    "MARC": {
+        "field": "seeAlso",
+        "type": "Dataset",
+        "format": "application/xml"
+    },
+    "MARC Binary": {
+        "field": "seeAlso",
+        "type": "Dataset",
+        "format": "application/marc"
+    },
+    "MARC Source": {
+        "field": "seeAlso",
+        "type": "Dataset",
+        "format": "application/xml"
+    },
+    "Page Numbers JSON": {
+        "field": "seeAlso",
+        "type": "Dataset",
+        "format": "application/json"
+    },
+    "Scandata": {
+        "field": "seeAlso",
+        "type": "Dataset",
+        "format": "application/xml"
+    },
+    "SubRip": {
+        "field": "rendering",
+        "type": "Text",
+        "format": "text/plain"
+    },
+    "Web Video Text Tracks": {
+        "field": "rendering",
+        "type": "Text",
+        "format": "text/vtt"
+    },
+    "Intermediate ASR JSON": {
+        "field": "rendering",
+        "type": "Text",
+        "format": "application/json"
+    },
+    "Whisper ASR JSON": {
+        "field": "rendering",
+        "type": "Text",
+        "format": "application/json"
+    },
+    "Storj Upload Log": {
+        "field": "seeAlso",
+        "type": "Text",
+        "format": "text/plain"
+    },
+    "Storj Upload Trigger": {
+        "field": "seeAlso",
+        "type": "Text",
+        "format": "text/plain"
+    }
+}
diff --git a/iiify/resolver.py b/iiify/resolver.py
@@ -3,8 +3,9 @@
 import os
 import requests
 from iiif2 import iiif, web
-from .configs import options, cors, approot, cache_root, media_root, apiurl
+from .configs import options, cors, approot, cache_root, media_root, apiurl, LINKS
 from iiif_prezi3 import Manifest, config, Annotation, AnnotationPage,AnnotationPageRef, Canvas, Manifest, ResourceItem, ServiceItem, Choice, Collection, ManifestRef, CollectionRef
+
 from urllib.parse import urlparse, parse_qs, quote
 import json
 import math 
@@ -13,7 +14,7 @@
 
 IMG_CTX = 'http://iiif.io/api/image/2/context.json'
 PRZ_CTX = 'http://iiif.io/api/presentation/2/context.json'
-ARCHIVE = 'http://archive.org'
+ARCHIVE = 'https://archive.org'
 IMG_SRV = 'https://iiif.archive.org/image/iiif'
 METADATA_FIELDS = ("title", "volume", "publisher", "subject", "date", "contributor", "creator")
 bookdata = 'http://%s/BookReader/BookReaderJSON.php'
@@ -385,7 +386,66 @@ def addMetadata(item, identifier, metadata, collection=False):
 
     item.metadata = manifest_metadata
 
+def addSeeAlso(manifest, identifier, files):
+
+    manifest.seeAlso = [
+        {"id": f"{ARCHIVE}/metadata/{identifier}",
+         "type": "Metadata",
+         "label": {"en": ["Item Metadata"]},
+         "format": "application/json"}
+    ]
+
+    # Type format from IA Metadata -> Type description in IIIF
+    SEEALSO_TYPES = {
+        "Abbyy GZ": "OCR Data",
+        "Abbyy XML": "OCR Data",
+        "Djvu XML": "OCR Data",
+        "Scandata": "OCR Data",
+        "Archive BitTorrent": "Torrent",
+        "Metadata": "Metadata",
+    }
+
+    for file in files:
+        if file['format'] in LINKS and LINKS[file['format']]['field'] == 'seeAlso':
+            seeAlso = LINKS[file['format']]
+            manifest.seeAlso.append(
+                {"id": f"{ARCHIVE}/download/{identifier}/{file['name']}",
+                 "type": seeAlso['type'],
+                 "label": {"en": [file["format"]]},
+                 "format": seeAlso['format']
+                 })
+
+
+def addRendering(manifest, identifier, files):
+    manifest.rendering = []
+
+    for file in files:
+        if file['format'] in LINKS and LINKS[file['format']]['field'] == 'rendering':
+            rendering = LINKS[file['format']]
+            manifest.rendering.append(
+                {"id": f"{ARCHIVE}/download/{identifier}/{file['name']}",
+                 "type": rendering['type'],
+                 "label": {"en": [file["format"]]},
+                 "format": rendering['format']
+                 })
+
+def addThumbnails(manifest, identifier, files):
+    """Set ``manifest.thumbnail`` from the files whose format is "Thumbnail".
+
+    Each matching file becomes an Image entry pointing at its download URL.
+    ``manifest.thumbnail`` is left untouched when no such file exists.
+    """
+    thumbnails = []
+
+    for file in files:
+        if file['format'] == "Thumbnail":
+            # Default to JPEG; only a ".png" suffix switches the media type.
+            mimetype = "image/jpeg"
+            if file['name'].endswith('.png'):
+                mimetype = "image/png"
+
+            thumbnails.append({
+                "id": f"{ARCHIVE}/download/{identifier}/{file['name']}",
+                "type": "Image",
+                "format": mimetype,
+            })

+    # Only assign when at least one thumbnail was found.
+    if thumbnails:
+        manifest.thumbnail = thumbnails
 
 def create_manifest3(identifier, domain=None, page=None):
     # Get item metadata
@@ -401,6 +461,9 @@ def create_manifest3(identifier, domain=None, page=None):
     manifest = Manifest(id=f"{uri}/manifest.json", label=metadata["metadata"]["title"])
 
     addMetadata(manifest, identifier, metadata['metadata'])
+    addSeeAlso(manifest, identifier, metadata['files'])
+    addRendering(manifest, identifier, metadata['files'])
+    addThumbnails(manifest, identifier, metadata['files'])
 
     if mediatype == 'texts':
         # Get bookreader metadata (mostly for filenames and height / width of image)
@@ -570,6 +633,7 @@ def create_manifest3(identifier, domain=None, page=None):
         # sort the files into originals and derivatives, splitting the derivatives into buckets based on the original
         originals = []
         derivatives = {}
+        vttfiles = {}
         for f in metadata['files']:
             if f['source'] == 'derivative':
                 if f['original'] in derivatives:
@@ -578,6 +642,14 @@ def create_manifest3(identifier, domain=None, page=None):
                     derivatives[f['original']] = {f['format']: f}
             elif f['source'] == 'original':
                 originals.append(f)
+
+            if f['format'] == 'Web Video Text Tracks':
+                # Example: cruz-test.en.vtt and 34C3_-_International_Image_Interoperability_Framework_IIIF_Kulturinstitutionen_schaffen_interop-SvH4fbjOT0A.autogenerated.vtt
+                sourceFilename = re.sub('\.[a-zA-H-]*\.vtt', '', f['name'])
+                if sourceFilename not in vttfiles:
+                    vttfiles[sourceFilename] = []    
+
+                vttfiles[sourceFilename].append(f)    
 
         # create the canvases for each original
         for file in [f for f in originals if f['format'] in ['MPEG4', 'h.264 MPEG4', '512Kb MPEG4', 'HiRes MPEG4', 'MPEG2', 'h.264', 'Matroska', 'Ogg Video', 'Ogg Theora', 'WebM', 'Windows Media', 'Cinepack']]:
@@ -586,6 +658,32 @@ def create_manifest3(identifier, domain=None, page=None):
             c_id = f"{URI_PRIFIX}/{identifier}/{slugged_id}/canvas"
             c = Canvas(id=c_id, label=normalised_id, duration=float(file['length']), height=int(file['height']), width=int(file['width']))
 
+            # Add vtt if present
+            if vttfiles and normalised_id in vttfiles:
+                vttAPId = f"{URI_PRIFIX}/{identifier}/{slugged_id}/vtt"
+
+                vttNo = 1
+                for vttFile in vttfiles[normalised_id]:
+                    vtAnno = c.make_annotation(id=f"{URI_PRIFIX}/{identifier}/{slugged_id}/annotation/vtt/{vttNo}", 
+                                               motivation="supplementing", 
+                                               target=c.id, 
+                                               anno_page_id=vttAPId,
+                                               body={"id": f"{domain}resource/{identifier}/{vttFile['name']}",
+                                                     "type": "Text",
+                                                    "format": "text/vtt",
+                                                    })
+                    # add label and language
+                    if vttFile['name'].endswith("autogenerated.vtt"):
+                        vtAnno.body.label = { 'en': ['autogenerated']}
+                    else:
+                        # Assume language
+                        splitName = vttFile['name'].split(".")
+                        lang = splitName[-2]
+                        vtAnno.body.add_label(lang, language="none")
+                        vtAnno.body.language = lang
+
+                    vttNo += 1
+
             # create intermediary objects
             ap = AnnotationPage(id=f"{URI_PRIFIX}/{identifier}/{slugged_id}/page")
             anno = Annotation(id=f"{URI_PRIFIX}/{identifier}/{slugged_id}/annotation", motivation="painting", target=c.id)
@@ -745,7 +843,6 @@ def ia_resolver(identifier):
 
 def cantaloupe_resolver(identifier):
     """Resolves an existing Image Service identifier to what it should be with the new Cantaloupe setup"""
-
     leaf = None
     if "$" in identifier:
         identifier, leaf = identifier.split("$", 1)
@@ -757,7 +854,6 @@ def cantaloupe_resolver(identifier):
 
     mediatype = metadata['metadata']['mediatype'].lower()
     files = metadata['files']
-
     if mediatype == "image":
         # single image file - find the filename
 
@@ -808,11 +904,12 @@ def cantaloupe_resolver(identifier):
 
         #filename = next(f for f in files if f['source'].lower() == 'derivative' \
         #                and f['name'].endswith('_jp2.zip'))['name']
+        print("end of logic - filename:", filename)
         if filename:
             dirpath = filename[:-4]
             filepath = f"{fileIdentifier}_{leaf.zfill(4)}{extension}"
             return f"{identifier}%2f{filename}%2f{dirpath}%2f{filepath}"
 
-    # print (f'images not found for {identifier}')
-    # for f in files:
-    #     print (f"source: {f['source'].lower()} name: {f['name']} and {f['source'].lower() == 'derivative'} {f['name'].endswith('_jp2.zip')}")
+ #   print (f'images not found for {identifier}')
+ #   for f in files:
+ #       print (f"source: {f['source'].lower()} name: {f['name']} and {f['source'].lower() == 'derivative'} {f['name'].endswith('_jp2.zip')}")
diff --git a/nginx-vhost.conf b/nginx-vhost.conf
@@ -37,4 +37,11 @@ server {
         # Reverse proxy with the variables captured above
         proxy_pass https://cantaloupe.prod.archive.org/iiif/$1/$2;
     }
+
+    location /iiif/resource/ {
+        # "always" adds the CORS headers to every response, whatever its status code.
+        add_header 'Access-Control-Allow-Origin' '*' always;
+        add_header 'Access-Control-Allow-Methods' 'GET, HEAD, POST, PUT, PATCH, DELETE' always;
+        # Because proxy_pass carries a URI, the matched "/iiif/resource/" prefix is
+        # replaced by "/download/", e.g. /iiif/resource/cruz-test/cruz-test.af.vtt ->
+        # https://archive.org/download/cruz-test/cruz-test.af.vtt
+        proxy_pass https://archive.org/download/;
+    }
 }