Adding support for autogenerated vtt files

internetarchive · Apr 26, 2024 · b45bd71 · b45bd71
1 parent 23e70c4
commit b45bd71
Show file tree

Hide file tree

Showing 2 changed files with 60 additions and 0 deletions.
diff --git a/iiify/resolver.py b/iiify/resolver.py
@@ -506,6 +506,7 @@ def create_manifest3(identifier, domain=None, page=None):
         # sort the files into originals and derivatives, splitting the derivatives into buckets based on the original
         originals = []
         derivatives = {}
+        vttfiles = {}
         for f in metadata['files']:
             if f['source'] == 'derivative':
                 if f['original'] in derivatives:
@@ -514,6 +515,14 @@ def create_manifest3(identifier, domain=None, page=None):
                     derivatives[f['original']] = {f['format']: f}
             elif f['source'] == 'original':
                 originals.append(f)
+
+            if f['format'] == 'Web Video Text Tracks':
+                # Example: 34C3_-_International_Image_Interoperability_Framework_IIIF_Kulturinstitutionen_schaffen_interop-SvH4fbjOT0A.autogenerated.vtt
+                sourceFilename = f['name'].replace('.autogenerated.vtt', '')
+                # Example: cruz-test.en.vtt
+                sourceFilename = sourceFilename.replace('[a-z][a-z].vtt', '')
+
+                vttfiles[sourceFilename] = [f]    
 
         # create the canvases for each original
         for file in [f for f in originals if f['format'] in ['MPEG4', 'h.264 MPEG4', '512Kb MPEG4', 'HiRes MPEG4', 'MPEG2', 'h.264', 'Matroska', 'Ogg Video', 'Ogg Theora', 'WebM', 'Windows Media', 'Cinepack']]:
@@ -522,6 +531,32 @@ def create_manifest3(identifier, domain=None, page=None):
             c_id = f"{URI_PRIFIX}/{identifier}/{slugged_id}/canvas"
             c = Canvas(id=c_id, label=normalised_id, duration=float(file['length']), height=int(file['height']), width=int(file['width']))
 
+            # Add vtt if present
+            if vttfiles and normalised_id in vttfiles:
+                vttAPId = f"{URI_PRIFIX}/{identifier}/{slugged_id}/vtt"
+
+                vttNo = 1
+                for vttFile in vttfiles[normalised_id]:
+                    vtAnno = c.make_annotation(id=f"{URI_PRIFIX}/{identifier}/{slugged_id}/annotation/vtt/{vttNo}", 
+                                               motivation="supplementing", 
+                                               target=c.id, 
+                                               anno_page_id=vttAPId,
+                                               body={"id": f"https://archive.org/download/{identifier}/{vttFile['name']}",
+                                                     "type": "Text",
+                                                    "format": "text/vtt",
+                                                    })
+                    # add label and language
+                    if vttFile['name'].endswith("autogenerated.vtt"):
+                        vtAnno.body.label = { 'en': ['autogenerated']}
+                    else:
+                        # Assume the second-to-last dot-separated part of the filename is the language code (e.g. "en" in cruz-test.en.vtt)
+                        splitName = vttFile['name'].split(".")
+                        lang = splitName[-2]
+                        vtAnno.body.add_label(lang, language=lang)
+                        vtAnno.body.language = lang
+
+                    vttNo += 1
+
             # create intermediary objects
             ap = AnnotationPage(id=f"{URI_PRIFIX}/{identifier}/{slugged_id}/page")
             anno = Annotation(id=f"{URI_PRIFIX}/{identifier}/{slugged_id}/annotation", motivation="painting", target=c.id)

diff --git a/tests/test_video.py b/tests/test_video.py
@@ -22,5 +22,30 @@ def test_v3_h264_MPEG4_OGG_Theora(self):
         self.assertEqual("h.264 MPEG4".lower() in resp.text.lower(), True, f"Expected the string 'h.264 MPEG4'")
         self.assertEqual("OGG Theora".lower() in resp.text.lower(), True, f"Expected the string 'OGG Theora'")
 
+    def test_vtt_autogenerated(self):
+        """Check that an autogenerated VTT file is exposed as a supplementing annotation.
+
+        Requests the v3 manifest for ``youtube-SvH4fbjOT0A`` and verifies that
+        its single canvas carries one annotation page holding one annotation
+        whose body references the ``.autogenerated.vtt`` file, is labelled
+        "autogenerated" and has no language set.
+        """
+        resp = self.test_app.get("/iiif/3/youtube-SvH4fbjOT0A/manifest.json?recache=true")
+        self.assertEqual(resp.status_code, 200)
+        manifest = resp.json
+
+        self.assertEqual(len(manifest['items']), 1, f"Expected 1 canvas but got: {len(manifest['items'])}")
+        canvas = manifest['items'][0]
+        self.assertIn('annotations', canvas, "Expected annotations in manifest")
+        self.assertIsInstance(canvas['annotations'], list, "Expected annotations to be a list")
+        self.assertEqual(len(canvas['annotations']), 1, "Expected 1 item in annotations")
+        annotationPage = canvas['annotations'][0]
+        self.assertEqual(annotationPage['type'], 'AnnotationPage', "Expected annotations to contain annotation page")
+
+        # Fetch the items with .get() so that a missing key is reported through
+        # the assertion message instead of raising KeyError while formatting it.
+        items = annotationPage.get('items')
+        self.assertTrue(isinstance(items, list) and len(items) == 1, f"Expected annotation page to contain a list of items which contains 1 item. Found {items}")
+        annotation = items[0]
+        self.assertEqual(annotation['type'], 'Annotation', "Expected annotationPage to contain annotations")
+        self.assertEqual(annotation['motivation'], 'supplementing', "Expected annotation to have the supplementing motivation")
+        self.assertIn('body', annotation, "Expected annotation to have a body")
+        body = annotation['body']
+        self.assertEqual(body['type'], 'Text', "Expected body to have a type text")
+        self.assertEqual(body['format'], 'text/vtt', "Expected body to have the format text/vtt")
+        self.assertEqual(body['label']['en'][0], "autogenerated", "Expected VTT file to have the label autogenerated")
+        # The autogenerated file name carries no language code, so none should be emitted.
+        self.assertNotIn("language", body, "We don't know the language for this item so there shouldn't be a language specified")
+        self.assertEqual(body['id'], "https://archive.org/download/youtube-SvH4fbjOT0A/34C3_-_International_Image_Interoperability_Framework_IIIF_Kulturinstitutionen_schaffen_interop-SvH4fbjOT0A.autogenerated.vtt", "Unexpected URL for the VTT file")
+
+
 if __name__ == '__main__':
     unittest.main()