From 53b3da1ea67b9d42c90095738bbdfccbfe86c4d9 Mon Sep 17 00:00:00 2001 From: Madison Swain-Bowden Date: Wed, 15 Mar 2023 14:12:28 -0700 Subject: [PATCH] Remove alternate image extraction from SMK, fix foreign landing URL (#1003) * Remove alternative images logic from SMK * Correctly encode foreign landing urls * Set the watermarked value to True temporarily for SMK --- .../providers/provider_api_scripts/smk.py | 39 ++-- .../resources/smk/expected_image_data_hq.json | 8 - .../smk/expected_image_data_legacy.json | 8 - .../smk/expected_image_data_partial.json | 10 -- .../smk/item_with_alternative_images.json | 168 ------------------ .../provider_api_scripts/test_smk.py | 30 ++-- 6 files changed, 30 insertions(+), 233 deletions(-) delete mode 100644 tests/dags/providers/provider_api_scripts/resources/smk/expected_image_data_partial.json delete mode 100644 tests/dags/providers/provider_api_scripts/resources/smk/item_with_alternative_images.json diff --git a/openverse_catalog/dags/providers/provider_api_scripts/smk.py b/openverse_catalog/dags/providers/provider_api_scripts/smk.py index 9e18635d6..19e72e846 100644 --- a/openverse_catalog/dags/providers/provider_api_scripts/smk.py +++ b/openverse_catalog/dags/providers/provider_api_scripts/smk.py @@ -8,6 +8,7 @@ Notes: https://www.smk.dk/en/article/smk-api/ """ import logging +import urllib.parse from common import constants from common.licenses import get_license_info @@ -56,7 +57,9 @@ def _get_foreign_landing_url(item) -> str | None: "foreign_landing_url." ) return - return f"https://open.smk.dk/en/artwork/image/{object_num}" + # Occasionally, the object number will have a space in it. for these cases we + # need to urlencode it. + return f"https://open.smk.dk/en/artwork/image/{urllib.parse.quote(object_num)}" @staticmethod def _get_image_url(image_iiif_id: str, image_size=2048): @@ -113,31 +116,11 @@ def _get_images(item: dict) -> list: } ) - alternative_images = item.get("alternative_images") - if type(alternative_images) == list: - for alt_img in alternative_images: - if type(alt_img) == dict: - iiif_id = alt_img.get("iiif_id") - if iiif_id is None: - # The API for alternative images does not include the - # 'id', so we must skip if `iiif_id` is not present. - continue - image_url = SmkDataIngester._get_image_url(iiif_id) - thumbnail_url = alt_img.get("thumbnail") - height = alt_img.get("height") - width = alt_img.get("width") - filesize = alt_img.get("image_size") or alt_img.get("size") - - images.append( - { - "id": iiif_id, - "image_url": image_url, - "thumbnail_url": thumbnail_url, - "height": height, - "width": width, - "filesize": filesize, - } - ) + # We used to get additional images from the `alternative_images` field, + # but we found these to be either redundant, duplicates, or lower quality + # versions. See https://github.com/WordPress/openverse-catalog/issues/875 + # for the full investigation & discussion + return images @staticmethod @@ -176,6 +159,10 @@ def get_record_data(self, data: dict) -> dict | list[dict] | None: "width": img.get("width"), "filesize": img.get("filesize"), "meta_data": self._get_metadata(data), + # FIXME: USED AS A SENTINEL FOR WHICH FILES TO DELETE AFTER A FULL + # RUN. THIS SHOULD BE REMOVED AS SOON AS A RUN IS COMPLETE AND THE + # FILES MARKED AS WATERMARKED=False ARE DELETED. + "watermarked": True, } ) return images diff --git a/tests/dags/providers/provider_api_scripts/resources/smk/expected_image_data_hq.json b/tests/dags/providers/provider_api_scripts/resources/smk/expected_image_data_hq.json index d595e1ba9..66f501bb5 100644 --- a/tests/dags/providers/provider_api_scripts/resources/smk/expected_image_data_hq.json +++ b/tests/dags/providers/provider_api_scripts/resources/smk/expected_image_data_hq.json @@ -6,13 +6,5 @@ "image_url": "https://iip.smk.dk/iiif/jp2/KKSgb6458.tif.reconstructed.tif.jp2/full/!2048,/0/default.jpg", "thumbnail_url": "https://iip-thumb.smk.dk/iiif/jp2/2227ms627_KKSgb6458.tif.reconstructed.tif.jp2/full/!1024,/0/default.jpg", "width": 3887 - }, - { - "filesize": 19269857, - "height": 1576, - "id": "https://iip.smk.dk/iiif/jp2/KKSgb6458.tif.jp2", - "image_url": "https://iip.smk.dk/iiif/jp2/KKSgb6458.tif.jp2/full/!2048,/0/default.jpg", - "thumbnail_url": "https://iip-thumb.smk.dk/iiif/jp2/KKSgb6458.tif.jp2/full/!1024,/0/default.jpg", - "width": 4073 } ] diff --git a/tests/dags/providers/provider_api_scripts/resources/smk/expected_image_data_legacy.json b/tests/dags/providers/provider_api_scripts/resources/smk/expected_image_data_legacy.json index 47f2c9c39..7fad0db3f 100644 --- a/tests/dags/providers/provider_api_scripts/resources/smk/expected_image_data_legacy.json +++ b/tests/dags/providers/provider_api_scripts/resources/smk/expected_image_data_legacy.json @@ -6,13 +6,5 @@ "image_url": "https://api.smk.dk/api/v1/thumbnail/52f00edc-936e-42a7-950b-d0cd0df3864b.jpg", "thumbnail_url": "https://api.smk.dk/api/v1/thumbnail/52f00edc-936e-42a7-950b-d0cd0df3864b.jpg", "width": 3887 - }, - { - "filesize": 19269857, - "height": 1576, - "id": "https://iip.smk.dk/iiif/jp2/KKSgb6458.tif.jp2", - "image_url": "https://iip.smk.dk/iiif/jp2/KKSgb6458.tif.jp2/full/!2048,/0/default.jpg", - "thumbnail_url": "https://iip-thumb.smk.dk/iiif/jp2/KKSgb6458.tif.jp2/full/!1024,/0/default.jpg", - "width": 4073 } ] diff --git a/tests/dags/providers/provider_api_scripts/resources/smk/expected_image_data_partial.json b/tests/dags/providers/provider_api_scripts/resources/smk/expected_image_data_partial.json deleted file mode 100644 index 9d3506434..000000000 --- a/tests/dags/providers/provider_api_scripts/resources/smk/expected_image_data_partial.json +++ /dev/null @@ -1,10 +0,0 @@ -[ - { - "filesize": 19269857, - "height": 1576, - "id": "https://iip.smk.dk/iiif/jp2/KKSgb6458.tif.jp2", - "image_url": "https://iip.smk.dk/iiif/jp2/KKSgb6458.tif.jp2/full/!2048,/0/default.jpg", - "thumbnail_url": "https://iip-thumb.smk.dk/iiif/jp2/KKSgb6458.tif.jp2/full/!1024,/0/default.jpg", - "width": 4073 - } -] diff --git a/tests/dags/providers/provider_api_scripts/resources/smk/item_with_alternative_images.json b/tests/dags/providers/provider_api_scripts/resources/smk/item_with_alternative_images.json deleted file mode 100644 index e68e8a309..000000000 --- a/tests/dags/providers/provider_api_scripts/resources/smk/item_with_alternative_images.json +++ /dev/null @@ -1,168 +0,0 @@ -{ - "acquisition_date": "1787-01-01T00:00:00Z", - "acquisition_date_precision": "1887-12-31", - "alternative_images": [ - { - "height": 3524, - "iiif_id": "https://iip.smk.dk/iiif/jp2/1c18df916_kksgb7575_001.tif.jp2", - "iiif_info": "https://iip.smk.dk/iiif/jp2/1c18df916_kksgb7575_001.tif.jp2/info.json", - "mime_type": "image/tiff", - "native": "https://iip.smk.dk/iiif/jp2/1c18df916_kksgb7575_001.tif.jp2/full/full/0/native.jpg", - "orientation": "portrait", - "size": 11347216.551724138, - "thumbnail": "https://iip-thumb.smk.dk/iiif/jp2/1c18df916_kksgb7575_001.tif.jp2/full/!1024,/0/default.jpg", - "width": 3110 - }, - { - "cropped": ["true"], - "height": 3142, - "iiif_id": "https://iip.smk.dk/iiif/jp2/p8418r38n_kksgb7575_-_001.tif.reconstructed.tif.jp2", - "iiif_info": "https://iip.smk.dk/iiif/jp2/p8418r38n_kksgb7575_-_001.tif.reconstructed.tif.jp2/info.json", - "mime_type": "image/tiff", - "native": "https://iip.smk.dk/iiif/jp2/p8418r38n_kksgb7575_-_001.tif.reconstructed.tif.jp2/full/full/0/native.jpg", - "orientation": "portrait", - "size": 7452243.448275862, - "thumbnail": "https://iip-thumb.smk.dk/iiif/jp2/p8418r38n_kksgb7575_-_001.tif.reconstructed.tif.jp2/full/!1024,/0/default.jpg", - "width": 2302 - } - ], - "artist": ["Gheyn, Jacques II de"], - "brightness": 6.35253, - "collection": ["Gammel bestand"], - "colors": ["#888888", "#ffe7c9", "#bed69f", "#382510", "#777777"], - "colortemp": 3.8521814, - "content_person": ["Brahe, Tycho"], - "content_person_full": [ - { - "forename": "Tycho", - "full_name": "Tycho Brahe", - "gender": "UNKNOWN", - "name": "Brahe, Tycho", - "surname": "Brahe" - } - ], - "contrast": 4.1983867, - "created": "2020-03-21T08:32:04Z", - "dimensions": [ - { - "notes": "186 x 134 mm", - "part": "bladmaal", - "type": "h\u00f8jde", - "unit": "milimeter", - "value": "187" - }, - { - "notes": "186 x 134 mm", - "part": "bladmaal", - "type": "bredde", - "unit": "milimeter", - "value": "142" - } - ], - "documentation": [ - { - "author": "Jan Piet Filedt Kok", - "notes": "238 II", - "shelfmark": "k2000-158", - "title": "The New Hollstein Dutch and Flemish etchings, engravings and woodcuts 1450-1700, The de Gheyn family, part I-II", - "year_of_publication": "2000" - }, - { - "author": "Poul Grinder-Hansen", - "notes": "omt. og afb. p. 30 fig. 6", - "shelfmark": "C 35560", - "title": "Tycho Brahes verden: Danmark i Europa 1550-1600", - "year_of_publication": "2006" - } - ], - "enrichment_url": "https://enrichment.api.smk.dk/api/enrichment/KKSgb7575", - "entropy": 9.792481, - "frontend_url": "https://open.smk.dk/artwork/image/kksgb7575", - "has_3d_file": false, - "has_image": true, - "has_text": true, - "id": "1170001143_object", - "iiif_manifest": "https://api.smk.dk/api/v1/iiif/manifest/?id=kksgb7575", - "image_cropped": true, - "image_height": 3026, - "image_hq": true, - "image_iiif_id": "https://iip.smk.dk/iiif/jp2/t722hd324_KKSgb7575_crop.tif.jp2", - "image_iiif_info": "https://iip.smk.dk/iiif/jp2/t722hd324_KKSgb7575_crop.tif.jp2/info.json", - "image_mime_type": "image/tiff", - "image_native": "https://api.smk.dk/api/v1/download/W3siaW1nX3VybCI6Imh0dHBzOi8vaWlwLnNtay5kay9paWlmL2pwMi90NzIyaGQzMjRfS0tTZ2I3NTc1X2Nyb3AudGlmLmpwMi9mdWxsL2Z1bGwvMC9uYXRpdmUuanBnIiwicHVibGljX2RvbWFpbiI6dHJ1ZSwib2JqZWN0X251bWJlciI6IktLU2diNzU3NSIsIm51bSI6IiJ9XQ==/KKSgb7575.jpg", - "image_orientation": "portrait", - "image_size": 6706084, - "image_thumbnail": "https://iip-thumb.smk.dk/iiif/jp2/t722hd324_KKSgb7575_crop.tif.jp2/full/!1024,/0/default.jpg", - "image_width": 2183, - "inscriptions": [ - { - "content": "\"Effigies Tychonis Brahe ottonidis dani/dni de knudstrup et arcis uranienburg in/insula hellisponti danici hvenna fundatoris/instrumentorumos astronomicorum in eadem/dispositarim inventoris et structoris/aetatis su\u00e6 anno 40.anno dni 1586.compl/\"", - "description": "Kobberstik", - "language": "Latin", - "type": "Tekst" - }, - { - "content": "Grafikers og udgivers navn anf\u00f8rt i trykpladen", - "type": "Signatur" - } - ], - "modified": "2022-06-30T07:24:58Z", - "number_of_parts": 1, - "object_names": [ - { - "name": "Kobberstik" - } - ], - "object_number": "KKSgb7575", - "object_url": "https://api.smk.dk/api/v1/art/?object_number=kksgb7575", - "on_display": false, - "production": [ - { - "creator": "Gheyn, Jacques II de", - "creator_date_of_birth": "1565-01-01T00:00:00.000Z", - "creator_date_of_death": "1629-01-01T00:00:00.000Z", - "creator_gender": "MALE", - "creator_lref": "22711_person", - "creator_nationality": "Flamsk" - }, - { - "creator": "Sadeler, Marcus", - "creator_date_of_birth": "1614-04-01T00:00:00.000Z", - "creator_date_of_death": "1650-01-01T00:00:00.000Z", - "creator_forename": "Marcus", - "creator_gender": "UNKNOWN", - "creator_lref": "30171_person", - "creator_nationality": "Tysk", - "creator_role": "Udgiver", - "creator_surname": "Sadeler" - } - ], - "production_date": [ - { - "end": "1594-12-31T00:00:00.000Z", - "period": "1593-1594", - "start": "1593-01-01T00:00:00.000Z" - } - ], - "production_dates_notes": ["V\u00e6rkdatering: ca. 1595"], - "public_domain": true, - "related_objects": [ - { - "reference": "KKS4530", - "title": "Tycho Brahe" - } - ], - "responsible_department": "Den Kongelige Kobberstiksamling", - "rights": "https://creativecommons.org/publicdomain/zero/1.0/", - "saturation": 2.495402, - "similar_images_url": "https://similar.api.smk.dk/similar/?object_number=KKSgb7575", - "suggested_bg_color": ["#888888"], - "techniques": ["Kobberstik"], - "titles": [ - { - "language": "dansk", - "title": "Tycho Brahe", - "type": "museumstitel" - } - ] -} diff --git a/tests/dags/providers/provider_api_scripts/test_smk.py b/tests/dags/providers/provider_api_scripts/test_smk.py index 9c8e21cf5..4d8f0a721 100644 --- a/tests/dags/providers/provider_api_scripts/test_smk.py +++ b/tests/dags/providers/provider_api_scripts/test_smk.py @@ -1,6 +1,7 @@ import json from pathlib import Path +import pytest from common.licenses import LicenseInfo from providers.provider_api_scripts.smk import SmkDataIngester @@ -57,10 +58,20 @@ def test_get_next_query_params_increments_offset(): assert actual_param == expected_param -def test__get_foreign_landing_url(): - item = {"object_number": "KKSgb22423"} +@pytest.mark.parametrize( + "object_number, expected_url", + [ + ("KKSgb22423", "https://open.smk.dk/en/artwork/image/KKSgb22423"), + ( + "KKSgb22423 version 1", + "https://open.smk.dk/en/artwork/image/KKSgb22423%20version%201", + ), + ("KSMB 25 106.5", "https://open.smk.dk/en/artwork/image/KSMB%2025%20106.5"), + ], +) +def test__get_foreign_landing_url(object_number, expected_url): + item = {"object_number": object_number} actual_url = smk._get_foreign_landing_url(item) - expected_url = "https://open.smk.dk/en/artwork/image/KKSgb22423" assert actual_url == expected_url @@ -100,12 +111,12 @@ def test__get_images_legacy(): def test__get_images_partial(): + # Left in as a regression test for + # https://github.com/WordPress/openverse-catalog/issues/875 item = _get_resource_json("image_data_partial.json") - expected_images_data = _get_resource_json("expected_image_data_partial.json") - actual_images_data = smk._get_images(item) - assert actual_images_data == expected_images_data + assert actual_images_data == [] def test__get_metadata(): @@ -125,10 +136,3 @@ def test_get_record_data_returns_main_image(): images = smk.get_record_data(item) assert len(images) == 1 - - -def test_get_record_data_returns_alternative_images(): - item = _get_resource_json("item_with_alternative_images.json") - images = smk.get_record_data(item) - - assert len(images) == 3