From ee2e996c317d2767719f8a78a3e7dcb0059bc875 Mon Sep 17 00:00:00 2001 From: "bosteen@getty.edu" Date: Thu, 19 Sep 2024 14:40:35 -0700 Subject: [PATCH 1/9] Adding a test for the idprefixer with RDF prefixed 'id' URIs --- source/web-service/tests/conftest.py | 30 +++++++++++++++++++ .../web-service/tests/test_routes_records.py | 20 +++++++++++++ 2 files changed, 50 insertions(+) diff --git a/source/web-service/tests/conftest.py b/source/web-service/tests/conftest.py index 82fe25b4..c41c9c84 100644 --- a/source/web-service/tests/conftest.py +++ b/source/web-service/tests/conftest.py @@ -180,6 +180,36 @@ def _sample_record(): return _sample_record +@pytest.fixture +def sample_rdfrecord_with_context(test_db): + def _sample_record(): + record = Record( + entity_id=str(uuid4()), + entity_type="Object", + datetime_created=datetime(2020, 11, 22, 13, 2, 53), + datetime_updated=datetime(2020, 12, 18, 11, 22, 7), + data={ + "@context": { + "dc": "http://purl.org/dc/elements/1.1/", + "rdfs": "http://www.w3.org/2000/01/rdf-schema#", + "_label": {"@id": "http://www.w3.org/2000/01/rdf-schema#label"}, + }, + "@id": "rdfsample1", + "rdfs:seeAlso": [ + { + "_label": "This is a meaningless bit of data to test if the idprefixer leaves the id alone", + "@id": "dc:description", + }, + ], + }, + ) + test_db.session.add(record) + test_db.session.commit() + return record + + return _sample_record + + @pytest.fixture def linguisticobject(): def _generator(name, id): diff --git a/source/web-service/tests/test_routes_records.py b/source/web-service/tests/test_routes_records.py index 9b9b35db..229733b0 100644 --- a/source/web-service/tests/test_routes_records.py +++ b/source/web-service/tests/test_routes_records.py @@ -148,6 +148,26 @@ def test_prefix_record_ids_top( assert "LOD Gateway" in response.headers["Server"] assert json.loads(response.data) == data + def test_prefix_record_w_rdf_prefixes( + self, sample_rdfrecord_with_context, client, namespace, current_app + ): + 
current_app.config["PREFIX_RECORD_IDS"] = "RECURSIVE" + + response = client.get( + f"/{namespace}/{sample_rdfrecord_with_context['record'].entity_id}" + ) + + assert response.status_code == 200 + + data = response.get_json() + print(data) + + # make sure the main relative id has been prefixed + assert not data["@id"].startswith("rdfsample1") + + # make sure that the RDF prefixed IDs have been left alone. + assert data["rdfs:seeAlso"][0]["@id"] == "dc:description" + def test_prefix_record_ids_none( self, sample_data_with_ids, client, namespace, current_app ): From d4e807c54983acaf2e36976e8aa90da30ba55bad Mon Sep 17 00:00:00 2001 From: "bosteen@getty.edu" Date: Thu, 19 Sep 2024 14:41:16 -0700 Subject: [PATCH 2/9] Adding a step to the idprefixer to get the RDF prefixes if @context --- .../web-service/flaskapp/base_graph_utils.py | 16 +++++++++++ source/web-service/flaskapp/routes/records.py | 5 ++++ .../flaskapp/storage_utilities/graph.py | 27 ++++++++++++++----- source/web-service/flaskapp/utilities.py | 12 ++++++--- 4 files changed, 50 insertions(+), 10 deletions(-) diff --git a/source/web-service/flaskapp/base_graph_utils.py b/source/web-service/flaskapp/base_graph_utils.py index 9beb32f7..58038568 100644 --- a/source/web-service/flaskapp/base_graph_utils.py +++ b/source/web-service/flaskapp/base_graph_utils.py @@ -131,3 +131,19 @@ def base_graph_filter(basegraphobj, fqdn_id): "Failed to access record table - has the initial flask db upgrade been run?" ) return set() + + +def get_url_prefixes_from_context(context_json): + # Get the list of mapped prefixes (eg 'rdfs') from the context + # TODO - investigate caching this function as a later feature PR + # (only a handful of contexts are in use and the response will be the same.) 
+ proc = jsonld.JsonLdProcessor() + options = { + "isFrame": False, + "keepFreeFloatingNodes": False, + "documentLoader": current_app.config["RDF_DOCLOADER"], + "extractAllScripts": False, + "processingMode": "json-ld-1.1", + } + mappings = proc.process_context({"mappings": {}}, context_json, options)["mappings"] + return {x for x in mappings if mappings[x]["_prefix"] == True} diff --git a/source/web-service/flaskapp/routes/records.py b/source/web-service/flaskapp/routes/records.py index f8c48e49..4d20c2b4 100644 --- a/source/web-service/flaskapp/routes/records.py +++ b/source/web-service/flaskapp/routes/records.py @@ -38,6 +38,7 @@ status_ok, ) from flaskapp.utilities import checksum_json, authenticate_bearer +from flaskapp.base_graph_utils import get_url_prefixes_from_context import time @@ -434,6 +435,9 @@ def entity_record(entity_id): # Assume that id/@id choice used in the data is the same as the top level attr = "@id" if "@id" in data else "id" + urlprefixes = None + if "@context" in data: + urlprefixes = get_url_prefixes_from_context(data["@context"]) data = containerRecursiveCallback( data=data, @@ -441,6 +445,7 @@ def entity_record(entity_id): callback=idPrefixer, prefix=idPrefix, recursive=recursive, + urlprefixes=urlprefixes, ) current_app.logger.debug( diff --git a/source/web-service/flaskapp/storage_utilities/graph.py b/source/web-service/flaskapp/storage_utilities/graph.py index 40445b80..b260dd85 100644 --- a/source/web-service/flaskapp/storage_utilities/graph.py +++ b/source/web-service/flaskapp/storage_utilities/graph.py @@ -1,27 +1,24 @@ import requests import json import time -import re -from flask import current_app, request, abort, jsonify +from flask import current_app from flaskapp.storage_utilities.record import get_record from flaskapp.utilities import ( - Event, containerRecursiveCallback, idPrefixer, full_stack_trace, - is_quads, quads_to_triples, graph_filter, ) -import rdflib +import traceback from pyld import jsonld from pyld.jsonld 
import JsonLdError -from flaskapp.base_graph_utils import base_graph_filter +from flaskapp.base_graph_utils import base_graph_filter, get_url_prefixes_from_context from flaskapp.graph_prefix_bindings import get_bound_graph @@ -39,9 +36,18 @@ def __init__( def inflate_relative_uris(data, id_attr="id"): idPrefix = current_app.config["RDFidPrefix"] + urlprefixes = [] + + if "@context" in data: + # Get any context-added url prefixes: + urlprefixes = get_url_prefixes_from_context(data["@context"]) return containerRecursiveCallback( - data=data, attr=id_attr, callback=idPrefixer, prefix=idPrefix + data=data, + attr=id_attr, + callback=idPrefixer, + prefix=idPrefix, + urlprefixes=urlprefixes, ) @@ -85,6 +91,8 @@ def graph_expand(data, proc=None): tictoc = time.perf_counter() # PyLD expansion? or RDFLIB? + + current_app.logger.debug(f"Starting data to expand: '{data}'") if current_app.config["USE_PYLD_REFORMAT"] is True: current_app.logger.info(f"{json_ld_id} - expanding using PyLD") try: @@ -181,6 +189,11 @@ def graph_replace(graph_name, serialized_nt, update_endpoint): current_app.logger.debug( f"Filtering base triples ({len(current_app.config['RDF_FILTER_SET'])}) from graph n-triples" ) + current_app.logger.debug( + f"Incoming triples to filter: (" + + str(serialized_nt.split("\n")) + + ") from graph n-triples" + ) serialized_nt = graph_filter( serialized_nt, current_app.config["RDF_FILTER_SET"] ) diff --git a/source/web-service/flaskapp/utilities.py b/source/web-service/flaskapp/utilities.py index 40db74a1..05d0d400 100644 --- a/source/web-service/flaskapp/utilities.py +++ b/source/web-service/flaskapp/utilities.py @@ -123,6 +123,7 @@ def containerRecursiveCallback( find=None, replace=None, prefix=None, + urlprefixes=None, suffix=None, callback=None, recursive=True, @@ -172,6 +173,7 @@ def generalModify(key, value, find=None, replace=None, prefix=None, suffix=None) prefix=prefix, suffix=suffix, callback=callback, + urlprefixes=urlprefixes, ) else: if (attr == None or 
attr == key) and isinstance(val, str): @@ -182,6 +184,7 @@ def generalModify(key, value, find=None, replace=None, prefix=None, suffix=None) replace=replace, prefix=prefix, suffix=suffix, + urlprefixes=urlprefixes, ) data[key] = val @@ -199,6 +202,7 @@ def generalModify(key, value, find=None, replace=None, prefix=None, suffix=None) prefix=prefix, suffix=suffix, callback=callback, + urlprefixes=urlprefixes, ) else: if (attr == None or attr == key) and isinstance(val, str): @@ -209,6 +213,7 @@ def generalModify(key, value, find=None, replace=None, prefix=None, suffix=None) replace=replace, prefix=prefix, suffix=suffix, + urlprefixes=urlprefixes, ) data[key] = val @@ -216,11 +221,12 @@ def generalModify(key, value, find=None, replace=None, prefix=None, suffix=None) return data -def idPrefixer(attr, value, prefix=None, **kwargs): +def idPrefixer(attr, value, prefix=None, urlprefixes=None, **kwargs): """Helper callback method to prefix non-prefixed JSON-LD document 'id' attributes""" - + if urlprefixes is None: + urlprefixes = set() # prefix any relative uri with the prefix - if value.split(":")[0] not in ALLOWED_SCHEMES and prefix: + if value.split(":")[0] not in (ALLOWED_SCHEMES.union(urlprefixes)) and prefix: return prefix + "/" + value return value From 3b0aab1c5ca4c7ae4d91f12eb444d572fd3abe34 Mon Sep 17 00:00:00 2001 From: "bosteen@getty.edu" Date: Thu, 19 Sep 2024 14:48:33 -0700 Subject: [PATCH 3/9] Adding fix for test --- source/web-service/tests/test_routes_records.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/source/web-service/tests/test_routes_records.py b/source/web-service/tests/test_routes_records.py index 229733b0..c03d7d75 100644 --- a/source/web-service/tests/test_routes_records.py +++ b/source/web-service/tests/test_routes_records.py @@ -153,9 +153,7 @@ def test_prefix_record_w_rdf_prefixes( ): current_app.config["PREFIX_RECORD_IDS"] = "RECURSIVE" - response = client.get( - 
f"/{namespace}/{sample_rdfrecord_with_context['record'].entity_id}" - ) + response = client.get(f"/{namespace}/{sample_rdfrecord_with_context.entity_id}") assert response.status_code == 200 From 3c1f1b95b130edc3c19eacb0a4f801ac6535da38 Mon Sep 17 00:00:00 2001 From: "bosteen@getty.edu" Date: Thu, 19 Sep 2024 14:52:56 -0700 Subject: [PATCH 4/9] Test tweak --- source/web-service/tests/test_routes_records.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/source/web-service/tests/test_routes_records.py b/source/web-service/tests/test_routes_records.py index c03d7d75..102979b0 100644 --- a/source/web-service/tests/test_routes_records.py +++ b/source/web-service/tests/test_routes_records.py @@ -149,11 +149,11 @@ def test_prefix_record_ids_top( assert json.loads(response.data) == data def test_prefix_record_w_rdf_prefixes( - self, sample_rdfrecord_with_context, client, namespace, current_app + self, sample_idprefixdata, client, namespace, current_app ): current_app.config["PREFIX_RECORD_IDS"] = "RECURSIVE" - response = client.get(f"/{namespace}/{sample_rdfrecord_with_context.entity_id}") + response = client.get(f"/{namespace}/{sample_idprefixdata.entity_id}") assert response.status_code == 200 From 57b75bf3b8a302ef2a9040ffd4ac152b3fe5d320 Mon Sep 17 00:00:00 2001 From: "bosteen@getty.edu" Date: Thu, 19 Sep 2024 14:59:00 -0700 Subject: [PATCH 5/9] Forgot to commit conftest.py --- source/web-service/tests/conftest.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/source/web-service/tests/conftest.py b/source/web-service/tests/conftest.py index c41c9c84..09cbf23a 100644 --- a/source/web-service/tests/conftest.py +++ b/source/web-service/tests/conftest.py @@ -180,6 +180,11 @@ def _sample_record(): return _sample_record +@pytest.fixture +def sample_idprefixdata(sample_rdfrecord_with_context): + return sample_rdfrecord_with_context() + + @pytest.fixture def sample_rdfrecord_with_context(test_db): def _sample_record(): From 
eae5387251c3d53319e505e4bf5e323bf132d77e Mon Sep 17 00:00:00 2001 From: "bosteen@getty.edu" Date: Thu, 19 Sep 2024 15:17:50 -0700 Subject: [PATCH 6/9] Adding prefix caching with a TTL --- source/web-service/flaskapp/__init__.py | 12 +++++++ .../web-service/flaskapp/base_graph_utils.py | 36 +++++++++++++++---- 2 files changed, 42 insertions(+), 6 deletions(-) diff --git a/source/web-service/flaskapp/__init__.py b/source/web-service/flaskapp/__init__.py index fa632b07..b75f3f0d 100644 --- a/source/web-service/flaskapp/__init__.py +++ b/source/web-service/flaskapp/__init__.py @@ -92,6 +92,18 @@ def create_app(): # RDFidPrefix should not end with a / app.config["RDFidPrefix"][:-1] + # How long should the idprefixer keep cached lists of RDF prefixes from resolving + # contexts (default 12 hours) + app.config["CONTEXTPREFIX_TTL"] = 60 * 60 * 12 + try: + app.config["CONTEXTPREFIX_TTL"] = int( + environ.get("CONTEXTPREFIX_TTL", app.config["CONTEXTPREFIX_TTL"]) + ) + except ValueError: + app.logger.error( + f"The value in the 'CONTEXTPREFIX_TTL' environment key was not an integer duration of seconds. 
Setting to {app.config['CONTEXTPREFIX_TTL']}" + ) + app.config["SQLALCHEMY_DATABASE_URI"] = environ["DATABASE"] app.config["SQLALCHEMY_TRACK_MODIFICATIONS"] = False diff --git a/source/web-service/flaskapp/base_graph_utils.py b/source/web-service/flaskapp/base_graph_utils.py index 58038568..067d8883 100644 --- a/source/web-service/flaskapp/base_graph_utils.py +++ b/source/web-service/flaskapp/base_graph_utils.py @@ -1,11 +1,10 @@ -from flaskapp.models import db -from flaskapp.models.record import Record +from datetime import datetime +from functools import wraps from flask import current_app # To parse out the base graph from pyld import jsonld -from pyld.jsonld import set_document_loader # docloader caching import requests @@ -13,7 +12,7 @@ from sqlalchemy.exc import ProgrammingError -from flaskapp.utilities import is_quads, quads_to_triples +from flaskapp.utilities import quads_to_triples, checksum_json from flaskapp.storage_utilities.record import get_record, record_create """ @@ -133,10 +132,35 @@ def base_graph_filter(basegraphobj, fqdn_id): return set() +# Not limiting the context cache, as there are typically only a few in play +# in most services. Even a hundred would be fine. 
+_PREFIX_CACHE = {} + + +def cache_context_prefixes(f): + @wraps(f) + def wrapper(*args, **kwargs): + if data := args[0]: + strhash = checksum_json(data) + dnow = datetime.timestamp(datetime.now()) + gap = dnow - current_app.config["CONTEXTPREFIX_TTL"] + # If this context has not been resolved yet, or + # if it has but the results are too old (older than the gap), + # regenerate the result + if ( + strhash not in _PREFIX_CACHE + or _PREFIX_CACHE.get(strhash, [None, 0])[1] < gap + ): + _PREFIX_CACHE[strhash] = (f(*args, **kwargs), dnow) + + return _PREFIX_CACHE[strhash][0] + return {} + + return wrapper + + def get_url_prefixes_from_context(context_json): # Get the list of mapped prefixes (eg 'rdfs') from the context - # TODO - investigate caching this function as a later feature PR - # (only a handful of contexts are in use and the response will be the same.) proc = jsonld.JsonLdProcessor() options = { "isFrame": False, From 66e44f2078827f74f3ee86a23ffc7980cebeb89a Mon Sep 17 00:00:00 2001 From: "bosteen@getty.edu" Date: Thu, 19 Sep 2024 15:29:13 -0700 Subject: [PATCH 7/9] Adding more to the test --- source/web-service/tests/conftest.py | 1 + source/web-service/tests/test_routes_records.py | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/source/web-service/tests/conftest.py b/source/web-service/tests/conftest.py index 09cbf23a..e0f47c99 100644 --- a/source/web-service/tests/conftest.py +++ b/source/web-service/tests/conftest.py @@ -198,6 +198,7 @@ def _sample_record(): "dc": "http://purl.org/dc/elements/1.1/", "rdfs": "http://www.w3.org/2000/01/rdf-schema#", "_label": {"@id": "http://www.w3.org/2000/01/rdf-schema#label"}, + "_prefixedlabel": {"@id": "rdfs:label"}, }, "@id": "rdfsample1", "rdfs:seeAlso": [ diff --git a/source/web-service/tests/test_routes_records.py b/source/web-service/tests/test_routes_records.py index 102979b0..5d5017af 100644 --- a/source/web-service/tests/test_routes_records.py +++ 
b/source/web-service/tests/test_routes_records.py @@ -158,13 +158,13 @@ def test_prefix_record_w_rdf_prefixes( assert response.status_code == 200 data = response.get_json() - print(data) # make sure the main relative id has been prefixed assert not data["@id"].startswith("rdfsample1") # make sure that the RDF prefixed IDs have been left alone. assert data["rdfs:seeAlso"][0]["@id"] == "dc:description" + assert data["@context"]["_prefixedlabel"]["@id"] == "rdfs:label" def test_prefix_record_ids_none( self, sample_data_with_ids, client, namespace, current_app From 8039c1df4424ea10bd459c82ba5e91eed5b28569 Mon Sep 17 00:00:00 2001 From: "bosteen@getty.edu" Date: Thu, 19 Sep 2024 15:40:43 -0700 Subject: [PATCH 8/9] Adding a small note about the id prefixing --- .env.example | 7 +++++++ README.md | 2 ++ 2 files changed, 9 insertions(+) diff --git a/.env.example b/.env.example index 4006461b..6fb4557e 100644 --- a/.env.example +++ b/.env.example @@ -89,6 +89,13 @@ SUBADDRESSING=True # Can use GET param '?relativeid=true' to skip prefixing on request PREFIX_RECORD_IDS=RECURSIVE +# Id prefixing will not prefix any RDF shorthand URIs (eg 'rdfs:label') in the `id`/`@id` property +# It does this by resolving the context, if present, and getting the list of prefixes to ignore +# NB this only affects prefixes used with a colon (eg 'rdf:type' will be skipped if 'rdf' is in the context but +# 'rdf/type' will be prefixed by the server's host and subpath as normal.) 
+# The prefixes for a given context will be cached for a number of seconds (default 12 hours as below): +CONTEXTPREFIX_TTL=43200 + # RDF Context Cache ## LOD Gateway runs a cache for context documents, defaulting to 30 minutes in cache per context RDF_CONTEXT_CACHE_EXPIRES=30 diff --git a/README.md b/README.md index ff668254..8a9166a8 100644 --- a/README.md +++ b/README.md @@ -102,6 +102,8 @@ When ingesting records into the LOD Gateway, any top-level `"id"` properties in For example, when ingesting the record for Vincent van Gogh's _Irises_ (1889) into an LOD Gateway instance deployed at `https://data.getty.edu/museum/collection`, the `"id"` property MUST have an `"id"` value with a relative URI of `"object/c88b3df0-de91-4f5b-a9ef-7b2b9a6d8abb"` resulting in the Gateway serving the record via the absolute URI of `https://data.getty.edu/museum/collection/object/c88b3df0-de91-4f5b-a9ef-7b2b9a6d8abb`. The Gateway will also insert the URL prefix into the `"id"` values before returning the response, converting any relative URIs in the document to absolute URIs that can be resolved by downstream systems. +If a JSON-LD context is present, id prefixing will not affect any valid RDF-prefixed `"id"` values. The system processes the context to find the list of prefixes it defines (e.g. `rdf`, `rdfs`, `crm`) and leaves any `id` values that use those prefixes unchanged. 
+ The following code sample illustrates ingesting a record into an LOD Gateway instance, including how to supply the `Authorization` header, how to prepare the line-delimited `POST` body containing one or more serialized JSON/JSON-LD strings, and how if desired to submit multiple records as part of a single request: ``` From 33857e19fe7af93415da5e140f8c68791db0c818 Mon Sep 17 00:00:00 2001 From: "bosteen@getty.edu" Date: Thu, 19 Sep 2024 15:45:30 -0700 Subject: [PATCH 9/9] Turning on the caching system --- source/web-service/flaskapp/base_graph_utils.py | 1 + 1 file changed, 1 insertion(+) diff --git a/source/web-service/flaskapp/base_graph_utils.py b/source/web-service/flaskapp/base_graph_utils.py index 067d8883..f4d7d870 100644 --- a/source/web-service/flaskapp/base_graph_utils.py +++ b/source/web-service/flaskapp/base_graph_utils.py @@ -159,6 +159,7 @@ def wrapper(*args, **kwargs): return wrapper +@cache_context_prefixes def get_url_prefixes_from_context(context_json): # Get the list of mapped prefixes (eg 'rdfs') from the context proc = jsonld.JsonLdProcessor()