From f101d6666c2b4adeaf6beb53a357402739edb95a Mon Sep 17 00:00:00 2001 From: Yaroslav Halchenko Date: Mon, 4 Nov 2024 09:35:14 -0500 Subject: [PATCH 1/5] RF: replace use of "identifiers" with "alternateIdentifiers" MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Situation with "identifiers" is messy. We relied on it but it was not in datacite schema, but was allowed by API: https://support.datacite.org/docs/what-is-the-identifiers-attribute-in-the-rest-api > When creating or updating DOI alternateIdentifier metadata, the REST API accepts values in either the alternateIdentifiers or identifiers attributes. Including metadata in either attribute will populate the identifiers and alternateIdentifiers attributes in the REST API response and the alternateIdentifiers property in DataCite XML. And in jsonschema serialization of 4.5 "identifiers" was removed, see more in https://github.com/inveniosoftware/datacite/issues/81#issuecomment-1921727400 and therein. But I guess currently used 4.3 from datacite (not inveniosoftware) is still requiring identifiers, and hence this commit/solution is incomplete since it does fail validation (see below). "identifiers" was removed from required only in 4.5 from inveniosoftware. 
❯ python -m pytest -s -v dandischema/tests/test_datacite.py ============================================================= test session starts ============================================================== platform linux -- Python 3.12.6, pytest-8.3.3, pluggy-1.5.0 -- /home/yoh/proj/dandi/dandischema/venv/3/bin/python cachedir: .pytest_cache rootdir: /home/yoh/proj/dandi/dandischema configfile: tox.ini plugins: rerunfailures-14.0, cov-6.0.0 collected 14 items dandischema/tests/test_datacite.py::test_datacite[000004] FAILED dandischema/tests/test_datacite.py::test_datacite[000008] FAILED dandischema/tests/test_datacite.py::test_dandimeta_datacite[additional_meta0-datacite_checks0] FAILED dandischema/tests/test_datacite.py::test_dandimeta_datacite[additional_meta1-datacite_checks1] FAILED dandischema/tests/test_datacite.py::test_dandimeta_datacite[additional_meta2-datacite_checks2] FAILED dandischema/tests/test_datacite.py::test_dandimeta_datacite[additional_meta3-datacite_checks3] FAILED dandischema/tests/test_datacite.py::test_dandimeta_datacite[additional_meta4-datacite_checks4] FAILED dandischema/tests/test_datacite.py::test_dandimeta_datacite[additional_meta5-datacite_checks5] FAILED dandischema/tests/test_datacite.py::test_dandimeta_datacite[additional_meta6-datacite_checks6] FAILED dandischema/tests/test_datacite.py::test_datacite_publish PASSED dandischema/tests/test_datacite.py::test_datacite_related_res_url[related_res_url0-related_ident_exp0] PASSED dandischema/tests/test_datacite.py::test_datacite_related_res_url[related_res_url1-related_ident_exp1] PASSED dandischema/tests/test_datacite.py::test_datacite_related_res_url[related_res_url2-related_ident_exp2] PASSED dandischema/tests/test_datacite.py::test_datacite_related_res_url[related_res_url3-related_ident_exp3] PASSED =================================================================== FAILURES =================================================================== 
____________________________________________________________ test_datacite[000004] _____________________________________________________________ dandischema/tests/test_datacite.py:160: in test_datacite datacite = to_datacite(meta=meta, validate=True) dandischema/datacite.py:238: in to_datacite validate_datacite(datacite_dict) dandischema/datacite.py:258: in validate_datacite validator.validate(datacite_dict["data"]["attributes"]) venv/3/lib/python3.12/site-packages/jsonschema/validators.py:451: in validate raise error E jsonschema.exceptions.ValidationError: 'identifiers' is a required property E E Failed validating 'required' in schema: E {'$schema': 'http://json-schema.org/draft-07/schema#', E 'definitions': {'nameType': {'type': 'string', E 'enum': ['Organizational', 'Personal']}, E 'nameIdentifiers': {'type': 'array', E 'items': {'type': 'object', E 'properties': {'nameIdentifier': {'type': 'string'}, E 'nameIdentifierScheme': {'type': 'string'}, E 'schemeURI': {'type': 'string', E 'format': 'uri'}}, E 'required': ['nameIdentifier', E 'nameIdentifierScheme']}, --- dandischema/datacite/__init__.py | 15 +++++++++------ dandischema/datacite/tests/test_datacite.py | 14 +++++++------- 2 files changed, 16 insertions(+), 13 deletions(-) diff --git a/dandischema/datacite/__init__.py b/dandischema/datacite/__init__.py index 8a8040d..98365f5 100644 --- a/dandischema/datacite/__init__.py +++ b/dandischema/datacite/__init__.py @@ -78,16 +78,19 @@ def to_datacite( if publish: attributes["event"] = "publish" - attributes["identifiers"] = [ + attributes["alternateIdentifiers"] = [ # TODO: the first element is ignored, not sure how to fix it... 
- {"identifier": f"https://doi.org/{meta.doi}", "identifierType": "DOI"}, { - "identifier": f"https://identifiers.org/{meta.id}", - "identifierType": "URL", + "alternateIdentifier": f"https://doi.org/{meta.doi}", + "alternateIdentifierType": "DOI", }, { - "identifier": str(meta.url), - "identifierType": "URL", + "alternateIdentifier": f"https://identifiers.org/{meta.id}", + "alternateIdentifierType": "URL", + }, + { + "alternateIdentifier": str(meta.url), + "alternateIdentifierType": "URL", }, ] diff --git a/dandischema/datacite/tests/test_datacite.py b/dandischema/datacite/tests/test_datacite.py index f1d129a..d3ab158 100644 --- a/dandischema/datacite/tests/test_datacite.py +++ b/dandischema/datacite/tests/test_datacite.py @@ -441,24 +441,24 @@ def test_datacite_publish(metadata_basic: Dict[str, Any]) -> None: {"description": "testing", "descriptionType": "Abstract"} ], "doi": f"10.80507/dandi.{dandi_id_noprefix}/{version}", - "identifiers": [ + "alternateIdentifiers": [ { - "identifier": ( + "alternateIdentifier": ( f"https://doi.org/10.80507" f"/dandi.{dandi_id_noprefix}/{version}" ), - "identifierType": "DOI", + "alternateIdentifierType": "DOI", }, { - "identifier": f"https://identifiers.org/{dandi_id}/{version}", - "identifierType": "URL", + "alternateIdentifier": f"https://identifiers.org/{dandi_id}/{version}", + "alternateIdentifierType": "URL", }, { - "identifier": ( + "alternateIdentifier": ( f"https://dandiarchive.org/dandiset" f"/{dandi_id_noprefix}/{version}" ), - "identifierType": "URL", + "alternateIdentifierType": "URL", }, ], "publicationYear": "1970", From 2095d3e04a90ac88afa5353f97b3ffe7b9c5f677 Mon Sep 17 00:00:00 2001 From: Yaroslav Halchenko Date: Mon, 4 Nov 2024 10:11:38 -0500 Subject: [PATCH 2/5] RF: "upgrade" to datacite v4.5 jsonschema serialization, now from inveniosoftware Done in hope to see "non-standard" identifiers being gone but immediate fail is ___ test_dandimeta_datacite[additional_meta6-datacite_checks6] _ 
dandischema/tests/test_datacite.py:407: in test_dandimeta_datacite validator.validate(datacite["data"]["attributes"]) venv/3/lib/python3.12/site-packages/jsonschema/validators.py:451: in validate raise error E jsonschema.exceptions.ValidationError: 'DANDI Archive' is not of type 'object' E E Failed validating 'type' in schema['properties']['publisher']: E {'type': 'object', E 'additionalProperties': False, E 'properties': {'name': {'type': 'string'}, E 'publisherIdentifier': {'type': 'string'}, E 'publisherIdentifierScheme': {'type': 'string'}, E 'schemeUri': {'type': 'string', 'format': 'uri'}, E 'lang': {'type': 'string'}}, E 'required': ['name']} E E On instance['publisher']: E 'DANDI Archive' So we need to standardize "publisher" better --- dandischema/datacite/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dandischema/datacite/__init__.py b/dandischema/datacite/__init__.py index 98365f5..238a5d0 100644 --- a/dandischema/datacite/__init__.py +++ b/dandischema/datacite/__init__.py @@ -255,7 +255,7 @@ def to_datacite( @lru_cache() -def _get_datacite_schema(version_id: str = "datacite-4.3-17-gaa5db56") -> Any: +def _get_datacite_schema(version_id: str = "inveniosoftware-4.5-81-g160250d") -> Any: """Load datacite schema based on the version id provided.""" schema_folder = Path(__file__).parent / "schema" return json.loads((schema_folder / f"{version_id}.json").read_text()) From dc1916835a61aef8db84ae7957337167fed63df4 Mon Sep 17 00:00:00 2001 From: Yaroslav Halchenko Date: Mon, 4 Nov 2024 18:44:09 -0500 Subject: [PATCH 3/5] Elaborate "publisher" into expected record, and schemeURI -> schemeUri --- dandischema/datacite/__init__.py | 12 +++++++--- dandischema/datacite/tests/test_datacite.py | 25 ++++++++++++++++----- 2 files changed, 29 insertions(+), 8 deletions(-) diff --git a/dandischema/datacite/__init__.py b/dandischema/datacite/__init__.py index 238a5d0..96e17c2 100644 --- a/dandischema/datacite/__init__.py +++ 
b/dandischema/datacite/__init__.py @@ -101,7 +101,13 @@ def to_datacite( attributes["descriptions"] = [ {"description": meta.description, "descriptionType": "Abstract"} ] - attributes["publisher"] = "DANDI Archive" + attributes["publisher"] = { + "name": "DANDI Archive", + "schemeUri": "https://scicrunch.org/resolver/", + "publisherIdentifier": "https://scicrunch.org/resolver/RRID:SCR_017571", + "publisherIdentifierScheme": "RRID", + "lang": "en", + } attributes["publicationYear"] = str(meta.datePublished.year) # not sure about it dandi-api had "resourceTypeGeneral": "NWB" attributes["types"] = { @@ -113,7 +119,7 @@ def to_datacite( # assuming that all licenses are from SPDX? attributes["rightsList"] = [ { - "schemeURI": "https://spdx.org/licenses/", + "schemeUri": "https://spdx.org/licenses/", "rightsIdentifierScheme": "SPDX", "rightsIdentifier": el.name, } @@ -150,7 +156,7 @@ def to_datacite( contr_dict: Dict[str, Any] = { "name": contr_el.name, "contributorName": contr_el.name, - "schemeURI": "orcid.org", + "schemeUri": "orcid.org", } if isinstance(contr_el, Person): contr_dict["nameType"] = "Personal" diff --git a/dandischema/datacite/tests/test_datacite.py b/dandischema/datacite/tests/test_datacite.py index d3ab158..43d7480 100644 --- a/dandischema/datacite/tests/test_datacite.py +++ b/dandischema/datacite/tests/test_datacite.py @@ -150,7 +150,16 @@ def test_datacite(dandi_id: str, schema: Any) -> None: 1, {"description": "testing", "descriptionType": "Abstract"}, ), - "publisher": (None, "DANDI Archive"), + "publisher": ( + None, + { + "name": "DANDI Archive", + "publisherIdentifier": "https://scicrunch.org/resolver/RRID:SCR_017571", + "publisherIdentifierScheme": "RRID", + "schemeUri": "https://scicrunch.org/resolver/", + "lang": "en", + }, + ), "rightsList": ( 1, {"rightsIdentifierScheme": "SPDX", "rightsIdentifier": "CC_BY_40"}, @@ -423,7 +432,7 @@ def test_datacite_publish(metadata_basic: Dict[str, Any]) -> None: "givenName": "A_first", "name": "A_last, 
A_first", "nameType": "Personal", - "schemeURI": "orcid.org", + "schemeUri": "orcid.org", } ], "creators": [ @@ -434,7 +443,7 @@ def test_datacite_publish(metadata_basic: Dict[str, Any]) -> None: "givenName": "A_first", "name": "A_last, A_first", "nameType": "Personal", - "schemeURI": "orcid.org", + "schemeUri": "orcid.org", } ], "descriptions": [ @@ -462,12 +471,18 @@ def test_datacite_publish(metadata_basic: Dict[str, Any]) -> None: }, ], "publicationYear": "1970", - "publisher": "DANDI Archive", + "publisher": { + "name": "DANDI Archive", + "publisherIdentifier": "https://scicrunch.org/resolver/RRID:SCR_017571", + "publisherIdentifierScheme": "RRID", + "schemeUri": "https://scicrunch.org/resolver/", + "lang": "en", + }, "rightsList": [ { "rightsIdentifier": "CC_BY_40", "rightsIdentifierScheme": "SPDX", - "schemeURI": "https://spdx.org/licenses/", + "schemeUri": "https://spdx.org/licenses/", } ], "schemaVersion": "http://datacite.org/schema/kernel-4", From ba9df7f99eb48cc15f695e3bf0689b70ba59f55b Mon Sep 17 00:00:00 2001 From: Yaroslav Halchenko Date: Wed, 20 Nov 2024 12:49:25 -0500 Subject: [PATCH 4/5] fix: nameIdentifiers should also be URLs This makes it all consistent with funderIdentifier, alternateIdentifier and maybe others. 
rightsIdentifier was found to be different (there is rightsURI, no schemeUri) --- dandischema/datacite/__init__.py | 2 +- dandischema/datacite/tests/test_datacite.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/dandischema/datacite/__init__.py b/dandischema/datacite/__init__.py index 96e17c2..93e05be 100644 --- a/dandischema/datacite/__init__.py +++ b/dandischema/datacite/__init__.py @@ -172,7 +172,7 @@ def to_datacite( contr_dict["affiliation"] = [] if getattr(contr_el, "identifier"): orcid_dict = { - "nameIdentifier": contr_el.identifier, + "nameIdentifier": f"https://orcid.org/{contr_el.identifier}", "nameIdentifierScheme": "ORCID", "schemeUri": "https://orcid.org/", } diff --git a/dandischema/datacite/tests/test_datacite.py b/dandischema/datacite/tests/test_datacite.py index 43d7480..862b238 100644 --- a/dandischema/datacite/tests/test_datacite.py +++ b/dandischema/datacite/tests/test_datacite.py @@ -311,7 +311,7 @@ def test_datacite(dandi_id: str, schema: Any) -> None: "name": "A_last, A_first", "nameIdentifiers": [ { - "nameIdentifier": "0000-0001-0000-0000", + "nameIdentifier": "https://orcid.org/0000-0001-0000-0000", "nameIdentifierScheme": "ORCID", "schemeUri": "https://orcid.org/", } @@ -325,7 +325,7 @@ def test_datacite(dandi_id: str, schema: Any) -> None: "contributorType": "Other", "nameIdentifiers": [ { - "nameIdentifier": "0000-0001-0000-0000", + "nameIdentifier": "https://orcid.org/0000-0001-0000-0000", "nameIdentifierScheme": "ORCID", "schemeUri": "https://orcid.org/", } From c48079cd6e566c8de5117cf1ac0c6210eb2a8716 Mon Sep 17 00:00:00 2001 From: Yaroslav Halchenko Date: Wed, 20 Nov 2024 13:11:02 -0500 Subject: [PATCH 5/5] fix: remove the "DOI" from alternateIdentifiers It kinda historically dragged through time from the original 1056afb49fd945afc471e200d782bbff9de43cf1 in dandi-cli where it was added to "identifiers". 
As the wise @tmorell has mentioned, since it is the datacite which is to provide DOI, it would ignore DOI "alternateIdentifiers". I think it makes sense overall, although there could potentially be multiple DOIs for a single dandiset -- nothing in DOI principle forbids it. But since original purpose here is not clear -- we better just strip it away since it should be the DOI minted by datacite fabrica as the one for the PublishedDandiset --- dandischema/datacite/__init__.py | 5 ----- dandischema/datacite/tests/test_datacite.py | 7 ------- 2 files changed, 12 deletions(-) diff --git a/dandischema/datacite/__init__.py b/dandischema/datacite/__init__.py index 93e05be..9475c43 100644 --- a/dandischema/datacite/__init__.py +++ b/dandischema/datacite/__init__.py @@ -79,11 +79,6 @@ def to_datacite( attributes["event"] = "publish" attributes["alternateIdentifiers"] = [ - # TODO: the first element is ignored, not sure how to fix it... - { - "alternateIdentifier": f"https://doi.org/{meta.doi}", - "alternateIdentifierType": "DOI", - }, { "alternateIdentifier": f"https://identifiers.org/{meta.id}", "alternateIdentifierType": "URL", diff --git a/dandischema/datacite/tests/test_datacite.py b/dandischema/datacite/tests/test_datacite.py index 862b238..c9e0d72 100644 --- a/dandischema/datacite/tests/test_datacite.py +++ b/dandischema/datacite/tests/test_datacite.py @@ -451,13 +451,6 @@ def test_datacite_publish(metadata_basic: Dict[str, Any]) -> None: ], "doi": f"10.80507/dandi.{dandi_id_noprefix}/{version}", "alternateIdentifiers": [ - { - "alternateIdentifier": ( - f"https://doi.org/10.80507" - f"/dandi.{dandi_id_noprefix}/{version}" - ), - "alternateIdentifierType": "DOI", - }, { "alternateIdentifier": f"https://identifiers.org/{dandi_id}/{version}", "alternateIdentifierType": "URL",