From 124ccb379201e21fd952cd505bfbc82dc316903c Mon Sep 17 00:00:00 2001 From: Paul Natsuo Kishimoto Date: Tue, 5 Nov 2024 12:03:07 +0100 Subject: [PATCH 1/7] Add specimen for #205 --- sdmx/testing/__init__.py | 1 + 1 file changed, 1 insertion(+) diff --git a/sdmx/testing/__init__.py b/sdmx/testing/__init__.py index 4008456fc..737bba73b 100644 --- a/sdmx/testing/__init__.py +++ b/sdmx/testing/__init__.py @@ -291,6 +291,7 @@ def __init__(self, base_path): ("IMF_STA", "DSD_GFS.xml"), # khaeru/sdmx#164 ("INSEE", "CNA-2010-CONSO-SI-A17-structure.xml"), ("INSEE", "dataflow.xml"), + ("INSEE", "gh-205.xml"), ("INSEE", "IPI-2010-A21-structure.xml"), ("ISTAT", "22_289-structure.xml"), ("ISTAT", "47_850-structure.xml"), From 5b9a0c08f6791cddf9230c537ee78187d68b3dba Mon Sep 17 00:00:00 2001 From: Paul Natsuo Kishimoto Date: Tue, 5 Nov 2024 12:04:55 +0100 Subject: [PATCH 2/7] Handle reference to non-existent concept identity --- sdmx/reader/xml/v21.py | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/sdmx/reader/xml/v21.py b/sdmx/reader/xml/v21.py index 026d7a1b9..7be9ba986 100644 --- a/sdmx/reader/xml/v21.py +++ b/sdmx/reader/xml/v21.py @@ -606,12 +606,24 @@ def _component_end(reader: Reader, elem): # noqa: C901 args = dict( id=elem.attrib.get("id", common.MissingID), - concept_identity=reader.pop_resolved_ref("ConceptIdentity"), local_representation=reader.pop_single(common.Representation), ) if position := elem.attrib.get("position"): args["order"] = int(position) + # Resolve a ConceptIdentity reference + ci_ref = reader.pop_single("ConceptIdentity") + try: + args["concept_identity"] = reader.resolve(ci_ref) + except KeyError: + message = ( + f"Could not resolve {cls.__name__}.concept_identity reference to {ci_ref!s}" + ) + log.error(message) + args.setdefault("annotations", []).append( + common.Annotation(id=f"{__name__}-parse-error", text=message) + ) + # DataAttributeOnly if us := elem.attrib.get("assignmentStatus"): args["usage_status"] = 
model.UsageStatus[us.lower()] From 8d61d5b49e7607c59f8ad67b9a71b2b97e04fc5c Mon Sep 17 00:00:00 2001 From: Paul Natsuo Kishimoto Date: Tue, 5 Nov 2024 12:05:13 +0100 Subject: [PATCH 3/7] Test log messages and annotations for #205 --- sdmx/tests/reader/test_reader_xml_v21.py | 25 ++++++++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/sdmx/tests/reader/test_reader_xml_v21.py b/sdmx/tests/reader/test_reader_xml_v21.py index c69d4296f..0ad1afa06 100644 --- a/sdmx/tests/reader/test_reader_xml_v21.py +++ b/sdmx/tests/reader/test_reader_xml_v21.py @@ -236,6 +236,31 @@ def test_gh_199(): sdmx.read_sdmx(f2, structure=dsd2) +def test_gh_205(caplog, specimen) -> None: + """Test of https://github.com/khaeru/sdmx/issues/205.""" + with specimen("INSEE/gh-205.xml") as f: + msg = sdmx.read_sdmx(f) + + # Messages were logged + msg_template = "Could not resolve {cls}.concept_identity reference to ConceptScheme=FR1:CONCEPTS_INSEE(1.0) → Concept={id}" + m1 = msg_template.format(cls="TimeDimension", id="TIME_PERIOD") + m2 = msg_template.format(cls="PrimaryMeasure", id="OBS_VALUE") + assert m1 in caplog.messages + assert m2 in caplog.messages + + # Access the parsed DSD + dsd = msg.structure["CNA-2014-PIB"] + + # Components have annotations with expected ID and text + for component, text in ( + (dsd.dimensions.get("TIME_PERIOD"), m1), + (dsd.measures.get("OBS_VALUE"), m2), + ): + a = component.annotations[0] + assert "sdmx.reader.xml.v21-parse-error" == a.id + assert text == str(a.text) + + # Each entry is a tuple with 2 elements: # 1. an instance of lxml.etree.Element to be parsed. # 2. 
Either: From 0fac341a4153011e6d52f7401a8a1d7dee2f59f1 Mon Sep 17 00:00:00 2001 From: Paul Natsuo Kishimoto Date: Tue, 5 Nov 2024 12:06:17 +0100 Subject: [PATCH 4/7] Add "DataStructure" to .model.common.PACKAGE --- sdmx/model/common.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/sdmx/model/common.py b/sdmx/model/common.py index 79a527277..96b879707 100644 --- a/sdmx/model/common.py +++ b/sdmx/model/common.py @@ -2601,9 +2601,10 @@ class BaseContentConstraint: }, "conceptscheme": {"Concept", "ConceptScheme"}, "datastructure": { - "DataflowDefinition", # SDMX 2.1 "Dataflow", # SDMX 3.0 - "DataStructureDefinition", + "DataflowDefinition", # SDMX 2.1 + "DataStructure", # SDMX 3.0 + "DataStructureDefinition", # SDMX 2.1 "StructureUsage", }, "mapping": {"CodelistMap", "StructureSet"}, From daed7934c808a515b5c59c6652f4d6ed6c8899d3 Mon Sep 17 00:00:00 2001 From: Paul Natsuo Kishimoto Date: Tue, 5 Nov 2024 12:30:14 +0100 Subject: [PATCH 5/7] Test #78/#79 with a specimen/without network --- sdmx/testing/__init__.py | 1 + sdmx/tests/reader/test_reader_xml_v21.py | 18 ++++++++++++++++++ sdmx/tests/test_sources.py | 9 --------- 3 files changed, 19 insertions(+), 9 deletions(-) diff --git a/sdmx/testing/__init__.py b/sdmx/testing/__init__.py index 737bba73b..24fb5a5dc 100644 --- a/sdmx/testing/__init__.py +++ b/sdmx/testing/__init__.py @@ -310,6 +310,7 @@ def __init__(self, base_path): ("SPC", "metadatastructure-0.xml"), ("TEST", "gh-142.xml"), ("TEST", "gh-149.xml"), + ("WB", "gh-78.xml"), ] ) diff --git a/sdmx/tests/reader/test_reader_xml_v21.py b/sdmx/tests/reader/test_reader_xml_v21.py index 0ad1afa06..f0a8dd6b1 100644 --- a/sdmx/tests/reader/test_reader_xml_v21.py +++ b/sdmx/tests/reader/test_reader_xml_v21.py @@ -67,6 +67,24 @@ def test_read_ss_xml(specimen): assert len(TIME_FORMAT.related_to.dimensions) == 5 +def test_gh_078(specimen): + """Test of https://github.com/khaeru/sdmx/issues/78. 
+ + This required adding support for :xml:`<mes:Department>` and :xml:`<mes:Role>` to + :mod:`.reader.xml`. + """ + # Message can be read + with specimen("WB/gh-78.xml") as f: + msg = sdmx.read_sdmx(f) + + # Sender attributes are present and have the expected values + for attr, text in ( + ("org_unit", "DECDG"), + ("responsibility", "Support"), + ): + assert text == getattr(msg.header.sender.contact[0], attr).localizations["en"] + + def test_gh_104(caplog, specimen): """Test of https://github.com/khaeru/sdmx/issues/104. diff --git a/sdmx/tests/test_sources.py b/sdmx/tests/test_sources.py index ef8298465..ccb94d95a 100644 --- a/sdmx/tests/test_sources.py +++ b/sdmx/tests/test_sources.py @@ -672,15 +672,6 @@ class TestWB(DataSourceTest): "structureset": NotImplementedError, # 501 } - @pytest.mark.network - def test_gh_78(self, client): - """Test of https://github.com/khaeru/sdmx/78. - - This response required adding support for ``<mes:Department>`` and - ``<mes:Role>`` to :mod:`.reader.xml`. - """ - client.data("DF_WITS_Tariff_TRAINS", key=".840.000.020110.reported") - class TestWB_WDI(DataSourceTest): source_id = "WB_WDI" From 1ea33641459efe7f093262eb4aa97f9f80f1e075 Mon Sep 17 00:00:00 2001 From: Paul Natsuo Kishimoto Date: Tue, 5 Nov 2024 12:35:09 +0100 Subject: [PATCH 6/7] Use https:// base URL for WB REST data source Parallel to change for WB_WDI in #192. 
--- sdmx/sources.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sdmx/sources.json b/sdmx/sources.json index e20304ec7..8b1854545 100644 --- a/sdmx/sources.json +++ b/sdmx/sources.json @@ -415,7 +415,7 @@ { "id": "WB", "name": "World Bank World Integrated Trade Solution", - "url": "http://wits.worldbank.org/API/V1/SDMX/V21/rest", + "url": "https://wits.worldbank.org/API/V1/SDMX/V21/rest", "supports": { "actualconstraint": false, "agencyscheme": false, From 87893deb3b7be3c7ca45f6b4835b6085b8337357 Mon Sep 17 00:00:00 2001 From: Paul Natsuo Kishimoto Date: Tue, 12 Nov 2024 15:08:59 +0100 Subject: [PATCH 7/7] Add #207, #205 to doc/whatsnew --- doc/sources.rst | 4 ++++ doc/whatsnew.rst | 14 +++++++++++--- 2 files changed, 15 insertions(+), 3 deletions(-) diff --git a/doc/sources.rst b/doc/sources.rst index 6a98f64d0..3775cba15 100644 --- a/doc/sources.rst +++ b/doc/sources.rst @@ -274,6 +274,10 @@ Website `(en) `__, `(fr) `__ - French name: Institut national de la statistique et des études économiques. +- Known issue(s) with this data source: + + - :issue:`205`: as of 2024-11-12 some structures, for instance ``urn:sdmx:…DataStructure=FR1:CNA-2014-PIB(1.0)``, include :attr:`~.Component.concept_identity` references that do not exist, for instance ``urn:sdmx:…Concept=FR1:CONCEPTS_INSEE(1.0).TIME_PERIOD`` and ``urn:sdmx:…Concept=FR1:CONCEPTS_INSEE(1.0).OBS_VALUE``. + From :ref:`v2.20.0 <2.20.0>`, :mod:`.reader.xml.v21` discards such invalid references, leaving :py:`.concept_identity = None`. .. autoclass:: sdmx.source.insee.Source() :members: diff --git a/doc/whatsnew.rst b/doc/whatsnew.rst index 578ad97b2..5db8e6996 100644 --- a/doc/whatsnew.rst +++ b/doc/whatsnew.rst @@ -3,8 +3,16 @@ What's new? *********** -.. Next release -.. ============ +.. _2.20.0: + +Next release +============ + +- Improve tolerance of invalid references in SDMX-ML (:pull:`207`; thanks :gh-user:`nicolas-graves` for :issue:`205`). 
+ Where a file gives a reference for a :attr:`.Component.concept_identity` (such as for a :class:`.Dimension` or :class:`.PrimaryMeasure`) that is invalid—that is, the specified :class:`.Concept` does not exist in the referenced :class:`.ConceptScheme`—log on level :data:`logging.ERROR` and discard the reference. + Previously such invalid references caused a :class:`KeyError`. + Prompted by an example in :ref:`INSEE <INSEE>`. +- Update the base URL of the :ref:`WB <WB>` source to use HTTPS instead of plain HTTP (:pull:`207`). v2.19.1 (2024-10-23) ==================== @@ -36,7 +44,7 @@ v2.17.0 (2024-09-03) - :class:`.XHTMLAttributeValue` contents are stored as :mod:`lxml.etree` nodes. - MetadataStructureDefinition is included when writing :class:`.StructureMessage`. -- Update base url for :ref:`WB_WDI` source to use HTTPS instead of plain HTTP (:issue:`191`, :pull:`192`). +- Update the base URL of the :ref:`WB_WDI <WB_WDI>` source to use HTTPS instead of plain HTTP (:issue:`191`, :pull:`192`). - Improvements to :mod:`.reader.xml` and :mod:`.reader.xml.v21` (:pull:`192`). - Correctly associate :class:`.Item` in :class:`.ItemScheme` with its parent, even if the parent is defined after the child (“forward reference”).