Skip to content

Commit

Permalink
Merge pull request #212 from khaeru/enh/read-pandas
Browse files Browse the repository at this point in the history
Convert pandas.DataFrame as if it were SDMX-CSV
  • Loading branch information
khaeru authored Dec 16, 2024
2 parents 4a44cbe + 2587663 commit 41a2379
Show file tree
Hide file tree
Showing 19 changed files with 499 additions and 153 deletions.
3 changes: 3 additions & 0 deletions doc/api/reader.rst
Original file line number Diff line number Diff line change
Expand Up @@ -84,5 +84,8 @@ Reader API
.. automodule:: sdmx.reader
:members:

.. autoclass:: sdmx.reader.base.Converter
:members:

.. autoclass:: sdmx.reader.base.BaseReader
:members:
6 changes: 6 additions & 0 deletions doc/whatsnew.rst
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,12 @@ What's new?
Next release
============

- Add :func:`.to_sdmx` and :class:`.DataFrameConverter` to allow converting :class:`.pandas.DataFrame` as if it were SDMX-CSV (:pull:`212`).

- See also :class:`.Converter`, :data:`.CONVERTER`, :func:`.get_converter` for opportunities to extend this generic capability.
- Add :func:`.get_reader`; deprecate :func:`.detect_content_reader`, :func:`.get_reader_for_media_type`, :func:`.get_reader_for_path`.
- Add :meth:`.BaseReader.handles` and :attr:`.binary_content_startswith`; deprecate :meth:`~.BaseReader.detect`, :meth:`~.BaseReader.supports_suffix`, :meth:`~.BaseReader.handles_media_type`.

- Improve tolerance of invalid references in SDMX-ML (:pull:`207`; thanks :gh-user:`nicolas-graves` for :issue:`205`).
Where a file gives a reference for a :attr:`.Component.concept_identity` (such as for a :class:`.Dimension` or :class:`.PrimaryMeasure`) that is invalid—that is, the specified :class:`.Concept` does not exist in the referenced :class:`.ConceptScheme`—log on level :data:`logging.WARNING` and discard the reference.
Previously such invalid references caused a :class:`KeyError`.
Expand Down
3 changes: 2 additions & 1 deletion sdmx/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@

from sdmx.client import Client, Request, read_url
from sdmx.format.xml.common import install_schemas, validate_xml
from sdmx.reader import read_sdmx
from sdmx.reader import read_sdmx, to_sdmx
from sdmx.rest import Resource
from sdmx.source import add_source, list_sources
from sdmx.writer import to_csv, to_pandas, to_xml
Expand All @@ -21,6 +21,7 @@
"to_csv",
"to_pandas",
"to_xml",
"to_sdmx",
"validate_xml",
]

Expand Down
10 changes: 5 additions & 5 deletions sdmx/client.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@

from sdmx.model import common
from sdmx.model.v21 import DataStructureDefinition
from sdmx.reader import get_reader_for_media_type
from sdmx.reader import get_reader
from sdmx.rest import Resource
from sdmx.session import ResponseIO, Session
from sdmx.source import NoSource, list_sources, sources
Expand Down Expand Up @@ -481,19 +481,19 @@ def get(
)

# Select reader class
content_type = response.headers.get("content-type", None)
try:
Reader = get_reader_for_media_type(content_type)
Reader = get_reader(response)
except ValueError:
raise ValueError(
f"can't determine a reader for response content type {content_type!r}"
"can't determine a reader for response content type "
+ repr(response.headers.get("content-type", None))
) from None

# Instantiate reader
reader = Reader()

# Parse the message, using any provided or auto-queried DSD
msg = reader.read_message(response_content, structure=kwargs.get("dsd", None))
msg = reader.convert(response_content, structure=kwargs.get("dsd", None))

# Store the HTTP response with the message
msg.response = response
Expand Down
236 changes: 155 additions & 81 deletions sdmx/reader/__init__.py
Original file line number Diff line number Diff line change
@@ -1,126 +1,200 @@
from pathlib import Path
from typing import TYPE_CHECKING, Any, Optional, Union
from warnings import warn

from . import csv, json, xml

#: Reader classes
READERS = [csv.Reader, json.Reader, xml.Reader]
if TYPE_CHECKING:
import io
from typing import TypeVar

import sdmx.message
import sdmx.reader.base

def _readers():
return ", ".join(map(lambda cls: cls.__name__, READERS))
T = TypeVar("T", bound=sdmx.reader.base.Converter)


def detect_content_reader(content):
"""Return a reader class for `content`.
#: All converters. Application code **may** extend this collection with custom
#: sub-classes of :class:`.Converter`.
CONVERTER = [csv.DataFrameConverter, csv.Reader, json.Reader, xml.Reader]

The :meth:`.BaseReader.detect` method for each class in :data:`READERS` is called;
if a reader signals that it is compatible with `content`, then that class is
returned.
#: Only Readers for standard SDMX formats.
READERS = [csv.Reader, json.Reader, xml.Reader]

Raises
------
ValueError
If no reader class matches.

def detect_content_reader(content) -> type["sdmx.reader.base.BaseReader"]:
"""Return a reader class for :class:`bytes` `content`.
.. deprecated:: 2.20.0
Use :func:`get_reader` instead.
"""
for cls in READERS:
if cls.detect(content):
return cls
warn(
"detect_content_reader(bytes); use get_reader() instead",
DeprecationWarning,
stacklevel=2,
)
return get_reader(content)

raise ValueError(f"{repr(content)} not recognized by any of {_readers()}")

def _get(data: Any, kwargs: Optional[dict], _classes: list[type["T"]]) -> type["T"]:
for c in _classes:
if c.handles(data, kwargs or {}):
return c

def get_reader_for_media_type(value):
"""Return a reader class for HTTP content/media type `value`.
raise ValueError(
f"{data!r} not recognized by any of "
+ ", ".join(map(lambda c: c.__name__, _classes))
)

Raises
------
ValueError
If no reader class matches.

See also
--------
BaseReader.media_type
"""
for cls in READERS:
if cls.handles_media_type(value):
return cls
def get_converter(
data: Any, kwargs: Optional[dict] = None
) -> type["sdmx.reader.base.Converter"]:
"""Identify a :class:`Converter` or :class:`.Reader` for `data`.
raise ValueError(f"Media type {value!r} not supported by any of {_readers()}")
For each class in :data:`CONVERTER`, the :meth:`.Converter.handles` or
:meth:`.BaseReader.handles` method is called with `data` and `kwargs`.
`data` may include:
def get_reader_for_path(path):
"""Return a reader class for file `path`.
- :class:`bytes` —same behaviour as deprecated :func:`.detect_content_reader`.
- :class:`requests.Response` —same behaviour as deprecated
:func:`.get_reader_for_media_type`.
- :class:`pathlib.Path` —same behaviour as deprecated :func:`.get_reader_for_path`.
…or, anything else that is handled by a class listed in :data:`CONVERTER`.
Raises
------
ValueError
If no reader class matches.
if none of the Converter classes can convert `data` and `kwargs` to SDMX.
"""
return _get(data, kwargs, CONVERTER)

See also
--------
BaseReader.suffixes

def get_reader(
data: Any,
kwargs: Optional[dict] = None,
_classes: list[type["sdmx.reader.base.BaseReader"]] = READERS,
) -> type["sdmx.reader.base.BaseReader"]:
"""Identify a :class:`.Reader` for `data`.
Identical to :func:`.get_converter`, except only :data:`READERS` for SDMX standard
formats are returned.
"""
suffix = Path(path).suffix
for cls in READERS:
if cls.supports_suffix(suffix):
return cls
return _get(data, kwargs, READERS)

raise ValueError(f"File suffix {repr(suffix)} not supported by any of {_readers()}")

def get_reader_for_media_type(value) -> type["sdmx.reader.base.BaseReader"]:
"""Return a reader class for HTTP content/media type `value`.
def read_sdmx(filename_or_obj, format=None, **kwargs):
"""Load a SDMX-ML or SDMX-JSON message from a file or file-like object.
.. deprecated:: 2.20.0
Use :func:`get_reader` instead.
"""
from requests import Response

Parameters
----------
filename_or_obj : str or :class:`~os.PathLike` or file
format : 'XML' or 'JSON', optional
warn(
"get_reader_for_media_type(str); use get_reader(requests.Response) instead",
DeprecationWarning,
stacklevel=2,
)

Other Parameters
----------------
dsd : :class:`DataStructureDefinition <.BaseDataStructureDefinition>`
For “structure-specific” `format`=``XML`` messages only.
# Use `value` as Content-Type header for an otherwise-empty Response
resp = Response()
resp.headers["content-type"] = value

try:
return get_reader(resp)
except ValueError as e:
*_, names = e.args[0].partition(" any of ")
raise ValueError(f"Media type {value!r} not supported by any of {names}")


def get_reader_for_path(path) -> type["sdmx.reader.base.BaseReader"]:
"""Return a reader class for file `path`.
.. deprecated:: 2.20.0
Use :func:`get_reader` instead.
"""
reader = None
warn(
"get_reader_for_path(…); use get_reader() instead",
DeprecationWarning,
stacklevel=2,
)

p = Path(path)
try:
path = Path(filename_or_obj)
return get_reader(p)
except ValueError as e:
*_, names = e.args[0].partition(" any of ")
raise ValueError(f"File suffix {p.suffix!r} not supported by any of {names}")

# Open the file
obj = open(path, "rb")
except TypeError:
# Not path-like → opened file
path = None
obj = filename_or_obj

if path:
try:
# Use the file extension to guess the reader
reader = get_reader_for_path(filename_or_obj)
except ValueError:
pass
def read_sdmx(
filename_or_obj: Union[bytes, str, Path, "io.IOBase", "io.BufferedReader"],
format: Optional[str] = None,
**kwargs,
) -> "sdmx.message.Message":
"""Read a :class:`.Message` from a path, file, or stream in an SDMX standard format.
if not reader:
try:
reader = get_reader_for_path(Path(f"dummy.{format.lower()}"))
except (AttributeError, ValueError):
pass
To identify whether `filename_or_obj` contains SDMX-CSV, SDMX-JSON, or SDMX-ML,
:meth:`.BaseReader.handles` is called.
if not reader:
# Read a line and then return the cursor to the initial position
pos = obj.tell()
first_line = obj.readline().strip()
obj.seek(pos)
Parameters
----------
filename_or_obj :
may include:
- :class:`str` or :class:`pathlib.Path`: path to a particular file.
- :class:`bytes`: raw/binary SDMX content.
- :class:`io.IOBase`: a buffer, opened file, or other I/O object containing
binary SDMX content.
format : 'CSV', 'XML', or 'JSON', optional
force handling `filename_or_obj` as if it had the given extension, even if
:meth:`~.BaseReader.handles` fails to match.
Other Parameters
----------------
structure :
:class:`.Structure`, :class:`.StructureUsage`, or other information used by a
:class:`.BaseReader` to interpret the content of `filename_or_obj`. For example,
the :class:`DataStructureDefinition <.BaseDataStructureDefinition>` for a
structure-specific SDMX-ML message.
"""
if isinstance(filename_or_obj, (str, Path)):
path = Path(filename_or_obj) # Ensure Path type
obj: Union[bytes, "io.IOBase"] = open(path, "rb") # Open the file
else:
path, obj = None, filename_or_obj

# Try to identify a reader by first the path, then by the `obj` content, and finally by the `format` hint
for candidate in path, obj, Path(f"_.{(format or 'MISSING').lower()}"):
try:
reader = detect_content_reader(first_line)
reader = get_reader(candidate, kwargs)
except ValueError:
pass
reader = None
else:
break

if not reader:
raise RuntimeError(
f"cannot infer SDMX message format from path {repr(path)}, "
f"format={format}, or content '{first_line[:5].decode()}..'"
f"cannot infer SDMX message format from path {path!r}, format "
f"hint={format}, or content"
)

return reader().read_message(obj, **kwargs)
return reader().convert(obj, **kwargs)


def to_sdmx(data, **kwargs) -> "sdmx.message.Message":
    """Convert `data` in non-SDMX formats and data structures to SDMX :class:`.Message`.

    Unlike :func:`.read_sdmx`, which handles only the standard SDMX formats SDMX-CSV,
    SDMX-JSON, and SDMX-ML, this function will process any Python data structure
    handled by a known :data:`CONVERTER`.

    Parameters
    ----------
    data :
        Object to convert; passed to :func:`get_converter` to select a class.
    kwargs :
        Passed to :func:`get_converter` as a single :class:`dict` and, as keyword
        arguments, to the :meth:`convert` method of the selected class.

    Raises
    ------
    NotImplementedError
        if :func:`get_converter` raises :class:`ValueError` for `data` and `kwargs`.
    """
    try:
        converter = get_converter(data, kwargs)
    except ValueError:
        # Deliberately replace the lookup failure; do not show it as a second,
        # unrelated error "during handling" of the first.
        raise NotImplementedError(
            f"Convert {type(data)} {data!r} to SDMX"
        ) from None

    return converter().convert(data, **kwargs)
Loading

0 comments on commit 41a2379

Please sign in to comment.