Merge pull request #25 from NowanIlfideme/feature/inherit-configs
Feature: Inherit configs
NowanIlfideme authored May 11, 2023
2 parents f35316d + bfc4e1b commit b3e7774
Showing 7 changed files with 235 additions and 22 deletions.
38 changes: 31 additions & 7 deletions docs/arbitrary_types.md
@@ -126,15 +126,39 @@ Internally, this uses the `ParquetDataSet` to save the dataframe as an
as well as reference it from within the JSON file. That means that, unlike
Pickle, the file isn't "fragile" and will be readable with future versions.
## Known Issues
## Config Inheritance
1. Currently, the `Config` is not magically inherited by subclasses.
That means that you should explicitly inherit `YourType.Config` from `YourType`'s
base class if you want to override it. It also means that the `kedro_map`
isn't merged for subclasses; you'll need to do this explicitly for now.
2. Only the top-level model's `Config` is taken into account when serializing
[Similarly to Pydantic](https://docs.pydantic.dev/latest/usage/model_config/#change-behaviour-globally),
the `Config` class has a sort of pseudo-inheritance.
That is, if you define your classes like this:
```python
class A(BaseModel):
    class Config:
        arbitrary_types_allowed = True
        kedro_map = {Foo: FooDataSet}

class B(A):
    class Config:
        kedro_map = {Bar: BarDataSet}

class C(B):
    class Config:
        kedro_map = {Foo: foo_ds_maker}
        kedro_default = DefaultDataSet
```
Then class `B` will act as if `kedro_map = {Foo: FooDataSet, Bar: BarDataSet}`,
while class `C` will act as if `kedro_map = {Foo: foo_ds_maker, Bar: BarDataSet}`
and `kedro_default = DefaultDataSet`.
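Equivalently, as a quick sketch (assuming `Foo`, `Bar`, `FooDataSet`, `BarDataSet`, `foo_ds_maker`,
and `DefaultDataSet` are defined as above; `get_kedro_map` and `get_kedro_default` are the internal
helpers this PR adds in `pydantic_kedro._internals`):

```python
from pydantic_kedro._internals import get_kedro_default, get_kedro_map

assert get_kedro_map(B) == {Foo: FooDataSet, Bar: BarDataSet}
assert get_kedro_map(C) == {Foo: foo_ds_maker, Bar: BarDataSet}
assert get_kedro_default(C) is DefaultDataSet
```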
## Considerations
1. Only the top-level model's `Config` is taken into account when serializing
to a Kedro dataset, ignoring any children's configs.
This means that all values of a particular type are serialized the same way (see the sketch after this list).
3. `pydantic` V2 is not supported yet, but V2
2. `pydantic` V2 is not supported yet, but V2
[has a different configuration method](https://docs.pydantic.dev/blog/pydantic-v2-alpha/#changes-to-config).
`pydantic-kedro` might change the configuration method entirely to be more compliant.
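For example, here is a minimal sketch of consideration 1 (the class names are hypothetical):

```python
import pandas as pd
from kedro.extras.datasets.pandas import CSVDataSet, ParquetDataSet
from pydantic import BaseModel


class Child(BaseModel):
    df: pd.DataFrame

    class Config:
        arbitrary_types_allowed = True
        kedro_map = {pd.DataFrame: CSVDataSet}  # ignored when saving via Parent


class Parent(BaseModel):
    child: Child

    class Config:
        kedro_map = {pd.DataFrame: ParquetDataSet}  # applied to all dataframes, even child.df
```

When `Parent` is saved, `child.df` is written as Parquet, not CSV, because only `Parent.Config` is consulted.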
80 changes: 80 additions & 0 deletions src/pydantic_kedro/_internals.py
@@ -0,0 +1,80 @@
"""Functions for internal use."""

from typing import Callable, Dict, Type

from kedro.extras.datasets.pickle import PickleDataSet
from kedro.io.core import AbstractDataSet
from pydantic import BaseModel


def get_kedro_map(kls: Type[BaseModel]) -> Dict[Type, Callable[[str], AbstractDataSet]]:
"""Get type-to-dataset mapper for a Pydantic class."""
if not (isinstance(kls, type) and issubclass(kls, BaseModel)):
raise TypeError(f"Must pass a BaseModel subclass; got {kls!r}")
kedro_map: Dict[Type, Callable[[str], AbstractDataSet]] = {}
# Go through bases of `kls` in order
base_classes = reversed(kls.mro())
for base_i in base_classes:
# Get config class (if it's defined)
cfg_i = getattr(base_i, "__config__", None)
if cfg_i is None:
continue
# Get kedro_map (if it's defined)
upd = getattr(cfg_i, "kedro_map", None)
if upd is None:
continue
elif isinstance(upd, dict):
# Detailed checks (to help users fix stuff)
bad_keys = []
bad_vals = []
for k, v in upd.items():
if isinstance(k, type):
if callable(v):
kedro_map[k] = v # TODO: Check callable signature?
else:
bad_vals.append(v)
else:
bad_keys.append(k)
if len(bad_keys) > 0:
raise TypeError(f"Keys in `kedro_map` must be types, but got bad keys: {bad_keys}")
if len(bad_vals) > 0:
raise TypeError(
"Values in `kedro_map` must be callable (or types),"
f" but got bad values: {bad_vals}"
)
else:
raise TypeError(
f"The `kedro_map` in config class {base_i.__qualname__} must be a dict, but got {upd!r}"
)
return kedro_map


def get_kedro_default(kls: Type[BaseModel]) -> Callable[[str], AbstractDataSet]:
    """Get default Kedro dataset creator."""
    # Go through the MRO of `kls` (most specific class first) until a default value is found
    rev_bases = kls.mro()
    for base_i in rev_bases:
        # Get config class (if defined)
        cfg_i = getattr(base_i, "__config__", None)
        if cfg_i is None:
            continue
        # Get kedro_default (if it's defined)
        default = getattr(cfg_i, "kedro_default", None)
        if default is None:
            continue
        elif callable(default):
            # Special check for types
            if isinstance(default, type) and not issubclass(default, AbstractDataSet):
                raise TypeError(
                    "The `kedro_default` must be an AbstractDataSet or callable that creates one,"
                    f" but got {default!r}"
                )
            # TODO: Check callable signature?
            return default
        else:
            raise TypeError(
                "The `kedro_default` must be an AbstractDataSet or callable that creates one,"
                f" but got {default!r}"
            )

    return PickleDataSet
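A quick behavioral sketch of these helpers (assuming pydantic v1, which this package targets):

```python
from kedro.extras.datasets.pickle import PickleDataSet
from pydantic import BaseModel

from pydantic_kedro._internals import get_kedro_default, get_kedro_map


class Plain(BaseModel):
    """Model with no pydantic-kedro settings anywhere in its MRO."""

    x: int


assert get_kedro_map(Plain) == {}  # no `kedro_map` found on any base's config
assert get_kedro_default(Plain) is PickleDataSet  # falls back to pickle
```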
24 changes: 14 additions & 10 deletions src/pydantic_kedro/datasets/folder.py
@@ -13,21 +13,29 @@
from fsspec import AbstractFileSystem
from fsspec.core import strip_protocol
from fsspec.implementations.local import LocalFileSystem
from kedro.extras.datasets.pickle import PickleDataSet
from kedro.io.core import AbstractDataSet, parse_dataset_definition
from pydantic import BaseConfig, BaseModel, Extra, Field
from pydantic.utils import import_string

# __all__ = ["PydanticFolderDataSet"]
from pydantic_kedro._internals import get_kedro_default, get_kedro_map

__all__ = ["PydanticFolderDataSet"]


DATA_PLACEHOLDER = "__DATA_PLACEHOLDER__"

JsonPath = str # not a "real" JSON Path, but just `.`-separated
_Bis = Union[bool, int, str, Path, None]

logger = logging.getLogger(__name__)

# Some ridiculous types to support nested configurations
_Bis = Union[bool, int, str, Path, None]
_Dis1 = Dict[str, _Bis]
_Dis2 = Dict[str, Union[_Bis, _Dis1]]
_Dis3 = Dict[str, Union[_Bis, _Dis1, _Dis2]]
_Dis4 = Dict[str, Union[_Bis, _Dis1, _Dis2, _Dis3]]
# basically, Dict[str, Union[_Bis, Dict[str, Union[_Bis, Dict[str, _Bis]]]]], but better :)


class KedroDataSetSpec(BaseModel):
"""Kedro dataset specification. This allows arbitrary extra fields, including versions.
Expand All @@ -38,7 +46,7 @@ class KedroDataSetSpec(BaseModel):

    type_: str = Field(alias="type")
    relative_path: str
    args: Dict[str, Union[_Bis, Dict[str, Dict[str, _Bis]]]] = {}
    args: _Dis4 = {}

    class Config(BaseConfig):
        """Internal Pydantic model configuration."""
@@ -252,12 +260,8 @@ def _save_local(self, data: BaseModel, filepath: str) -> None:

        # These are used to make datasets for various types
        # See the `kls.Config` class - this is inherited
        kedro_map: Dict[Type, Callable[[str], AbstractDataSet]] = getattr(
            kls.__config__, "kedro_map", {}
        )
        kedro_default: Callable[[str], AbstractDataSet] = getattr(
            kls.__config__, "kedro_default", PickleDataSet
        )
        kedro_map: Dict[Type, Callable[[str], AbstractDataSet]] = get_kedro_map(kls)
        kedro_default: Callable[[str], AbstractDataSet] = get_kedro_default(kls)

        def make_ds_for(obj: Any, path: str) -> AbstractDataSet:
            for k, v in kedro_map.items():
8 changes: 5 additions & 3 deletions src/pydantic_kedro/models.py
@@ -22,10 +22,12 @@ class ArbModel(BaseModel):
    This also supports type hints for `pydantic_kedro` in the configuration:
    - `kedro_map`, which maps a type to a dataset constructor to use.
    - `kedro_default`, which specifies the default dataset type to use.
    - `kedro_default`, which specifies the default dataset type to use
      ([kedro.extras.datasets.pickle.PickleDataSet][])
    These are NOT currently inherited (TODO).
    The default dataset type that's used is [kedro.extras.datasets.pickle.PickleDataSet][]
    These are pseudo-inherited; see [config-inheritance][].
    You do not actually need to inherit from `ArbModel` for this to work; however, it can help with
    type completion in your IDE.
    """

    Config = ArbConfig
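A minimal usage sketch (`MyModel` and its field are hypothetical):

```python
import pandas as pd
from kedro.extras.datasets.pandas import ParquetDataSet

from pydantic_kedro.models import ArbModel


class MyModel(ArbModel):
    """Example model; dataframe fields will be saved via Parquet."""

    df: pd.DataFrame

    class Config:
        arbitrary_types_allowed = True
        kedro_map = {pd.DataFrame: ParquetDataSet}
```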
3 changes: 2 additions & 1 deletion src/pydantic_kedro/utils.py
@@ -49,7 +49,8 @@ def save_model(
    uri : str
        The path or URI to save the model to.
    format : {"auto", "zip", "folder", "yaml", "json"}
        The dataset format to use. "auto" will use [PydanticAutoDataSet][].
        The dataset format to use.
        "auto" will use [PydanticAutoDataSet][pydantic_kedro.PydanticAutoDataSet].
    """
    if not isinstance(model, BaseModel):
        raise TypeError(f"Expected Pydantic model, but got {model!r}")
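A quick usage sketch of `save_model` (assuming it is exported from the package root; the local
path is illustrative, and any fsspec-compatible URI should also work):

```python
from pydantic import BaseModel

from pydantic_kedro import save_model


class Foo(BaseModel):
    x: int = 1


save_model(Foo(), "/tmp/foo_model", format="auto")
```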
2 changes: 1 addition & 1 deletion src/test/test_ds_pandas.py
@@ -13,7 +13,7 @@
    PydanticZipDataSet,
)

Kls = Union[PydanticFolderDataSet, PydanticZipDataSet]
Kls = Union[PydanticAutoDataSet, PydanticFolderDataSet, PydanticZipDataSet]

dfx = pd.DataFrame([[1, 2, 3]], columns=["a", "b", "c"])

102 changes: 102 additions & 0 deletions src/test/test_inheritance.py
@@ -0,0 +1,102 @@
"""Tests for proper inheritence of classes."""

from pathlib import Path
from typing import Type, Union

import pandas as pd
import pytest
from kedro.extras.datasets.pandas import CSVDataSet, ParquetDataSet
from kedro.extras.datasets.pickle import PickleDataSet
from pydantic import BaseModel

from pydantic_kedro import PydanticFolderDataSet

dfx = pd.DataFrame([[1, 2, 3]], columns=["a", "b", "c"])


class BaseA(BaseModel):
"""First model in hierarchy, using Parquet for Pandas."""

class Config:
"""Config for pydantic-kedro."""

arbitrary_types_allowed = True
kedro_map = {pd.DataFrame: ParquetDataSet}


class Model1A(BaseA):
"""Model with Parquet dataset base."""

df: pd.DataFrame


def csv_ds(path: str) -> CSVDataSet:
"""Create a CSV dataset."""
return CSVDataSet(path, save_args=dict(index=False), load_args=dict())


class BaseB(BaseA):
"""Second model in hierarchy, using CSV for Pandas."""

class Config:
"""Config for pydantic-kedro."""

kedro_map = {pd.DataFrame: csv_ds}


class Model1B(BaseB):
"""Model with CSV dataset base."""

df: pd.DataFrame


class BaseC(BaseB):
"""Third model in hierarchy, not providing any kedro_map."""


class Model1C(BaseC):
"""Model with CSV dataset base (again)."""

df: pd.DataFrame


class Fake:
"""Fake class."""


class BaseD(BaseC):
"""Fourth model in hierarchy, providing updated kedro_map (for Fake) and updated default.
However, since we pseudo-inherit `{pd.DataFrame: csv_ds}` mapping from BaseB,
"""

class Config:
"""Config for pydantic-kedro."""

kedro_map = {Fake: PickleDataSet}
kedro_default = ParquetDataSet # Bad idea in practice, but this is for the test


class Model1D(BaseD):
"""Model with CSV dataset base, even though we changed other config parts."""

df: pd.DataFrame


@pytest.mark.parametrize(
["model_type", "ds_type"],
[[Model1A, ParquetDataSet], [Model1B, CSVDataSet], [Model1C, CSVDataSet], [Model1D, CSVDataSet]],
)
def test_pandas_flat_model(
tmpdir,
model_type: Type[Union[Model1A, Model1B, Model1C, Model1D]],
ds_type: Type[Union[ParquetDataSet, CSVDataSet]],
):
"""Test roundtripping of the different dataset models."""
# Create and save model
model = model_type(df=dfx)
path = Path(f"{tmpdir}/model_on_disk")
PydanticFolderDataSet(str(path)).save(model)
# Try loading with the supposed dataframe type
found_df = ds_type(str(path / ".df")).load()
assert isinstance(found_df, pd.DataFrame)
