Merge pull request #25 from NowanIlfideme/feature/inherit-configs
Feature: Inherit configs
NowanIlfideme authored May 11, 2023
2 parents f35316d + bfc4e1b commit b3e7774
Showing 7 changed files with 235 additions and 22 deletions.
38 changes: 31 additions & 7 deletions docs/arbitrary_types.md
@@ -126,15 +126,39 @@ Internally, this uses the `ParquetDataSet` to save the dataframe as an
as well as reference it from within the JSON file. That means that, unlike
Pickle, the file isn't "fragile" and will be readable with future versions.
## Known Issues
## Config Inheritance
1. Currently, the `Config` is not magically inherited by subclasses.
That means that you should explicitly inherit `YourType.Config` from `YourType`'s
base class if you want to override it. It also means that the `kedro_map`
isn't merged for subclasses; you'll need to do this explicitly for now.
2. Only the top-level model's `Config` is taken into account when serializing
[Similarly to Pydantic](https://docs.pydantic.dev/latest/usage/model_config/#change-behaviour-globally),
the `Config` class has a sort of pseudo-inheritance.
That is, if you define your classes like this:
```python
class A(BaseModel):
    class Config:
        arbitrary_types_allowed = True
        kedro_map = {Foo: FooDataSet}

class B(A):
    class Config:
        kedro_map = {Bar: BarDataSet}

class C(B):
    class Config:
        kedro_map = {Foo: foo_ds_maker}
        kedro_default = DefaultDataSet
```
Then class `B` will act as if `kedro_map = {Foo: FooDataSet, Bar: BarDataSet}`,
while class `C` will act as if `kedro_map = {Foo: foo_ds_maker, Bar: BarDataSet}`
and `kedro_default = DefaultDataSet`.
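Equivalently, as a quick sketch (assuming `Foo`, `Bar`, `FooDataSet`, `BarDataSet`, `foo_ds_maker`,
and `DefaultDataSet` are defined as above; `get_kedro_map` and `get_kedro_default` are the internal
helpers this PR adds in `pydantic_kedro._internals`):

```python
from pydantic_kedro._internals import get_kedro_default, get_kedro_map

assert get_kedro_map(B) == {Foo: FooDataSet, Bar: BarDataSet}
assert get_kedro_map(C) == {Foo: foo_ds_maker, Bar: BarDataSet}
assert get_kedro_default(C) is DefaultDataSet
```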
## Considerations
1. Only the top-level model's `Config` is taken into account when serializing
to a Kedro dataset, ignoring any children's configs.
This means that all values of a particular type are serialized the same way (see the sketch after this list).
3. `pydantic` V2 is not supported yet, but V2
2. `pydantic` V2 is not supported yet, but V2
[has a different configuration method](https://docs.pydantic.dev/blog/pydantic-v2-alpha/#changes-to-config).
`pydantic-kedro` might change the configuration method entirely to be more compliant.
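For example, here is a minimal sketch of consideration 1 (the class names are hypothetical):

```python
import pandas as pd
from kedro.extras.datasets.pandas import CSVDataSet, ParquetDataSet
from pydantic import BaseModel


class Child(BaseModel):
    df: pd.DataFrame

    class Config:
        arbitrary_types_allowed = True
        kedro_map = {pd.DataFrame: CSVDataSet}  # ignored when saving via Parent


class Parent(BaseModel):
    child: Child

    class Config:
        kedro_map = {pd.DataFrame: ParquetDataSet}  # applied to all dataframes, even child.df
```

When `Parent` is saved, `child.df` is written as Parquet, not CSV, because only `Parent.Config` is consulted.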
80 changes: 80 additions & 0 deletions src/pydantic_kedro/_internals.py
@@ -0,0 +1,80 @@
"""Functions for internal use."""

from typing import Callable, Dict, Type

from kedro.extras.datasets.pickle import PickleDataSet
from kedro.io.core import AbstractDataSet
from pydantic import BaseModel


def get_kedro_map(kls: Type[BaseModel]) -> Dict[Type, Callable[[str], AbstractDataSet]]:
"""Get type-to-dataset mapper for a Pydantic class."""
if not (isinstance(kls, type) and issubclass(kls, BaseModel)):
raise TypeError(f"Must pass a BaseModel subclass; got {kls!r}")
kedro_map: Dict[Type, Callable[[str], AbstractDataSet]] = {}
# Go through bases of `kls` in order
base_classes = reversed(kls.mro())
for base_i in base_classes:
# Get config class (if it's defined)
cfg_i = getattr(base_i, "__config__", None)
if cfg_i is None:
continue
# Get kedro_map (if it's defined)
upd = getattr(cfg_i, "kedro_map", None)
if upd is None:
continue
elif isinstance(upd, dict):
# Detailed checks (to help users fix stuff)
bad_keys = []
bad_vals = []
for k, v in upd.items():
if isinstance(k, type):
if callable(v):
kedro_map[k] = v # TODO: Check callable signature?
else:
bad_vals.append(v)
else:
bad_keys.append(k)
if len(bad_keys) > 0:
raise TypeError(f"Keys in `kedro_map` must be types, but got bad keys: {bad_keys}")
if len(bad_vals) > 0:
raise TypeError(
"Values in `kedro_map` must be callable (or types),"
f" but got bad values: {bad_vals}"
)
else:
raise TypeError(
f"The `kedro_map` in config class {base_i.__qualname__} must be a dict, but got {upd!r}"
)
return kedro_map


def get_kedro_default(kls: Type[BaseModel]) -> Callable[[str], AbstractDataSet]:
    """Get default Kedro dataset creator."""
    # Go through the MRO of `kls` (most specific class first) until a default value is found
    rev_bases = kls.mro()
    for base_i in rev_bases:
        # Get config class (if defined)
        cfg_i = getattr(base_i, "__config__", None)
        if cfg_i is None:
            continue
        # Get kedro_default (if it's defined)
        default = getattr(cfg_i, "kedro_default", None)
        if default is None:
            continue
        elif callable(default):
            # Special check for types
            if isinstance(default, type) and not issubclass(default, AbstractDataSet):
                raise TypeError(
                    "The `kedro_default` must be an AbstractDataSet or callable that creates one,"
                    f" but got {default!r}"
                )
            # TODO: Check callable signature?
            return default
        else:
            raise TypeError(
                "The `kedro_default` must be an AbstractDataSet or callable that creates one,"
                f" but got {default!r}"
            )

    return PickleDataSet
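A quick behavioral sketch of these helpers (assuming pydantic v1, which this package targets):

```python
from kedro.extras.datasets.pickle import PickleDataSet
from pydantic import BaseModel

from pydantic_kedro._internals import get_kedro_default, get_kedro_map


class Plain(BaseModel):
    """Model with no pydantic-kedro settings anywhere in its MRO."""

    x: int


assert get_kedro_map(Plain) == {}  # no `kedro_map` found on any base's config
assert get_kedro_default(Plain) is PickleDataSet  # falls back to pickle
```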
24 changes: 14 additions & 10 deletions src/pydantic_kedro/datasets/folder.py
@@ -13,21 +13,29 @@
from fsspec import AbstractFileSystem
from fsspec.core import strip_protocol
from fsspec.implementations.local import LocalFileSystem
from kedro.extras.datasets.pickle import PickleDataSet
from kedro.io.core import AbstractDataSet, parse_dataset_definition
from pydantic import BaseConfig, BaseModel, Extra, Field
from pydantic.utils import import_string

# __all__ = ["PydanticFolderDataSet"]
from pydantic_kedro._internals import get_kedro_default, get_kedro_map

__all__ = ["PydanticFolderDataSet"]


DATA_PLACEHOLDER = "__DATA_PLACEHOLDER__"

JsonPath = str # not a "real" JSON Path, but just `.`-separated
_Bis = Union[bool, int, str, Path, None]

logger = logging.getLogger(__name__)

# Some ridiculous types to support nested configurations
_Bis = Union[bool, int, str, Path, None]
_Dis1 = Dict[str, _Bis]
_Dis2 = Dict[str, Union[_Bis, _Dis1]]
_Dis3 = Dict[str, Union[_Bis, _Dis1, _Dis2]]
_Dis4 = Dict[str, Union[_Bis, _Dis1, _Dis2, _Dis3]]
# basically, Dict[str, Union[_Bis, Dict[str, Union[_Bis, Dict[str, _Bis]]]]], but better :)


class KedroDataSetSpec(BaseModel):
"""Kedro dataset specification. This allows arbitrary extra fields, including versions.
Expand All @@ -38,7 +46,7 @@ class KedroDataSetSpec(BaseModel):

    type_: str = Field(alias="type")
    relative_path: str
    args: Dict[str, Union[_Bis, Dict[str, Dict[str, _Bis]]]] = {}
    args: _Dis4 = {}

    class Config(BaseConfig):
        """Internal Pydantic model configuration."""
@@ -252,12 +260,8 @@ def _save_local(self, data: BaseModel, filepath: str) -> None:

        # These are used to make datasets for various types
        # See the `kls.Config` class - this is inherited
        kedro_map: Dict[Type, Callable[[str], AbstractDataSet]] = getattr(
            kls.__config__, "kedro_map", {}
        )
        kedro_default: Callable[[str], AbstractDataSet] = getattr(
            kls.__config__, "kedro_default", PickleDataSet
        )
        kedro_map: Dict[Type, Callable[[str], AbstractDataSet]] = get_kedro_map(kls)
        kedro_default: Callable[[str], AbstractDataSet] = get_kedro_default(kls)

        def make_ds_for(obj: Any, path: str) -> AbstractDataSet:
            for k, v in kedro_map.items():
8 changes: 5 additions & 3 deletions src/pydantic_kedro/models.py
@@ -22,10 +22,12 @@ class ArbModel(BaseModel):
    This also supports type hints for `pydantic_kedro` in the configuration:
    - `kedro_map`, which maps a type to a dataset constructor to use.
    - `kedro_default`, which specifies the default dataset type to use.
    - `kedro_default`, which specifies the default dataset type to use
      ([kedro.extras.datasets.pickle.PickleDataSet][])
    These are NOT currently inherited (TODO).
    The default dataset type that's used is [kedro.extras.datasets.pickle.PickleDataSet][]
    These are pseudo-inherited; see [config-inheritance][].
    You do not actually need to inherit from `ArbModel` for this to work; however, it can help with
    type completion in your IDE.
    """

    Config = ArbConfig
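A minimal usage sketch (`MyModel` and its field are hypothetical):

```python
import pandas as pd
from kedro.extras.datasets.pandas import ParquetDataSet

from pydantic_kedro.models import ArbModel


class MyModel(ArbModel):
    """Example model; dataframe fields will be saved via Parquet."""

    df: pd.DataFrame

    class Config:
        arbitrary_types_allowed = True
        kedro_map = {pd.DataFrame: ParquetDataSet}
```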
3 changes: 2 additions & 1 deletion src/pydantic_kedro/utils.py
@@ -49,7 +49,8 @@ def save_model(
    uri : str
        The path or URI to save the model to.
    format : {"auto", "zip", "folder", "yaml", "json"}
        The dataset format to use. "auto" will use [PydanticAutoDataSet][].
        The dataset format to use.
        "auto" will use [PydanticAutoDataSet][pydantic_kedro.PydanticAutoDataSet].
    """
    if not isinstance(model, BaseModel):
        raise TypeError(f"Expected Pydantic model, but got {model!r}")
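A quick usage sketch of `save_model` (assuming it is exported from the package root; the local
path is illustrative, and any fsspec-compatible URI should also work):

```python
from pydantic import BaseModel

from pydantic_kedro import save_model


class Foo(BaseModel):
    x: int = 1


save_model(Foo(), "/tmp/foo_model", format="auto")
```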
2 changes: 1 addition & 1 deletion src/test/test_ds_pandas.py
@@ -13,7 +13,7 @@
    PydanticZipDataSet,
)

Kls = Union[PydanticFolderDataSet, PydanticZipDataSet]
Kls = Union[PydanticAutoDataSet, PydanticFolderDataSet, PydanticZipDataSet]

dfx = pd.DataFrame([[1, 2, 3]], columns=["a", "b", "c"])

102 changes: 102 additions & 0 deletions src/test/test_inheritance.py
@@ -0,0 +1,102 @@
"""Tests for proper inheritence of classes."""

from pathlib import Path
from typing import Type, Union

import pandas as pd
import pytest
from kedro.extras.datasets.pandas import CSVDataSet, ParquetDataSet
from kedro.extras.datasets.pickle import PickleDataSet
from pydantic import BaseModel

from pydantic_kedro import PydanticFolderDataSet

dfx = pd.DataFrame([[1, 2, 3]], columns=["a", "b", "c"])


class BaseA(BaseModel):
"""First model in hierarchy, using Parquet for Pandas."""

class Config:
"""Config for pydantic-kedro."""

arbitrary_types_allowed = True
kedro_map = {pd.DataFrame: ParquetDataSet}


class Model1A(BaseA):
"""Model with Parquet dataset base."""

df: pd.DataFrame


def csv_ds(path: str) -> CSVDataSet:
"""Create a CSV dataset."""
return CSVDataSet(path, save_args=dict(index=False), load_args=dict())


class BaseB(BaseA):
"""Second model in hierarchy, using CSV for Pandas."""

class Config:
"""Config for pydantic-kedro."""

kedro_map = {pd.DataFrame: csv_ds}


class Model1B(BaseB):
"""Model with CSV dataset base."""

df: pd.DataFrame


class BaseC(BaseB):
"""Third model in hierarchy, not providing any kedro_map."""


class Model1C(BaseC):
"""Model with CSV dataset base (again)."""

df: pd.DataFrame


class Fake:
"""Fake class."""


class BaseD(BaseC):
"""Fourth model in hierarchy, providing updated kedro_map (for Fake) and updated default.
However, since we pseudo-inherit `{pd.DataFrame: csv_ds}` mapping from BaseB,
"""

class Config:
"""Config for pydantic-kedro."""

kedro_map = {Fake: PickleDataSet}
kedro_default = ParquetDataSet # Bad idea in practice, but this is for the test


class Model1D(BaseD):
"""Model with CSV dataset base, even though we changed other config parts."""

df: pd.DataFrame


@pytest.mark.parametrize(
["model_type", "ds_type"],
[[Model1A, ParquetDataSet], [Model1B, CSVDataSet], [Model1C, CSVDataSet], [Model1D, CSVDataSet]],
)
def test_pandas_flat_model(
tmpdir,
model_type: Type[Union[Model1A, Model1B, Model1C, Model1D]],
ds_type: Type[Union[ParquetDataSet, CSVDataSet]],
):
"""Test roundtripping of the different dataset models."""
# Create and save model
model = model_type(df=dfx)
path = Path(f"{tmpdir}/model_on_disk")
PydanticFolderDataSet(str(path)).save(model)
# Try loading with the supposed dataframe type
found_df = ds_type(str(path / ".df")).load()
assert isinstance(found_df, pd.DataFrame)
