Merge pull request #64 from NowanIlfideme/feature/spark-tests
Add PySpark support (SparkDataSet)
Showing 5 changed files with 166 additions and 25 deletions.
@@ -0,0 +1,76 @@
"""Implementation of local caching. | ||
When we load something from a remote location, we currently need to copy it to | ||
the local disk. This is a limitation of `pydantic-kedro` due to particular | ||
libraries (e.g. Spark) not working with `fsspec` URLs. | ||
Ideally we would just use a `tempfile.TemporaryDirectory`, however because some | ||
libraries do lazy loading (Spark, Polars, so many...) we actually need to | ||
instantiate the files locally. | ||
""" | ||
|
||
import atexit | ||
import logging | ||
import shutil | ||
import tempfile | ||
from pathlib import Path | ||
from typing import Union | ||
|
||
logger = logging.getLogger(__name__) | ||
|
||
_INITIAL_TMPDIR: tempfile.TemporaryDirectory = tempfile.TemporaryDirectory(prefix="pydantic_kedro_") | ||
PYD_KEDRO_CACHE_DIR: Path = Path(_INITIAL_TMPDIR.name) | ||
"""Local-ish cache directory for pydantic-kedro. | ||
DO NOT MODIFY - use `set_cache_dir(path)` and `get_cache_dir()` instead. | ||
TODO: Consider using module-level getattr. See https://peps.python.org/pep-0562/ | ||
""" | ||
|
||
|
||
def set_cache_dir(path: Union[Path, str]) -> None: | ||
"""Set the 'local' caching directory for pydantic-kedro. | ||
For Spark and other multi-machine setups, it might make more sense to use | ||
a common mount location. | ||
""" | ||
global PYD_KEDRO_CACHE_DIR, _INITIAL_TMPDIR | ||
|
||
cache_dir = Path(path).resolve() | ||
logger.info("Preparing to set cache directory to: %s", cache_dir) | ||
logger.info("Clearing old path: %s", PYD_KEDRO_CACHE_DIR) | ||
remove_temp_objects() | ||
|
||
if cache_dir.exists(): | ||
logger.warning("Cache path exists, reusing existing path: %s", cache_dir) | ||
else: | ||
logger.warning("Creating cache directory: %s", cache_dir) | ||
cache_dir.mkdir(parents=True, exist_ok=True) | ||
PYD_KEDRO_CACHE_DIR = cache_dir | ||
|
||
|
||
def get_cache_dir() -> Path: | ||
"""Get caching directory for pydantic-kedro.""" | ||
global PYD_KEDRO_CACHE_DIR | ||
|
||
return PYD_KEDRO_CACHE_DIR | ||
|
||
|
||
def remove_temp_objects() -> None: | ||
"""Remove temporary objects at exist. | ||
This will be called at the exit of your application | ||
NOTE: This will NOT handle clearing objects when you change the cache | ||
directory outside of `set_cache_dir()`. | ||
""" | ||
global PYD_KEDRO_CACHE_DIR, _INITIAL_TMPDIR | ||
|
||
shutil.rmtree(PYD_KEDRO_CACHE_DIR, ignore_errors=True) | ||
PYD_KEDRO_CACHE_DIR.unlink(missing_ok=True) | ||
if _INITIAL_TMPDIR is not None: | ||
# We no longer use this directory | ||
_INITIAL_TMPDIR.cleanup() | ||
|
||
|
||
atexit.register(remove_temp_objects) |
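
A minimal usage sketch of the helpers above, assuming `set_cache_dir` and `get_cache_dir` are re-exported from the `pydantic_kedro` package root (the shared-mount path is illustrative):

from pathlib import Path

from pydantic_kedro import get_cache_dir, set_cache_dir  # assumed export location

# Point the cache at a shared mount so that, e.g., the Spark driver and
# executors all see the same materialized files (path is illustrative).
set_cache_dir("/mnt/shared/pydantic_kedro_cache")
assert get_cache_dir() == Path("/mnt/shared/pydantic_kedro_cache").resolve()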
@@ -0,0 +1,56 @@
"""Test dataset for PySpark specifically.""" | ||
|
||
from typing import Any, Union | ||
|
||
import pytest | ||
from kedro.extras.datasets.spark import SparkDataSet | ||
from pyspark.sql import DataFrame, SparkSession | ||
|
||
from pydantic_kedro import ( | ||
ArbModel, | ||
PydanticAutoDataSet, | ||
PydanticFolderDataSet, | ||
PydanticZipDataSet, | ||
) | ||
|
||
Kls = Union[PydanticAutoDataSet, PydanticFolderDataSet, PydanticZipDataSet] | ||
|
||
|
||
class _SparkModel(ArbModel): | ||
"""Spark model, configured to use SparkDataSet (mult-file parquet).""" | ||
|
||
class Config(ArbModel.Config): | ||
kedro_map = {DataFrame: SparkDataSet} | ||
|
||
|
||
class FlatSparkModel(_SparkModel): | ||
"""Flat model that tests Spark using Picke dataset (default).""" | ||
|
||
df: DataFrame | ||
val: int | ||
|
||
|
||
@pytest.fixture | ||
def spark() -> SparkSession: | ||
"""Create a Spark session for testing.""" | ||
return SparkSession.Builder().appName("pydantic-kedro-testing").getOrCreate() | ||
|
||
|
||
@pytest.mark.parametrize("kls", [PydanticAutoDataSet, PydanticFolderDataSet, PydanticZipDataSet]) | ||
@pytest.mark.parametrize( | ||
"df_raw", | ||
[ | ||
[{"a": 1, "b": 2, "c": 3}], | ||
], | ||
) | ||
def test_spark_flat_model(kls: Kls, df_raw: list[dict[str, Any]], spark: SparkSession, tmpdir): | ||
"""Test roundtripping of the flat Spark model, using Kedro's SparkDataSet.""" | ||
dfx = spark.createDataFrame(df_raw) | ||
mdl = FlatSparkModel(df=dfx, val=1) | ||
paths = [f"{tmpdir}/model_on_disk", f"memory://{tmpdir}/model_in_memory"] | ||
for path in paths: | ||
ds: Kls = kls(path) # type: ignore | ||
ds.save(mdl) | ||
m2 = ds.load() | ||
assert isinstance(m2, FlatSparkModel) | ||
assert m2.df.count() == mdl.df.count() |
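
The same `kedro_map` pattern carries over to application models outside the test suite. A minimal sketch, assuming a local Spark session is available; the model, field names, and output path are illustrative:

from kedro.extras.datasets.spark import SparkDataSet
from pydantic_kedro import ArbModel, PydanticZipDataSet
from pyspark.sql import DataFrame, SparkSession


class ExampleModel(ArbModel):
    """Arbitrary-typed model; any DataFrame field round-trips via SparkDataSet."""

    class Config(ArbModel.Config):
        kedro_map = {DataFrame: SparkDataSet}  # multi-file Parquet under the hood

    df: DataFrame
    threshold: float


spark = SparkSession.Builder().appName("pydantic-kedro-example").getOrCreate()
model = ExampleModel(df=spark.createDataFrame([{"x": 1}]), threshold=0.5)
PydanticZipDataSet("/tmp/example_model.zip").save(model)
reloaded = PydanticZipDataSet("/tmp/example_model.zip").load()
assert reloaded.df.count() == 1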