Skip to content

Commit

Permalink
add verification fixtures
Browse files Browse the repository at this point in the history
  • Loading branch information
troyraen committed Oct 16, 2024
1 parent f226d2a commit cafb0fc
Show file tree
Hide file tree
Showing 3 changed files with 321 additions and 0 deletions.
51 changes: 51 additions & 0 deletions tests/hipscat_import/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,8 @@
import pytest
from hipscat import pixel_math

from tests.hipscat_import.verification.fixture import VerifierFixture

# pylint: disable=missing-function-docstring, redefined-outer-name


Expand Down Expand Up @@ -300,3 +302,52 @@ def assert_parquet_file_index(file_name, expected_values):
npt.assert_array_equal(values, expected_values)

return assert_parquet_file_index


@pytest.fixture
def malformed_catalog_dirs(test_data_dir):
    """Mapping of malformed test-catalog names to their directories.

    The 'valid_truth' directory contains only a README whose text is the path
    (relative to test_data_dir) of the valid catalog used to generate the
    malformed ones, so that entry is resolved to the actual catalog directory.
    """
    base_dir = test_data_dir / "malformed_catalogs"
    catalog_dirs = {dr.name: dr for dr in base_dir.iterdir() if dr.is_dir()}
    # strip() guards against a trailing newline in the README corrupting the path
    readme_text = (catalog_dirs["valid_truth"] / "README").read_text().strip()
    catalog_dirs["valid_truth"] = test_data_dir / readme_text
    return catalog_dirs


@pytest.fixture(params=["valid_truth", "wrong_files"])
def verifier_for_file_sets(request, malformed_catalog_dirs, tmp_path):
    """VerifierFixture parameterized for the file-set tests."""
    fixture_key = request.param
    return VerifierFixture.from_param(fixture_key, malformed_catalog_dirs, tmp_path)


@pytest.fixture(params=["valid_truth", "no_rowgroup_stats"])
def verifier_for_is_valid_catalog(request, malformed_catalog_dirs, tmp_path):
    """VerifierFixture parameterized for the is-valid-catalog tests."""
    fixture_key = request.param
    return VerifierFixture.from_param(fixture_key, malformed_catalog_dirs, tmp_path)


@pytest.fixture(params=["valid_truth", "wrong_rows"])
def verifier_for_num_rows(request, malformed_catalog_dirs, tmp_path):
    """VerifierFixture parameterized for the row-count tests."""
    fixture_key = request.param
    return VerifierFixture.from_param(fixture_key, malformed_catalog_dirs, tmp_path)


@pytest.fixture(params=["valid_truth", "no_rowgroup_stats"])
def verifier_for_rowgroup_stats(request, malformed_catalog_dirs, tmp_path):
    """VerifierFixture parameterized for the row-group statistics tests."""
    fixture_key = request.param
    return VerifierFixture.from_param(fixture_key, malformed_catalog_dirs, tmp_path)


@pytest.fixture(params=["valid_truth", "no_rowgroup_stats"])
def verifier_for_runner(request, malformed_catalog_dirs, tmp_path):
    """VerifierFixture parameterized for the verification-runner tests."""
    fixture_key = request.param
    return VerifierFixture.from_param(fixture_key, malformed_catalog_dirs, tmp_path)


# One param per schema scenario defined in fixture_defs.yaml.
_SCHEMA_PARAMS = (
    "valid_truth",
    "schema",
    "schema_with_md_truth",
    "schema_with_cmd_truth",
    "schema_with_import_truth",
    "schema_with_no_truth",
)


@pytest.fixture(params=_SCHEMA_PARAMS)
def verifier_for_schemas(request, malformed_catalog_dirs, tmp_path):
    """VerifierFixture parameterized for the schema tests."""
    return VerifierFixture.from_param(request.param, malformed_catalog_dirs, tmp_path)
114 changes: 114 additions & 0 deletions tests/hipscat_import/verification/fixture.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,114 @@
"""Run pass/fail tests and generate verification report of existing hipscat table."""

from pathlib import Path

import attrs
import yaml

from hipscat_import.verification.arguments import VerificationArguments
from hipscat_import.verification.run_verification import Verifier


@attrs.define
class VerifierFixture:
    """Class for pytest fixtures for verification tests. Instantiate using the 'from_param' method."""

    test_targets: dict[str, list | dict] = attrs.field(validator=attrs.validators.instance_of(dict))
    """Dictionary mapping test names to targets."""
    verifier: Verifier = attrs.field(validator=attrs.validators.instance_of(Verifier))
    """Verifier instance that the fixture will use to run verification tests."""
    assert_passed: bool | dict = attrs.field(validator=attrs.validators.instance_of((bool, dict)))
    """Expected result(s) of the test(s) this verifier will run."""

    @classmethod
    def from_param(
        cls, fixture_param: str, malformed_catalog_dirs: dict[str, Path], tmp_path: Path
    ) -> "VerifierFixture":
        """Create a VerifierFixture from the given fixture parameter.

        Fixture definitions, including the expected test outcomes, are defined
        in fixture_defs.yaml (located next to this module).

        Parameters
        ----------
        fixture_param : str
            The fixture parameter key to look up fixture definitions.
        malformed_catalog_dirs : dict[str, Path]
            A mapping of malformed test dataset names to their directories.
        tmp_path : Path
            A temporary path for output.

        Returns
        -------
        VerifierFixture
            An instance of VerifierFixture configured with the specified parameters.
        """
        with open(Path(__file__).parent / "fixture_defs.yaml", "r") as fin:
            fixture_defs = yaml.safe_load(fin)
        fixture_def = fixture_defs[fixture_param]

        truth_schema = fixture_def.get("truth_schema")
        if truth_schema is not None:
            # truth_schema is stored as '<catalog key>/<relative path>'. Split it
            # once; partition() also tolerates a relative path containing slashes.
            catalog_key, _, relative_path = truth_schema.partition("/")
            truth_schema = malformed_catalog_dirs[catalog_key] / relative_path
        args = VerificationArguments(
            input_catalog_path=malformed_catalog_dirs[fixture_def["input_dir"]],
            output_path=tmp_path,
            truth_schema=truth_schema,
            truth_total_rows=fixture_def.get("truth_total_rows"),
        )

        fixture = cls(
            test_targets=fixture_defs["test_targets"],
            verifier=Verifier.from_args(args),
            assert_passed=fixture_def["assert_passed"],
        )
        return fixture

    @staticmethod
    def unpack_assert_passed(
        assert_passed: bool | dict, *, targets: list | None = None
    ) -> tuple[bool, list] | dict:
        """Unpack assert_passed and return a tuple or dictionary based on the provided targets.

        Parameters
        ----------
        assert_passed : bool, or dict
            A boolean indicating pass/fail status or a dictionary with target-specific statuses.
        targets : list, or None
            A list of targets that assert_passed should apply to. If None, the return type is a
            tuple with a bool indicating whether the test is expected to pass and a list of
            parquet file suffixes that are expected to fail. Otherwise, the return type is a dict
            with a key for each target and values indicating pass/fail for the given target.

        Returns
        -------
        tuple[bool, list] | dict:
            - If assert_passed is a boolean:
                - If targets is None, returns a tuple (assert_passed, []).
                - Else, returns a dict of {target: assert_passed}.
            - If assert_passed is a dictionary:
                - If targets is None, assert_passed is expected to contain a single item with
                  key=False and value=list of file suffixes that should have failed. The item
                  is returned as a tuple.
                - Else, assert_passed is expected to have a key for every target. The
                  assert_passed dict is returned.

        Raises
        ------
        AssertionError: If assert_passed is a dict but it does not have the expected key(s).
        """

        if isinstance(assert_passed, bool):
            if targets is None:
                return assert_passed, []
            return {target: assert_passed for target in targets}

        # assert_passed is a dict

        if targets is None:
            # Expecting a single item with key=False, value=list of file suffixes that should have failed.
            msg = "Unexpected key. There is probably a bug in the fixture definition."
            assert set(assert_passed) == {False}, msg
            return False, assert_passed[False]

        # Expecting one key per target
        msg = "Unexpected set of targets. There is probably a bug in the fixture definition."
        assert set(assert_passed) == set(targets), msg
        return assert_passed
156 changes: 156 additions & 0 deletions tests/hipscat_import/verification/fixture_defs.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,156 @@
# region ---- Tests and their targets
# fixture's 'assert_passed' will be coerced to a dict indexed by test and/or target.
test_targets:
num_rows:
- _metadata
- user total
schema:
schema:
- _common_metadata
- _metadata
- file footers
'schema metadata':
- "b'pandas' in _common_metadata"
- _common_metadata
- _metadata
- file footers
# endregion
# region ---- Fixture params and their definitions
# valid_truth should pass all tests
valid_truth:
input_dir: valid_truth
truth_schema: valid_truth/_common_metadata
truth_total_rows: 131
assert_passed: True
# no_rowgroup_stats is used for test_rowgroup_stats, test_is_valid_catalog, and verification_runner
no_rowgroup_stats:
input_dir: no_rowgroup_stats
assert_passed: False
# schema* is used for test_schemas
schema:
# Case: test bad_schemas catalog given valid_truth schema as truth_schema
input_dir: bad_schemas
truth_schema: valid_truth/_common_metadata
assert_passed:
schema:
_common_metadata: False # _common_metadata has wrong dtypes
_metadata: True
file footers:
False:
- .extra_column.parquet
- .missing_column.parquet
- .wrong_dtypes.parquet
'schema metadata':
"b'pandas' in _common_metadata": True
_common_metadata: True
_metadata: False # _metadata is missing b'pandas' metadata
file footers:
False:
- .no_metadata.parquet
schema_with_cmd_truth:
# Case: test bad_schemas catalog given a truth_schema that has the wrong dtypes
input_dir: bad_schemas
truth_schema: bad_schemas/_common_metadata
assert_passed:
schema:
_common_metadata: True
_metadata: False # truth_schema has wrong dtypes
file footers:
False:
- .extra_column.parquet
- .missing_column.parquet
- .no_metadata.parquet
- .parquet
'schema metadata':
"b'pandas' in _common_metadata": False # b'pandas' dtypes != truth_schema dtypes
_common_metadata: True
_metadata: False # _metadata is missing b'pandas' metadata
file footers:
False:
- .no_metadata.parquet
schema_with_import_truth:
# Case: Test bad_schemas catalog given a truth_schema with custom metadata that should be preserved, but
# missing hipscat fields and b'pandas' metadata. This schema could have been used during catalog import.
input_dir: bad_schemas
truth_schema: bad_schemas/_common_metadata.import
assert_passed:
schema:
_common_metadata: False # _common_metadata has wrong dtypes
_metadata: True
file footers:
False:
- .extra_column.parquet
- .missing_column.parquet
- .wrong_dtypes.parquet
'schema metadata':
"b'pandas' in _common_metadata": True
_common_metadata: False # _common_metadata is missing the custom metadata
_metadata: False # _metadata is missing all metadata
file footers:
False:
        # Every file fails because the custom metadata is missing.
- .extra_column.parquet
- .missing_column.parquet
- .no_metadata.parquet
- .parquet
- .wrong_dtypes.parquet
schema_with_no_truth:
# Case: Test bad_schemas catalog given no truth_schema
input_dir: bad_schemas
truth_schema: null # _common_metadata will be used as the source of truth
assert_passed:
schema:
_common_metadata: null # this test should not run
_metadata: False # truth_schema has wrong dtypes
file footers:
False:
- .extra_column.parquet
- .missing_column.parquet
- .no_metadata.parquet
- .parquet
'schema metadata':
"b'pandas' in _common_metadata": False # b'pandas' dtypes != truth_schema dtypes
_common_metadata: null # this test should not run
_metadata: False # _metadata is missing b'pandas' metadata
file footers:
False:
- .no_metadata.parquet
schema_with_md_truth:
# Case: Test bad_schemas catalog given a truth_schema with no metadata
input_dir: bad_schemas
truth_schema: bad_schemas/_metadata
assert_passed:
schema:
_common_metadata: False # _common_metadata has wrong dtypes
_metadata: True
file footers:
False:
- .extra_column.parquet
- .missing_column.parquet
- .wrong_dtypes.parquet
'schema metadata':
"b'pandas' in _common_metadata": True
_common_metadata: True
_metadata: False # _metadata is missing b'pandas' metadata
file footers:
False:
- .no_metadata.parquet
# wrong_files is used for test_file_sets
wrong_files:
input_dir: wrong_files_and_rows
assert_passed:
False:
- .missing_file.parquet
- .extra_file.parquet
# wrong_rows is used for test_num_rows
wrong_rows:
input_dir: wrong_files_and_rows
truth_total_rows: 131
assert_passed:
_metadata:
False:
- .missing_file.parquet
- .extra_file.parquet
- .extra_rows.parquet
'user total': False
# endregion

0 comments on commit cafb0fc

Please sign in to comment.