From cafb0fc900f48feff4a9a9067d4e969c616538f5 Mon Sep 17 00:00:00 2001 From: Troy Raen Date: Wed, 16 Oct 2024 06:05:31 -0700 Subject: [PATCH] add verification fixtures --- tests/hipscat_import/conftest.py | 51 ++++++ tests/hipscat_import/verification/fixture.py | 114 +++++++++++++ .../verification/fixture_defs.yaml | 156 ++++++++++++++++++ 3 files changed, 321 insertions(+) create mode 100644 tests/hipscat_import/verification/fixture.py create mode 100644 tests/hipscat_import/verification/fixture_defs.yaml diff --git a/tests/hipscat_import/conftest.py b/tests/hipscat_import/conftest.py index 1cd8cbf2..6747b874 100644 --- a/tests/hipscat_import/conftest.py +++ b/tests/hipscat_import/conftest.py @@ -10,6 +10,8 @@ import pytest from hipscat import pixel_math +from tests.hipscat_import.verification.fixture import VerifierFixture + # pylint: disable=missing-function-docstring, redefined-outer-name @@ -300,3 +302,52 @@ def assert_parquet_file_index(file_name, expected_values): npt.assert_array_equal(values, expected_values) return assert_parquet_file_index + + +@pytest.fixture +def malformed_catalog_dirs(test_data_dir): + base_dir = test_data_dir / "malformed_catalogs" + catalog_dirs = {dr.name: dr for dr in base_dir.iterdir() if dr.is_dir()} + # valid_truth dir contains a README pointing to the valid catalog used to generate malformed ones + # resolve the path + catalog_dirs["valid_truth"] = test_data_dir / (catalog_dirs["valid_truth"] / "README").read_text() + return catalog_dirs + + +@pytest.fixture(params=["valid_truth", "wrong_files"]) +def verifier_for_file_sets(request, malformed_catalog_dirs, tmp_path): + return VerifierFixture.from_param(request.param, malformed_catalog_dirs, tmp_path) + + +@pytest.fixture(params=["valid_truth", "no_rowgroup_stats"]) +def verifier_for_is_valid_catalog(request, malformed_catalog_dirs, tmp_path): + return VerifierFixture.from_param(request.param, malformed_catalog_dirs, tmp_path) + + +@pytest.fixture(params=["valid_truth", 
"wrong_rows"]) +def verifier_for_num_rows(request, malformed_catalog_dirs, tmp_path): + return VerifierFixture.from_param(request.param, malformed_catalog_dirs, tmp_path) + + +@pytest.fixture(params=["valid_truth", "no_rowgroup_stats"]) +def verifier_for_rowgroup_stats(request, malformed_catalog_dirs, tmp_path): + return VerifierFixture.from_param(request.param, malformed_catalog_dirs, tmp_path) + + +@pytest.fixture(params=["valid_truth", "no_rowgroup_stats"]) +def verifier_for_runner(request, malformed_catalog_dirs, tmp_path): + return VerifierFixture.from_param(request.param, malformed_catalog_dirs, tmp_path) + + +@pytest.fixture( + params=[ + "valid_truth", + "schema", + "schema_with_md_truth", + "schema_with_cmd_truth", + "schema_with_import_truth", + "schema_with_no_truth", + ] +) +def verifier_for_schemas(request, malformed_catalog_dirs, tmp_path): + return VerifierFixture.from_param(request.param, malformed_catalog_dirs, tmp_path) diff --git a/tests/hipscat_import/verification/fixture.py b/tests/hipscat_import/verification/fixture.py new file mode 100644 index 00000000..3d549a66 --- /dev/null +++ b/tests/hipscat_import/verification/fixture.py @@ -0,0 +1,114 @@ +"""Run pass/fail tests and generate verification report of existing hipscat table.""" + +from pathlib import Path + +import attrs +import yaml + +from hipscat_import.verification.arguments import VerificationArguments +from hipscat_import.verification.run_verification import Verifier + + +@attrs.define +class VerifierFixture: + """Class for pytest fixtures for verification tests. 
Instantiate using the 'from_param' method.""" + + test_targets: dict[str, list | dict] = attrs.field(validator=attrs.validators.instance_of(dict)) + """Dictionary mapping test names to targets.""" + verifier: Verifier = attrs.field(validator=attrs.validators.instance_of(Verifier)) + """Verifier instance that the fixture will use to run verification tests.""" + assert_passed: bool | dict = attrs.field(validator=attrs.validators.instance_of((bool, dict))) + """Expected result(s) of the test(s) this verifier will run.""" + + @classmethod + def from_param( + cls, fixture_param: str, malformed_catalog_dirs: dict[str, Path], tmp_path: Path + ) -> "VerifierFixture": + """Create a VerifierFixture from the given fixture parameter. + + Fixture definitions, including the expected test outcomes, are defined in fixture_defs.yaml. + + Parameters + ---------- + fixture_param : str + The fixture parameter key to look up fixture definitions. + malformed_catalog_dirs : dict[str, Path] + A mapping of malformed test dataset names to their directories. + tmp_path : Path + A temporary path for output. + + Returns: + VerifierFixture: An instance of VerifierFixture configured with the specified parameters. 
+ """ + with open(Path(__file__).parent / "fixture_defs.yaml", "r") as fin: + fixture_defs = yaml.safe_load(fin) + fixture_def = fixture_defs[fixture_param] + + truth_schema = fixture_def.get("truth_schema") + if truth_schema is not None: + truth_schema = malformed_catalog_dirs[truth_schema.split("/")[0]] / truth_schema.split("/")[1] + args = VerificationArguments( + input_catalog_path=malformed_catalog_dirs[fixture_def["input_dir"]], + output_path=tmp_path, + truth_schema=truth_schema, + truth_total_rows=fixture_def.get("truth_total_rows"), + ) + + fixture = cls( + test_targets=fixture_defs["test_targets"], + verifier=Verifier.from_args(args), + assert_passed=fixture_def["assert_passed"], + ) + return fixture + + @staticmethod + def unpack_assert_passed( + assert_passed: bool | dict, *, targets: list | None = None + ) -> tuple[bool, list] | dict: + """Unpack assert_passed and return a tuple or dictionary based on the provided targets. + + Parameters + ---------- + assert_passed : bool, or dict + A boolean indicating pass/fail status or a dictionary with target-specific statuses. + targets list, or None + A list of targets that assert_passed should apply to. If None, the return type is a + tuple with a bool indicating whether the test is expected to pass and a list of + parquet file suffixes that are expected to fail. Otherwise, the return type is a dict + with a key for each target and values indicating pass/fail for the given target. + + Returns + ------- + tuple[bool, list] | dict: + - If assert_passed is a boolean: + - If targets is None, returns a tuple (assert_passed, []). + - Else, returns a dict of {target: assert_passed}. + - If assert_passed is a dictionary: + - If targets is None, assert_passed is expected to contain a single item with + key=False and value=list of file suffixes that should have failed. The item + is returned as a tuple. + - Else, assert_passed is expected to have a key for every target. The + assert_passed dict is returned. 
+ + Raises + ------ + AssertionError: If assert_passed is a dict but it does not have the expected key(s). + """ + + if isinstance(assert_passed, bool): + if targets is None: + return assert_passed, [] + return {target: assert_passed for target in targets} + + # assert_passed is a dict + + if targets is None: + # Expecting a single item with key=False, value=list of file suffixes that should have failed. + msg = "Unexpected key. There is probably a bug in the fixture definition." + assert set(assert_passed) == {False}, msg + return False, assert_passed[False] + + # Expecting one key per target + msg = "Unexpected set of targets. There is probably a bug in the fixture definition." + assert set(assert_passed) == set(targets), msg + return assert_passed diff --git a/tests/hipscat_import/verification/fixture_defs.yaml b/tests/hipscat_import/verification/fixture_defs.yaml new file mode 100644 index 00000000..c4333e91 --- /dev/null +++ b/tests/hipscat_import/verification/fixture_defs.yaml @@ -0,0 +1,156 @@ +# region ---- Tests and their targets +# fixture's 'assert_passed' will be coerced to a dict indexed by test and/or target. 
+test_targets: + num_rows: + - _metadata + - user total + schema: + schema: + - _common_metadata + - _metadata + - file footers + 'schema metadata': + - "b'pandas' in _common_metadata" + - _common_metadata + - _metadata + - file footers +# endregion +# region ---- Fixture params and their definitions +# valid_truth should pass all tests +valid_truth: + input_dir: valid_truth + truth_schema: valid_truth/_common_metadata + truth_total_rows: 131 + assert_passed: True +# no_rowgroup_stats is used for test_rowgroup_stats, test_is_valid_catalog, and verification_runner +no_rowgroup_stats: + input_dir: no_rowgroup_stats + assert_passed: False +# schema* is used for test_schemas +schema: + # Case: test bad_schemas catalog given valid_truth schema as truth_schema + input_dir: bad_schemas + truth_schema: valid_truth/_common_metadata + assert_passed: + schema: + _common_metadata: False # _common_metadata has wrong dtypes + _metadata: True + file footers: + False: + - .extra_column.parquet + - .missing_column.parquet + - .wrong_dtypes.parquet + 'schema metadata': + "b'pandas' in _common_metadata": True + _common_metadata: True + _metadata: False # _metadata is missing b'pandas' metadata + file footers: + False: + - .no_metadata.parquet +schema_with_cmd_truth: + # Case: test bad_schemas catalog given a truth_schema that has the wrong dtypes + input_dir: bad_schemas + truth_schema: bad_schemas/_common_metadata + assert_passed: + schema: + _common_metadata: True + _metadata: False # truth_schema has wrong dtypes + file footers: + False: + - .extra_column.parquet + - .missing_column.parquet + - .no_metadata.parquet + - .parquet + 'schema metadata': + "b'pandas' in _common_metadata": False # b'pandas' dtypes != truth_schema dtypes + _common_metadata: True + _metadata: False # _metadata is missing b'pandas' metadata + file footers: + False: + - .no_metadata.parquet +schema_with_import_truth: + # Case: Test bad_schemas catalog given a truth_schema with custom metadata that should be 
preserved, but + # missing hipscat fields and b'pandas' metadata. This schema could have been used during catalog import. + input_dir: bad_schemas + truth_schema: bad_schemas/_common_metadata.import + assert_passed: + schema: + _common_metadata: False # _common_metadata has wrong dtypes + _metadata: True + file footers: + False: + - .extra_column.parquet + - .missing_column.parquet + - .wrong_dtypes.parquet + 'schema metadata': + "b'pandas' in _common_metadata": True + _common_metadata: False # _common_metadata is missing the custom metadata + _metadata: False # _metadata is missing all metadata + file footers: + False: + # Every file fails because the custom metadata is missing. + - .extra_column.parquet + - .missing_column.parquet + - .no_metadata.parquet + - .parquet + - .wrong_dtypes.parquet +schema_with_no_truth: + # Case: Test bad_schemas catalog given no truth_schema + input_dir: bad_schemas + truth_schema: null # _common_metadata will be used as the source of truth + assert_passed: + schema: + _common_metadata: null # this test should not run + _metadata: False # truth_schema has wrong dtypes + file footers: + False: + - .extra_column.parquet + - .missing_column.parquet + - .no_metadata.parquet + - .parquet + 'schema metadata': + "b'pandas' in _common_metadata": False # b'pandas' dtypes != truth_schema dtypes + _common_metadata: null # this test should not run + _metadata: False # _metadata is missing b'pandas' metadata + file footers: + False: + - .no_metadata.parquet +schema_with_md_truth: + # Case: Test bad_schemas catalog given a truth_schema with no metadata + input_dir: bad_schemas + truth_schema: bad_schemas/_metadata + assert_passed: + schema: + _common_metadata: False # _common_metadata has wrong dtypes + _metadata: True + file footers: + False: + - .extra_column.parquet + - .missing_column.parquet + - .wrong_dtypes.parquet + 'schema metadata': + "b'pandas' in _common_metadata": True + _common_metadata: True + _metadata: False # _metadata is 
missing b'pandas' metadata + file footers: + False: + - .no_metadata.parquet +# wrong_files is used for test_file_sets +wrong_files: + input_dir: wrong_files_and_rows + assert_passed: + False: + - .missing_file.parquet + - .extra_file.parquet +# wrong_rows is used for test_num_rows +wrong_rows: + input_dir: wrong_files_and_rows + truth_total_rows: 131 + assert_passed: + _metadata: + False: + - .missing_file.parquet + - .extra_file.parquet + - .extra_rows.parquet + 'user total': False +# endregion