From cafb0fc900f48feff4a9a9067d4e969c616538f5 Mon Sep 17 00:00:00 2001 From: Troy Raen Date: Wed, 16 Oct 2024 06:05:31 -0700 Subject: [PATCH] add verification fixtures --- tests/hipscat_import/conftest.py | 51 ++++++ tests/hipscat_import/verification/fixture.py | 114 +++++++++++++ .../verification/fixture_defs.yaml | 156 ++++++++++++++++++ 3 files changed, 321 insertions(+) create mode 100644 tests/hipscat_import/verification/fixture.py create mode 100644 tests/hipscat_import/verification/fixture_defs.yaml diff --git a/tests/hipscat_import/conftest.py b/tests/hipscat_import/conftest.py index 1cd8cbf2..6747b874 100644 --- a/tests/hipscat_import/conftest.py +++ b/tests/hipscat_import/conftest.py @@ -10,6 +10,8 @@ import pytest from hipscat import pixel_math +from tests.hipscat_import.verification.fixture import VerifierFixture + # pylint: disable=missing-function-docstring, redefined-outer-name @@ -300,3 +302,52 @@ def assert_parquet_file_index(file_name, expected_values): npt.assert_array_equal(values, expected_values) return assert_parquet_file_index + + +@pytest.fixture +def malformed_catalog_dirs(test_data_dir): + base_dir = test_data_dir / "malformed_catalogs" + catalog_dirs = {dr.name: dr for dr in base_dir.iterdir() if dr.is_dir()} + # valid_truth dir contains a README pointing to the valid catalog used to generate malformed ones + # resolve the path + catalog_dirs["valid_truth"] = test_data_dir / (catalog_dirs["valid_truth"] / "README").read_text() + return catalog_dirs + + +@pytest.fixture(params=["valid_truth", "wrong_files"]) +def verifier_for_file_sets(request, malformed_catalog_dirs, tmp_path): + return VerifierFixture.from_param(request.param, malformed_catalog_dirs, tmp_path) + + +@pytest.fixture(params=["valid_truth", "no_rowgroup_stats"]) +def verifier_for_is_valid_catalog(request, malformed_catalog_dirs, tmp_path): + return VerifierFixture.from_param(request.param, malformed_catalog_dirs, tmp_path) + + +@pytest.fixture(params=["valid_truth", 
"wrong_rows"]) +def verifier_for_num_rows(request, malformed_catalog_dirs, tmp_path): + return VerifierFixture.from_param(request.param, malformed_catalog_dirs, tmp_path) + + +@pytest.fixture(params=["valid_truth", "no_rowgroup_stats"]) +def verifier_for_rowgroup_stats(request, malformed_catalog_dirs, tmp_path): + return VerifierFixture.from_param(request.param, malformed_catalog_dirs, tmp_path) + + +@pytest.fixture(params=["valid_truth", "no_rowgroup_stats"]) +def verifier_for_runner(request, malformed_catalog_dirs, tmp_path): + return VerifierFixture.from_param(request.param, malformed_catalog_dirs, tmp_path) + + +@pytest.fixture( + params=[ + "valid_truth", + "schema", + "schema_with_md_truth", + "schema_with_cmd_truth", + "schema_with_import_truth", + "schema_with_no_truth", + ] +) +def verifier_for_schemas(request, malformed_catalog_dirs, tmp_path): + return VerifierFixture.from_param(request.param, malformed_catalog_dirs, tmp_path) diff --git a/tests/hipscat_import/verification/fixture.py b/tests/hipscat_import/verification/fixture.py new file mode 100644 index 00000000..3d549a66 --- /dev/null +++ b/tests/hipscat_import/verification/fixture.py @@ -0,0 +1,114 @@ +"""Run pass/fail tests and generate verification report of existing hipscat table.""" + +from pathlib import Path + +import attrs +import yaml + +from hipscat_import.verification.arguments import VerificationArguments +from hipscat_import.verification.run_verification import Verifier + + +@attrs.define +class VerifierFixture: + """Class for pytest fixtures for verification tests. 
Instantiate using the 'from_param' method.""" + + test_targets: dict[str, list | dict] = attrs.field(validator=attrs.validators.instance_of(dict)) + """Dictionary mapping test names to targets.""" + verifier: Verifier = attrs.field(validator=attrs.validators.instance_of(Verifier)) + """Verifier instance that the fixture will use to run verification tests.""" + assert_passed: bool | dict = attrs.field(validator=attrs.validators.instance_of((bool, dict))) + """Expected result(s) of the test(s) this verifier will run.""" + + @classmethod + def from_param( + cls, fixture_param: str, malformed_catalog_dirs: dict[str, Path], tmp_path: Path + ) -> "VerifierFixture": + """Create a VerifierFixture from the given fixture parameter. + + Fixture definitions, including the expected test outcomes, are defined in fixture_defs.yaml. + + Parameters + ---------- + fixture_param : str + The fixture parameter key to look up fixture definitions. + malformed_catalog_dirs : dict[str, Path] + A mapping of malformed test dataset names to their directories. + tmp_path : Path + A temporary path for output. + + Returns: + VerifierFixture: An instance of VerifierFixture configured with the specified parameters. 
+ """ + with open(Path(__file__).parent / "fixture_defs.yaml", "r") as fin: + fixture_defs = yaml.safe_load(fin) + fixture_def = fixture_defs[fixture_param] + + truth_schema = fixture_def.get("truth_schema") + if truth_schema is not None: + truth_schema = malformed_catalog_dirs[truth_schema.split("/")[0]] / truth_schema.split("/")[1] + args = VerificationArguments( + input_catalog_path=malformed_catalog_dirs[fixture_def["input_dir"]], + output_path=tmp_path, + truth_schema=truth_schema, + truth_total_rows=fixture_def.get("truth_total_rows"), + ) + + fixture = cls( + test_targets=fixture_defs["test_targets"], + verifier=Verifier.from_args(args), + assert_passed=fixture_def["assert_passed"], + ) + return fixture + + @staticmethod + def unpack_assert_passed( + assert_passed: bool | dict, *, targets: list | None = None + ) -> tuple[bool, list] | dict: + """Unpack assert_passed and return a tuple or dictionary based on the provided targets. + + Parameters + ---------- + assert_passed : bool, or dict + A boolean indicating pass/fail status or a dictionary with target-specific statuses. + targets list, or None + A list of targets that assert_passed should apply to. If None, the return type is a + tuple with a bool indicating whether the test is expected to pass and a list of + parquet file suffixes that are expected to fail. Otherwise, the return type is a dict + with a key for each target and values indicating pass/fail for the given target. + + Returns + ------- + tuple[bool, list] | dict: + - If assert_passed is a boolean: + - If targets is None, returns a tuple (assert_passed, []). + - Else, returns a dict of {target: assert_passed}. + - If assert_passed is a dictionary: + - If targets is None, assert_passed is expected to contain a single item with + key=False and value=list of file suffixes that should have failed. The item + is returned as a tuple. + - Else, assert_passed is expected to have a key for every target. The + assert_passed dict is returned. 
+ + Raises + ------ + AssertionError: If assert_passed is a dict but it does not have the expected key(s). + """ + + if isinstance(assert_passed, bool): + if targets is None: + return assert_passed, [] + return {target: assert_passed for target in targets} + + # assert_passed is a dict + + if targets is None: + # Expecting a single item with key=False, value=list of file suffixes that should have failed. + msg = "Unexpected key. There is probably a bug in the fixture definition." + assert set(assert_passed) == {False}, msg + return False, assert_passed[False] + + # Expecting one key per target + msg = "Unexpected set of targets. There is probably a bug in the fixture definition." + assert set(assert_passed) == set(targets), msg + return assert_passed diff --git a/tests/hipscat_import/verification/fixture_defs.yaml b/tests/hipscat_import/verification/fixture_defs.yaml new file mode 100644 index 00000000..c4333e91 --- /dev/null +++ b/tests/hipscat_import/verification/fixture_defs.yaml @@ -0,0 +1,156 @@ +# region ---- Tests and their targets +# fixture's 'assert_passed' will be coerced to a dict indexed by test and/or target. 
+test_targets: + num_rows: + - _metadata + - user total + schema: + schema: + - _common_metadata + - _metadata + - file footers + 'schema metadata': + - "b'pandas' in _common_metadata" + - _common_metadata + - _metadata + - file footers +# endregion +# region ---- Fixture params and their definitions +# valid_truth should pass all tests +valid_truth: + input_dir: valid_truth + truth_schema: valid_truth/_common_metadata + truth_total_rows: 131 + assert_passed: True +# no_rowgroup_stats is used for test_rowgroup_stats, test_is_valid_catalog, and verification_runner +no_rowgroup_stats: + input_dir: no_rowgroup_stats + assert_passed: False +# schema* is used for test_schemas +schema: + # Case: test bad_schemas catalog given valid_truth schema as truth_schema + input_dir: bad_schemas + truth_schema: valid_truth/_common_metadata + assert_passed: + schema: + _common_metadata: False # _common_metadata has wrong dtypes + _metadata: True + file footers: + False: + - .extra_column.parquet + - .missing_column.parquet + - .wrong_dtypes.parquet + 'schema metadata': + "b'pandas' in _common_metadata": True + _common_metadata: True + _metadata: False # _metadata is missing b'pandas' metadata + file footers: + False: + - .no_metadata.parquet +schema_with_cmd_truth: + # Case: test bad_schemas catalog given a truth_schema that has the wrong dtypes + input_dir: bad_schemas + truth_schema: bad_schemas/_common_metadata + assert_passed: + schema: + _common_metadata: True + _metadata: False # truth_schema has wrong dtypes + file footers: + False: + - .extra_column.parquet + - .missing_column.parquet + - .no_metadata.parquet + - .parquet + 'schema metadata': + "b'pandas' in _common_metadata": False # b'pandas' dtypes != truth_schema dtypes + _common_metadata: True + _metadata: False # _metadata is missing b'pandas' metadata + file footers: + False: + - .no_metadata.parquet +schema_with_import_truth: + # Case: Test bad_schemas catalog given a truth_schema with custom metadata that should be 
preserved, but + # missing hipscat fields and b'pandas' metadata. This schema could have been used during catalog import. + input_dir: bad_schemas + truth_schema: bad_schemas/_common_metadata.import + assert_passed: + schema: + _common_metadata: False # _common_metadata has wrong dtypes + _metadata: True + file footers: + False: + - .extra_column.parquet + - .missing_column.parquet + - .wrong_dtypes.parquet + 'schema metadata': + "b'pandas' in _common_metadata": True + _common_metadata: False # _common_metadata is missing the custom metadata + _metadata: False # _metadata is missing all metadata + file footers: + False: + # Every file fails because the custom metadata is missing. + - .extra_column.parquet + - .missing_column.parquet + - .no_metadata.parquet + - .parquet + - .wrong_dtypes.parquet +schema_with_no_truth: + # Case: Test bad_schemas catalog given no truth_schema + input_dir: bad_schemas + truth_schema: null # _common_metadata will be used as the source of truth + assert_passed: + schema: + _common_metadata: null # this test should not run + _metadata: False # truth_schema has wrong dtypes + file footers: + False: + - .extra_column.parquet + - .missing_column.parquet + - .no_metadata.parquet + - .parquet + 'schema metadata': + "b'pandas' in _common_metadata": False # b'pandas' dtypes != truth_schema dtypes + _common_metadata: null # this test should not run + _metadata: False # _metadata is missing b'pandas' metadata + file footers: + False: + - .no_metadata.parquet +schema_with_md_truth: + # Case: Test bad_schemas catalog given a truth_schema with no metadata + input_dir: bad_schemas + truth_schema: bad_schemas/_metadata + assert_passed: + schema: + _common_metadata: False # _common_metadata has wrong dtypes + _metadata: True + file footers: + False: + - .extra_column.parquet + - .missing_column.parquet + - .wrong_dtypes.parquet + 'schema metadata': + "b'pandas' in _common_metadata": True + _common_metadata: True + _metadata: False # _metadata is 
missing b'pandas' metadata + file footers: + False: + - .no_metadata.parquet +# wrong_files is used for test_file_sets +wrong_files: + input_dir: wrong_files_and_rows + assert_passed: + False: + - .missing_file.parquet + - .extra_file.parquet +# wrong_rows is used for test_num_rows +wrong_rows: + input_dir: wrong_files_and_rows + truth_total_rows: 131 + assert_passed: + _metadata: + False: + - .missing_file.parquet + - .extra_file.parquet + - .extra_rows.parquet + 'user total': False +# endregion