From 857bad96375fc8c4f55bdac820c4f5cbaea31c3f Mon Sep 17 00:00:00 2001
From: Troy Raen
Date: Thu, 19 Sep 2024 04:11:13 -0700
Subject: [PATCH] add Verifier unit tests

---
 .../verification/test_run_verification.py    | 193 +++++++++++++++++-
 .../test_verification_arguments.py           |  52 +----
 2 files changed, 193 insertions(+), 52 deletions(-)

diff --git a/tests/hipscat_import/verification/test_run_verification.py b/tests/hipscat_import/verification/test_run_verification.py
index c672af7f..85666374 100644
--- a/tests/hipscat_import/verification/test_run_verification.py
+++ b/tests/hipscat_import/verification/test_run_verification.py
@@ -1,7 +1,11 @@
+from pathlib import Path
+
+import numpy as np
+import pandas as pd
 import pytest
 
 import hipscat_import.verification.run_verification as runner
-from hipscat_import.verification.arguments import VerificationArguments
+from tests.hipscat_import.verification.fixture import VerifierFixture
 
 
 def test_bad_args():
@@ -14,12 +18,183 @@
         runner.run(args)
 
 
-def test_no_implementation(tmp_path, small_sky_object_catalog):
-    """Womp womp. Test that we don't have a verification pipeline implemented"""
-    args = VerificationArguments(
-        input_catalog_path=small_sky_object_catalog,
-        output_path=tmp_path,
-        output_artifact_name="small_sky_object_verification_report",
+def test_basic_run(verifier_for_runner):
+    """Verification runner should execute all tests and write reports to file.
+    Tests should pass with valid catalogs and fail with malformed catalogs."""
+    args = verifier_for_runner.verifier.args
+    # start fresh. delete any existing output files.
+    filenames = [args.output_report_filename, args.output_distributions_filename]
+    [(args.output_path / filename).unlink(missing_ok=True) for filename in filenames]
+
+    # run the tests
+    verifier = runner.run(args)
+
+    # Show that the verification passed or failed as expected
+    tests_passed = verifier.results_df.passed.all()
+    assert tests_passed == verifier_for_runner.assert_passed, "runner tests"
+
+    # Show that the output files were or were not written as expected
+    all_output_written = True
+    try:
+        _check_file_output(verifier)
+    except AssertionError:
+        all_output_written = False
+    assert all_output_written == verifier_for_runner.assert_passed, "runner output"
+
+
+def _check_file_output(verifier: runner.Verifier) -> None:
+    """Verifier should have written two reports to file."""
+    # verifier.record_results() writes this file
+    freport = verifier.args.output_path / verifier.args.output_report_filename
+    assert freport.is_file(), f"File not found {freport}"
+    results = pd.read_csv(freport)
+    # the affected_files lists cause problems. just exclude them
+    cols = [c for c in results.columns if not c == "affected_files"]
+    assert results[cols].equals(verifier.results_df[cols]), "Mismatched results"
+
+    # verifier.test_rowgroup_stats() writes this file
+    fdistributions = verifier.args.output_path / verifier.args.output_distributions_filename
+    assert fdistributions.is_file(), f"File not found {fdistributions}"
+    distributions = pd.read_csv(fdistributions, index_col="field")
+    # values are floats, so use np.allclose
+    min_passed = np.allclose(distributions.minimum, verifier.distributions_df.minimum, equal_nan=True)
+    max_passed = np.allclose(distributions.maximum, verifier.distributions_df.maximum, equal_nan=True)
+    assert min_passed and max_passed, "Mismatched distributions"
+
+
+def test_test_file_sets(verifier_for_file_sets):
+    """Files on disk should match files in _metadata for catalogs that are not malformed."""
+    # run the test
+    verifier = verifier_for_file_sets.verifier
+    verifier.results = []  # ensure a fresh start
+    verifier.test_file_sets()
+
+    # check the result
+    result = verifier.results_df.squeeze()
+    _check_one_result(result, verifier_for_file_sets.assert_passed, "file_sets")
+
+
+def test_test_is_valid_catalog(verifier_for_is_valid_catalog):
+    """hipscat's is_valid_catalog should pass for valid catalogs, else fail."""
+    # run the test
+    verifier = verifier_for_is_valid_catalog.verifier
+    verifier.results = []  # ensure a fresh start
+    verifier.test_is_valid_catalog()
+
+    # check the result
+    result = verifier.results_df.squeeze()
+    _check_one_result(result, verifier_for_is_valid_catalog.assert_passed, "is_valid_catalog")
+
+
+def test_test_num_rows(verifier_for_num_rows):
+    """Row count tests should pass for catalogs that are not malformed."""
+    # run the test
+    verifier = verifier_for_num_rows.verifier
+    verifier.results = []  # ensure a fresh start
+    verifier.test_num_rows()
+
+    # check the results
+    targets = verifier_for_num_rows.test_targets["num_rows"]
+    _check_results(verifier_for_num_rows, targets)
+
+
+def test_test_rowgroup_stats(verifier_for_rowgroup_stats):
+    """Row group statistics should be present in _metadata for files that are not malformed."""
+    # run the test
+    verifier = verifier_for_rowgroup_stats.verifier
+    verifier.results = []  # ensure a fresh start
+    verifier.test_rowgroup_stats()
+
+    # check the result
+    result = verifier.results_df.squeeze()
+    _check_one_result(result, verifier_for_rowgroup_stats.assert_passed, test_name="rowgroup_stats")
+
+
+def test_test_schemas(verifier_for_schemas):
+    """Schemas should contain correct columns, dtypes, and metadata for catalogs that are not malformed."""
+    # run the tests
+    verifier = verifier_for_schemas.verifier
+    verifier.results = []  # ensure a fresh start
+    verifier.test_schemas()
+
+    # Two tests were run ('schema' and 'schema metadata') with several targets per test.
+    test_targets = verifier_for_schemas.test_targets["schema"]  # dict maps test -> targets
+    assert_passed = verifier_for_schemas.unpack_assert_passed(  # dict maps test -> assertion
+        verifier_for_schemas.assert_passed, targets=test_targets.keys()
     )
-    with pytest.raises(NotImplementedError, match="not yet implemented"):
-        runner.run(args)
+
+    # Check results for each test separately.
+    for test, targets in test_targets.items():
+        results = verifier.results_df.loc[verifier.results_df.test == test]
+        _check_results(verifier_for_schemas, targets, results=results, assert_passed=assert_passed[test])
+
+
+def _check_results(
+    verifier_fixture: VerifierFixture,
+    targets: list,
+    *,
+    results: pd.DataFrame | None = None,
+    assert_passed: bool | dict | None = None,
+) -> None:
+    """Check the results of verification tests for the given targets.
+
+    Parameters
+    ----------
+    verifier_fixture : VerifierFixture
+        The fixture containing the verifier and its results.
+    targets : list
+        The list of test targets to check. There should be one result per target.
+    results : pd.DataFrame or None
+        The test results to check. If None, verifier_fixture.verifier.results_df will be used.
+    assert_passed : bool, dict, or None
+        Whether the test should have passed for each target. If None,
+        verifier_fixture.assert_passed is used.
+
+    Raises
+    ------
+    AssertionError: If any results are unexpected.
+    """
+    results = verifier_fixture.verifier.results_df if results is None else results
+    assert_passed = verifier_fixture.assert_passed if assert_passed is None else assert_passed
+
+    # dict with one entry per target
+    _assert_passed = verifier_fixture.unpack_assert_passed(assert_passed, targets=targets)
+    for target, assertion in _assert_passed.items():
+        # Expecting one result per target so squeeze to a series
+        result = results.loc[results.target.str.startswith(target)].squeeze()
+        _check_one_result(result, assertion, test_name=target)
+
+
+def _check_one_result(result: pd.Series, assertion: bool | dict | None, test_name: str) -> None:
+    """Check the result of a single verification test.
+
+    Parameters
+    ----------
+    result : pd.Series
+        Test result reported by the verifier.
+    assertion : bool, or dict, or None
+        The expected outcome of the test. None indicates that the test should have been skipped.
+        A boolean indicates a simple pass/fail. A dict indicates expected failure and the
+        list of file suffixes expected in the result's affected_files field.
+    test_name : str
+        The name of the test being verified.
+
+    Raises
+    ------
+    AssertionError: If the result does not match the assertion.
+    """
+    if assertion is None:
+        # This test should have been skipped
+        msg = f"Unexpected result for: {test_name}. There is probably a bug in the code."
+        assert len(result.passed) == 0, msg
+        return
+
+    assert_passed, bad_suffixes = VerifierFixture.unpack_assert_passed(assertion)
+
+    # Show that the target passed or failed the test as expected
+    assert result.passed if assert_passed else not result.passed, test_name
+
+    # Show that all files that should have failed the test actually did, and no more.
+    # We're only trying to match file suffixes so strip the rest of the file path out of results.
+    found_suffixes = ["".join(Path(file).suffixes) for file in result.affected_files]
+    assert set(bad_suffixes) == set(found_suffixes), test_name + " affected_files"
diff --git a/tests/hipscat_import/verification/test_verification_arguments.py b/tests/hipscat_import/verification/test_verification_arguments.py
index 8ebd6c81..646af0c2 100644
--- a/tests/hipscat_import/verification/test_verification_arguments.py
+++ b/tests/hipscat_import/verification/test_verification_arguments.py
@@ -8,63 +8,33 @@
 
 def test_none():
     """No arguments provided. Should error for required args."""
-    with pytest.raises(ValueError):
+    with pytest.raises(TypeError):
         VerificationArguments()
 
 
 def test_empty_required(tmp_path):
     """*Most* required arguments are provided."""
     ## Input path is missing
-    with pytest.raises(ValueError, match="input_catalog_path"):
-        VerificationArguments(
-            output_path=tmp_path,
-            output_artifact_name="small_sky_object_verification_report",
-        )
+    with pytest.raises(TypeError, match="input_catalog_path"):
+        VerificationArguments(output_path=tmp_path)
 
 
 def test_invalid_paths(tmp_path, small_sky_object_catalog):
     """Required arguments are provided, but paths aren't found."""
     ## Prove that it works with required args
-    VerificationArguments(
-        input_catalog_path=small_sky_object_catalog,
-        output_path=tmp_path,
-        output_artifact_name="small_sky_object_verification_report",
-    )
+    VerificationArguments(input_catalog_path=small_sky_object_catalog, output_path=tmp_path)
 
-    ## Input path is invalid catalog
-    with pytest.raises(ValueError, match="input_catalog_path not a valid catalog"):
-        VerificationArguments(
-            input_catalog_path="path",
-            output_path=f"{tmp_path}/path",
-            output_artifact_name="small_sky_object_verification_report",
-        )
+    ## Input path is not an existing directory
+    with pytest.raises(ValueError, match="input_catalog_path must be an existing directory"):
+        VerificationArguments(input_catalog_path="path", output_path=f"{tmp_path}/path")
 
 
 def test_good_paths(tmp_path, small_sky_object_catalog):
     """Required arguments are provided, and paths are found."""
     tmp_path_str = str(tmp_path)
-    args = VerificationArguments(
-        input_catalog_path=small_sky_object_catalog,
-        output_path=tmp_path,
-        output_artifact_name="small_sky_object_verification_report",
-    )
+    args = VerificationArguments(input_catalog_path=small_sky_object_catalog, output_path=tmp_path)
     assert args.input_catalog_path == small_sky_object_catalog
    assert str(args.output_path) == tmp_path_str
-    assert str(args.tmp_path).startswith(tmp_path_str)
-
-
-def test_catalog_object(tmp_path, small_sky_object_catalog):
-    """Required arguments are provided, and paths are found."""
-    small_sky_catalog_object = Catalog.read_from_hipscat(catalog_path=small_sky_object_catalog)
-    tmp_path_str = str(tmp_path)
-    args = VerificationArguments(
-        input_catalog=small_sky_catalog_object,
-        output_path=tmp_path,
-        output_artifact_name="small_sky_object_verification_report",
-    )
-    assert args.input_catalog_path == small_sky_object_catalog
-    assert str(args.output_path) == tmp_path_str
-    assert str(args.tmp_path).startswith(tmp_path_str)
 
 
 @pytest.mark.timeout(5)
@@ -72,11 +42,7 @@ def test_provenance_info(small_sky_object_catalog, tmp_path):
     """Verify that provenance info includes verification-specific fields.
 
     NB: This is currently the last test in alpha-order, and may require additional time to
    teardown fixtures."""
-    args = VerificationArguments(
-        input_catalog_path=small_sky_object_catalog,
-        output_path=tmp_path,
-        output_artifact_name="small_sky_object_verification_report",
-    )
+    args = VerificationArguments(input_catalog_path=small_sky_object_catalog, output_path=tmp_path)
     runtime_args = args.provenance_info()["runtime_args"]
     assert "input_catalog_path" in runtime_args