diff --git a/tests/hipscat_import/data/malformed_catalogs/bad_schemas/Norder=0/Dir=0/Npix=11.extra_column.parquet b/tests/hipscat_import/data/malformed_catalogs/bad_schemas/Norder=0/Dir=0/Npix=11.extra_column.parquet new file mode 100644 index 00000000..f30ed667 Binary files /dev/null and b/tests/hipscat_import/data/malformed_catalogs/bad_schemas/Norder=0/Dir=0/Npix=11.extra_column.parquet differ diff --git a/tests/hipscat_import/data/malformed_catalogs/bad_schemas/Norder=0/Dir=0/Npix=11.missing_column.parquet b/tests/hipscat_import/data/malformed_catalogs/bad_schemas/Norder=0/Dir=0/Npix=11.missing_column.parquet new file mode 100644 index 00000000..d793a2cd Binary files /dev/null and b/tests/hipscat_import/data/malformed_catalogs/bad_schemas/Norder=0/Dir=0/Npix=11.missing_column.parquet differ diff --git a/tests/hipscat_import/data/malformed_catalogs/bad_schemas/Norder=0/Dir=0/Npix=11.no_metadata.parquet b/tests/hipscat_import/data/malformed_catalogs/bad_schemas/Norder=0/Dir=0/Npix=11.no_metadata.parquet new file mode 100644 index 00000000..08b1a375 Binary files /dev/null and b/tests/hipscat_import/data/malformed_catalogs/bad_schemas/Norder=0/Dir=0/Npix=11.no_metadata.parquet differ diff --git a/tests/hipscat_import/data/malformed_catalogs/bad_schemas/Norder=0/Dir=0/Npix=11.parquet b/tests/hipscat_import/data/malformed_catalogs/bad_schemas/Norder=0/Dir=0/Npix=11.parquet new file mode 100644 index 00000000..e0cb8d94 Binary files /dev/null and b/tests/hipscat_import/data/malformed_catalogs/bad_schemas/Norder=0/Dir=0/Npix=11.parquet differ diff --git a/tests/hipscat_import/data/malformed_catalogs/bad_schemas/Norder=0/Dir=0/Npix=11.wrong_dtypes.parquet b/tests/hipscat_import/data/malformed_catalogs/bad_schemas/Norder=0/Dir=0/Npix=11.wrong_dtypes.parquet new file mode 100644 index 00000000..2237419d Binary files /dev/null and b/tests/hipscat_import/data/malformed_catalogs/bad_schemas/Norder=0/Dir=0/Npix=11.wrong_dtypes.parquet differ diff --git 
a/tests/hipscat_import/data/malformed_catalogs/bad_schemas/_common_metadata b/tests/hipscat_import/data/malformed_catalogs/bad_schemas/_common_metadata new file mode 100644 index 00000000..a72be7f8 Binary files /dev/null and b/tests/hipscat_import/data/malformed_catalogs/bad_schemas/_common_metadata differ diff --git a/tests/hipscat_import/data/malformed_catalogs/bad_schemas/_common_metadata.import b/tests/hipscat_import/data/malformed_catalogs/bad_schemas/_common_metadata.import new file mode 100644 index 00000000..dfc011fa Binary files /dev/null and b/tests/hipscat_import/data/malformed_catalogs/bad_schemas/_common_metadata.import differ diff --git a/tests/hipscat_import/data/malformed_catalogs/bad_schemas/_metadata b/tests/hipscat_import/data/malformed_catalogs/bad_schemas/_metadata new file mode 100644 index 00000000..fce59f1a Binary files /dev/null and b/tests/hipscat_import/data/malformed_catalogs/bad_schemas/_metadata differ diff --git a/tests/hipscat_import/data/malformed_catalogs/no_rowgroup_stats/Norder=0/Dir=0/Npix=11.parquet b/tests/hipscat_import/data/malformed_catalogs/no_rowgroup_stats/Norder=0/Dir=0/Npix=11.parquet new file mode 100644 index 00000000..11e599de Binary files /dev/null and b/tests/hipscat_import/data/malformed_catalogs/no_rowgroup_stats/Norder=0/Dir=0/Npix=11.parquet differ diff --git a/tests/hipscat_import/data/malformed_catalogs/no_rowgroup_stats/_common_metadata b/tests/hipscat_import/data/malformed_catalogs/no_rowgroup_stats/_common_metadata new file mode 100644 index 00000000..4cf7a744 Binary files /dev/null and b/tests/hipscat_import/data/malformed_catalogs/no_rowgroup_stats/_common_metadata differ diff --git a/tests/hipscat_import/data/malformed_catalogs/no_rowgroup_stats/_metadata b/tests/hipscat_import/data/malformed_catalogs/no_rowgroup_stats/_metadata new file mode 100644 index 00000000..925d3f67 Binary files /dev/null and b/tests/hipscat_import/data/malformed_catalogs/no_rowgroup_stats/_metadata differ diff --git 
a/tests/hipscat_import/data/malformed_catalogs/valid_truth/README b/tests/hipscat_import/data/malformed_catalogs/valid_truth/README new file mode 100644 index 00000000..8dd3ea47 --- /dev/null +++ b/tests/hipscat_import/data/malformed_catalogs/valid_truth/README @@ -0,0 +1 @@ +small_sky_object_catalog \ No newline at end of file diff --git a/tests/hipscat_import/data/malformed_catalogs/wrong_files_and_rows/Norder=0/Dir=0/Npix=11.extra_file.parquet b/tests/hipscat_import/data/malformed_catalogs/wrong_files_and_rows/Norder=0/Dir=0/Npix=11.extra_file.parquet new file mode 100644 index 00000000..94585fd9 Binary files /dev/null and b/tests/hipscat_import/data/malformed_catalogs/wrong_files_and_rows/Norder=0/Dir=0/Npix=11.extra_file.parquet differ diff --git a/tests/hipscat_import/data/malformed_catalogs/wrong_files_and_rows/Norder=0/Dir=0/Npix=11.extra_rows.parquet b/tests/hipscat_import/data/malformed_catalogs/wrong_files_and_rows/Norder=0/Dir=0/Npix=11.extra_rows.parquet new file mode 100644 index 00000000..a51234ca Binary files /dev/null and b/tests/hipscat_import/data/malformed_catalogs/wrong_files_and_rows/Norder=0/Dir=0/Npix=11.extra_rows.parquet differ diff --git a/tests/hipscat_import/data/malformed_catalogs/wrong_files_and_rows/Norder=0/Dir=0/Npix=11.parquet b/tests/hipscat_import/data/malformed_catalogs/wrong_files_and_rows/Norder=0/Dir=0/Npix=11.parquet new file mode 100644 index 00000000..e0cb8d94 Binary files /dev/null and b/tests/hipscat_import/data/malformed_catalogs/wrong_files_and_rows/Norder=0/Dir=0/Npix=11.parquet differ diff --git a/tests/hipscat_import/data/malformed_catalogs/wrong_files_and_rows/_common_metadata b/tests/hipscat_import/data/malformed_catalogs/wrong_files_and_rows/_common_metadata new file mode 100644 index 00000000..4cf7a744 Binary files /dev/null and b/tests/hipscat_import/data/malformed_catalogs/wrong_files_and_rows/_common_metadata differ diff --git a/tests/hipscat_import/data/malformed_catalogs/wrong_files_and_rows/_metadata 
b/tests/hipscat_import/data/malformed_catalogs/wrong_files_and_rows/_metadata new file mode 100644 index 00000000..42c25a75 Binary files /dev/null and b/tests/hipscat_import/data/malformed_catalogs/wrong_files_and_rows/_metadata differ diff --git a/tests/hipscat_import/verification/generate_malformed_catalogs.py b/tests/hipscat_import/verification/generate_malformed_catalogs.py new file mode 100644 index 00000000..5809fd89 --- /dev/null +++ b/tests/hipscat_import/verification/generate_malformed_catalogs.py @@ -0,0 +1,229 @@ +import random +import shutil +from pathlib import Path + +import attrs +import pyarrow +import pyarrow.dataset +import pyarrow.parquet + +DATA_DIR = Path(__file__).parent.parent.parent.parent / "tests/hipscat_import/data" +VALID_CATALOG_DIR = DATA_DIR / "small_sky_object_catalog" +MALFORMED_CATALOGS_DIR = DATA_DIR / "malformed_catalogs" + + +def run( + valid_catalog_dir: Path = VALID_CATALOG_DIR, malformed_catalogs_dir: Path = MALFORMED_CATALOGS_DIR +) -> None: + """Generate malformed catalogs to be used as test data for verification. + This only needs to be run once unless/until it is desirable to regenerate the dataset. 
+ """ + Generate().run(valid_catalog_dir=valid_catalog_dir, malformed_catalogs_dir=malformed_catalogs_dir) + + +@attrs.define +class ValidBase: + dataset: pyarrow.dataset.Dataset = attrs.field() + frag: pyarrow.dataset.FileFragment = attrs.field() + tbl: pyarrow.Table = attrs.field() + schema: pyarrow.Schema = attrs.field() + valid_catalog_dir: Path = attrs.field() + malformed_catalogs_dir: Path = attrs.field() + insert_dir: str = attrs.field(factory=str) + + @classmethod + def from_dirs(cls, valid_catalog_dir: Path, malformed_catalogs_dir: Path) -> "ValidBase": + valid_ds = pyarrow.dataset.parquet_dataset(valid_catalog_dir / "_metadata") + valid_frag = next(valid_ds.get_fragments()) + valid_tbl = valid_frag.to_table() + return cls( + dataset=valid_ds, + frag=valid_frag, + tbl=valid_tbl, + schema=valid_tbl.schema, + valid_catalog_dir=valid_catalog_dir, + malformed_catalogs_dir=malformed_catalogs_dir, + ) + + @property + def fmeta(self) -> Path: + return self.malformed_catalogs_dir / self.insert_dir / "_metadata" + + @property + def fcmeta(self) -> Path: + return self.malformed_catalogs_dir / self.insert_dir / "_common_metadata" + + @property + def fdata(self) -> Path: + frag_key = Path(self.frag.path).relative_to(self.valid_catalog_dir) + return self.malformed_catalogs_dir / self.insert_dir / frag_key + + +@attrs.define +class Generate: + def run( + self, + valid_catalog_dir: Path = VALID_CATALOG_DIR, + malformed_catalogs_dir: Path = MALFORMED_CATALOGS_DIR, + ) -> None: + """Generate malformed catalogs to be used as test data for verification. + This only needs to be run once unless/until it is desirable to regenerate the dataset. + """ + if malformed_catalogs_dir.is_dir(): + print(f"Output directory exists.
Remove it and try again.\n{malformed_catalogs_dir}") + return + print(f"Generating malformed catalogs from valid catalog at {valid_catalog_dir}...") + + valid = ValidBase.from_dirs( + valid_catalog_dir=valid_catalog_dir, malformed_catalogs_dir=malformed_catalogs_dir + ) + generate = Generate() + generate.valid_truth(valid) + generate.bad_schemas(valid) + generate.no_rowgroup_stats(valid) + generate.wrong_files_and_rows(valid) + + def malformed(self, valid: ValidBase) -> None: + """Case: