From 52b04305d2f98824da956bcd33057a96a5ec9084 Mon Sep 17 00:00:00 2001 From: Evan Molinelli Date: Tue, 3 Dec 2024 13:30:17 -0500 Subject: [PATCH 1/2] use and test for differential tissue-position values based on Visium/Visium11 --- .../cellxgene_schema/validate.py | 24 ++++++- cellxgene_schema_cli/tests/test_validate.py | 65 ++++++++++++------- 2 files changed, 64 insertions(+), 25 deletions(-) diff --git a/cellxgene_schema_cli/cellxgene_schema/validate.py b/cellxgene_schema_cli/cellxgene_schema/validate.py index 60d3a6ba..182609b1 100644 --- a/cellxgene_schema_cli/cellxgene_schema/validate.py +++ b/cellxgene_schema_cli/cellxgene_schema/validate.py @@ -4,7 +4,7 @@ import os import re from datetime import datetime -from typing import Dict, List, Mapping, Optional, Union +from typing import Dict, List, Mapping, Optional, Tuple, Union import anndata import matplotlib.colors as mcolors @@ -29,6 +29,8 @@ VISIUM_AND_IS_SINGLE_TRUE_MATRIX_SIZE = 4992 VISIUM_11MM_AND_IS_SINGLE_TRUE_MATRIX_SIZE = 14336 +VISIUM_TISSUE_POSITION_MAX = (77, 127) # (row,col) +VISIUM_11MM_TISSUE_POSITION_MAX = (127, 223) # (row,col) SPATIAL_HIRES_IMAGE_MAX_DIMENSION_SIZE = 2000 SPATIAL_HIRES_IMAGE_MAX_DIMENSION_SIZE_VISIUM_11MM = 4000 @@ -57,6 +59,7 @@ def __init__(self, ignore_labels=False): self._visium_and_is_single_true_matrix_size = None self._hires_max_dimension_size = None self._visium_error_suffix = None + self._visium_tissue_position_max = None # Values will be instances of gencode.GeneChecker, # keys will be one of gencode.SupportedOrganisms @@ -122,6 +125,21 @@ def hires_max_dimension_size(self) -> Optional[int]: self._hires_max_dimension_size = SPATIAL_HIRES_IMAGE_MAX_DIMENSION_SIZE return self._hires_max_dimension_size + @property + def tissue_position_maxes(self) -> Tuple[int, int]: + if self._visium_tissue_position_max is None and self._is_visium_and_is_single_true: + # visium 11 has different requirements than other visium + if ( + self.adata.obs["assay_ontology_term_id"] + 
.apply(lambda t: is_ontological_descendant_of(ONTOLOGY_PARSER, t, ASSAY_VISIUM_11M, True)) + .astype(bool) + .any() + ): + self._visium_tissue_position_max = VISIUM_11MM_TISSUE_POSITION_MAX + else: + self._visium_tissue_position_max = VISIUM_TISSUE_POSITION_MAX + return self._visium_tissue_position_max + def _is_single(self) -> bool | None: """ Determine value of uns.spatial.is_single. None if non-spatial. @@ -1732,8 +1750,8 @@ def _validate_spatial_tissue_positions(self): :rtype none """ - self._validate_spatial_tissue_position("array_col", 0, 127) - self._validate_spatial_tissue_position("array_row", 0, 77) + self._validate_spatial_tissue_position("array_col", 0, self.tissue_position_maxes[1]) + self._validate_spatial_tissue_position("array_row", 0, self.tissue_position_maxes[0]) self._validate_spatial_tissue_position("in_tissue", 0, 1) def _check_spatial_uns(self): diff --git a/cellxgene_schema_cli/tests/test_validate.py b/cellxgene_schema_cli/tests/test_validate.py index b60f2a19..cd7652bf 100644 --- a/cellxgene_schema_cli/tests/test_validate.py +++ b/cellxgene_schema_cli/tests/test_validate.py @@ -1,5 +1,6 @@ import hashlib import os +import re import tempfile from typing import Union from unittest import mock @@ -1011,21 +1012,32 @@ def test__validate_tissue_position_required(self, tissue_position_name): validator.adata = adata_visium.copy() validator.adata.obs.pop(tissue_position_name) + # check visium + validator.adata.obs["assay_ontology_term_id"] = "EFO:0010961" validator._check_spatial_obs() assert validator.errors assert ( f"obs['{tissue_position_name}'] {ERROR_SUFFIX_VISIUM_AND_IS_SINGLE_TRUE_REQUIRED}." in validator.errors[0] ) + validator.reset() + + # check visium descendant + validator.adata.obs["assay_ontology_term_id"] = "EFO:0022860" + validator._check_spatial_obs() + assert validator.errors + assert ( + f"obs['{tissue_position_name}'] {ERROR_SUFFIX_VISIUM_AND_IS_SINGLE_TRUE_REQUIRED}." 
in validator.errors[0] + ) + validator.reset() - @pytest.mark.parametrize("assay_ontology_term_id", ["EFO:0010961", "EFO:0030062"]) + @pytest.mark.parametrize("assay_ontology_term_id", ["EFO:0010961", "EFO:0030062", "EFO:0022860"]) def test__validate_tissue_position_not_required(self, assay_ontology_term_id): validator: Validator = Validator() validator._set_schema_def() validator.adata = adata_slide_seqv2.copy() validator.adata.obs["assay_ontology_term_id"] = assay_ontology_term_id - validator.adata.uns["spatial"]["is_single"] = False + validator.adata.uns["spatial"]["is_single"] = False # setting to false removes the requirement validator.adata.obs["is_primary_data"] = False - validator._check_spatial_obs() assert not validator.errors @@ -1041,43 +1053,52 @@ def test__validate_tissue_position_int_error(self, tissue_position_name): assert validator.errors assert f"obs['{tissue_position_name}'] must be of int type" in validator.errors[0] - @pytest.mark.parametrize( - "tissue_position_name, min, error_message_token", - [ - ("array_col", 0, "between 0 and 127"), - ("array_row", 0, "between 0 and 77"), - ("in_tissue", 0, "0 or 1"), - ], - ) - def test__validate_tissue_position_int_min_error(self, tissue_position_name, min, error_message_token): + @pytest.mark.parametrize("assay_ontology_term_id", ["EFO:0010961", "EFO:0022860", "EFO:0022859"]) + @pytest.mark.parametrize("tissue_position_name, min", [("array_col", 0), ("array_row", 0), ("in_tissue", 0)]) + def test__validate_tissue_position_int_min_error(self, assay_ontology_term_id, tissue_position_name, min): validator: Validator = Validator() validator._set_schema_def() validator.adata = adata_visium.copy() + validator.adata.obs["assay_ontology_term_id"] = assay_ontology_term_id validator.adata.obs[tissue_position_name] = min - 1 # Confirm tissue_position is identified as invalid. 
validator._check_spatial_obs() - assert validator.errors - assert f"obs['{tissue_position_name}'] must be {error_message_token}" in validator.errors[0] + assert ( + re.match(f"^obs\['{tissue_position_name}'\] must be (between )?{min} (and|or) [0-9]+", validator.errors[0]) + is not None + ) @pytest.mark.parametrize( - "tissue_position_name, max, error_message_token", + "assay_ontology_term_id, tissue_position_name, tissue_position_max", [ - ("array_col", 127, "between 0 and 127"), - ("array_row", 77, "between 0 and 77"), - ("in_tissue", 1, "0 or 1"), + ("EFO:0010961", "array_col", 127), + ("EFO:0010961", "array_row", 77), + ("EFO:0022860", "array_col", 223), + ("EFO:0022860", "array_row", 127), + ("EFO:0022859", "array_col", 127), + ("EFO:0022859", "array_row", 77), + ("EFO:0022859", "in_tissue", 1), ], ) - def test__validate_tissue_position_int_max_error(self, tissue_position_name, max, error_message_token): + def test__validate_tissue_position_int_max_error( + self, assay_ontology_term_id, tissue_position_name, tissue_position_max + ): validator: Validator = Validator() validator._set_schema_def() validator.adata = adata_visium.copy() - validator.adata.obs[tissue_position_name] = max + 1 + validator.adata.obs["assay_ontology_term_id"] = assay_ontology_term_id + validator.adata.obs[tissue_position_name] = tissue_position_max + 1 # Confirm tissue_position is identified as invalid. 
validator._check_spatial_obs() - assert validator.errors - assert f"obs['{tissue_position_name}'] must be {error_message_token}" in validator.errors[0] + assert ( + re.match( + f"^obs\['{tissue_position_name}'\] must be (between )?[0-9]+ (and|or) {tissue_position_max}", + validator.errors[0], + ) + is not None + ) @pytest.mark.parametrize( "cell_type_ontology_term_id, in_tissue, assay_ontology_term_id", From a024c7a471f68c0fe055b2e4e19f4bcd260ad4e0 Mon Sep 17 00:00:00 2001 From: Evan Molinelli Date: Fri, 6 Dec 2024 09:53:37 -0500 Subject: [PATCH 2/2] separate out tissue size constants into standalone variables --- cellxgene_schema_cli/cellxgene_schema/validate.py | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/cellxgene_schema_cli/cellxgene_schema/validate.py b/cellxgene_schema_cli/cellxgene_schema/validate.py index 182609b1..5510536e 100644 --- a/cellxgene_schema_cli/cellxgene_schema/validate.py +++ b/cellxgene_schema_cli/cellxgene_schema/validate.py @@ -29,8 +29,10 @@ VISIUM_AND_IS_SINGLE_TRUE_MATRIX_SIZE = 4992 VISIUM_11MM_AND_IS_SINGLE_TRUE_MATRIX_SIZE = 14336 -VISIUM_TISSUE_POSITION_MAX = (77, 127) # (row,col) -VISIUM_11MM_TISSUE_POSITION_MAX = (127, 223) # (row,col) +VISIUM_TISSUE_POSITION_MAX_ROW = 77 +VISIUM_TISSUE_POSITION_MAX_COL = 127 +VISIUM_11MM_TISSUE_POSITION_MAX_ROW = 127 +VISIUM_11MM_TISSUE_POSITION_MAX_COL = 223 SPATIAL_HIRES_IMAGE_MAX_DIMENSION_SIZE = 2000 SPATIAL_HIRES_IMAGE_MAX_DIMENSION_SIZE_VISIUM_11MM = 4000 @@ -135,9 +137,12 @@ def tissue_position_maxes(self) -> Tuple[int, int]: .astype(bool) .any() ): - self._visium_tissue_position_max = VISIUM_11MM_TISSUE_POSITION_MAX + self._visium_tissue_position_max = ( + VISIUM_11MM_TISSUE_POSITION_MAX_ROW, + VISIUM_11MM_TISSUE_POSITION_MAX_COL, + ) else: - self._visium_tissue_position_max = VISIUM_TISSUE_POSITION_MAX + self._visium_tissue_position_max = (VISIUM_TISSUE_POSITION_MAX_ROW, VISIUM_TISSUE_POSITION_MAX_COL) return self._visium_tissue_position_max def
_is_single(self) -> bool | None: