From fc210dfc812f9631d95bc337f28a8f2fa751e2bf Mon Sep 17 00:00:00 2001 From: Bento007 Date: Thu, 14 Nov 2024 13:35:49 -0800 Subject: [PATCH 01/28] =?UTF-8?q?Bump=20version:=205.2.0=20=E2=86=92=205.2?= =?UTF-8?q?.1-rc.0?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .bumpversion.cfg | 2 +- cellxgene_schema_cli/cellxgene_schema/__init__.py | 2 +- cellxgene_schema_cli/setup.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/.bumpversion.cfg b/.bumpversion.cfg index 5485ee1c..3bd4ca9e 100644 --- a/.bumpversion.cfg +++ b/.bumpversion.cfg @@ -1,5 +1,5 @@ [bumpversion] -current_version = 5.2.0 +current_version = 5.2.1-rc.0 commit = True parse = (?P\d+)\.(?P\d+)\.(?P\d+)(?:-(?Prc)\.(?P\d+))? serialize = diff --git a/cellxgene_schema_cli/cellxgene_schema/__init__.py b/cellxgene_schema_cli/cellxgene_schema/__init__.py index 6c235c59..f8407c1e 100644 --- a/cellxgene_schema_cli/cellxgene_schema/__init__.py +++ b/cellxgene_schema_cli/cellxgene_schema/__init__.py @@ -1 +1 @@ -__version__ = "5.2.0" +__version__ = "5.2.1-rc.0" diff --git a/cellxgene_schema_cli/setup.py b/cellxgene_schema_cli/setup.py index 2053acd6..9a62ca6e 100644 --- a/cellxgene_schema_cli/setup.py +++ b/cellxgene_schema_cli/setup.py @@ -5,7 +5,7 @@ setup( name="cellxgene-schema", - version="5.2.0", + version="5.2.1-rc.0", url="https://github.com/chanzuckerberg/single-cell-curation", license="MIT", author="Chan Zuckerberg Initiative", From d03765b1c0fdcf3c0b654a5699841bf37063d36e Mon Sep 17 00:00:00 2001 From: Bento007 Date: Thu, 14 Nov 2024 13:37:12 -0800 Subject: [PATCH 02/28] =?UTF-8?q?Bump=20version:=205.2.1-rc.0=20=E2=86=92?= =?UTF-8?q?=205.2.1?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .bumpversion.cfg | 2 +- cellxgene_schema_cli/cellxgene_schema/__init__.py | 2 +- cellxgene_schema_cli/setup.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/.bumpversion.cfg 
b/.bumpversion.cfg index 3bd4ca9e..13a12d63 100644 --- a/.bumpversion.cfg +++ b/.bumpversion.cfg @@ -1,5 +1,5 @@ [bumpversion] -current_version = 5.2.1-rc.0 +current_version = 5.2.1 commit = True parse = (?P\d+)\.(?P\d+)\.(?P\d+)(?:-(?Prc)\.(?P\d+))? serialize = diff --git a/cellxgene_schema_cli/cellxgene_schema/__init__.py b/cellxgene_schema_cli/cellxgene_schema/__init__.py index f8407c1e..98886d26 100644 --- a/cellxgene_schema_cli/cellxgene_schema/__init__.py +++ b/cellxgene_schema_cli/cellxgene_schema/__init__.py @@ -1 +1 @@ -__version__ = "5.2.1-rc.0" +__version__ = "5.2.1" diff --git a/cellxgene_schema_cli/setup.py b/cellxgene_schema_cli/setup.py index 9a62ca6e..ce67b058 100644 --- a/cellxgene_schema_cli/setup.py +++ b/cellxgene_schema_cli/setup.py @@ -5,7 +5,7 @@ setup( name="cellxgene-schema", - version="5.2.1-rc.0", + version="5.2.1", url="https://github.com/chanzuckerberg/single-cell-curation", license="MIT", author="Chan Zuckerberg Initiative", From b256322b342b267e1288cfc3a68fd69b04bd25e0 Mon Sep 17 00:00:00 2001 From: Brian Raymor Date: Fri, 15 Nov 2024 16:24:16 -0800 Subject: [PATCH 03/28] removed strongly recommended from development stage (#1110) --- schema/drafts/5.2.1-experimental.md | 2 +- schema/drafts/5.3.0.md | 54 +++-------------------------- 2 files changed, 6 insertions(+), 50 deletions(-) diff --git a/schema/drafts/5.2.1-experimental.md b/schema/drafts/5.2.1-experimental.md index 8a0d490b..4275dfe6 100644 --- a/schema/drafts/5.2.1-experimental.md +++ b/schema/drafts/5.2.1-experimental.md @@ -190,7 +190,7 @@ The following gene annotation dependencies are *pinned* for this version of the "NCBITaxon:7227"
for Drosophila melanogaster - MUST be ither the most accurate descendant of FBbt:00007002 for cell
or "unknown" when: + MUST be either the most accurate descendant of FBbt:00007002 for cell
or "unknown" when:
  • no appropriate term can be found (e.g. the cell type is unknown) diff --git a/schema/drafts/5.3.0.md b/schema/drafts/5.3.0.md index c6aa47b3..cbbdb46a 100644 --- a/schema/drafts/5.3.0.md +++ b/schema/drafts/5.3.0.md @@ -503,55 +503,11 @@ Curators MUST annotate the following columns in the `obs` dataframe: Value - categorical with str categories. If unavailable, this MUST be "unknown". -

    If organism_ontolology_term_id is "NCBITaxon:9606" for Homo sapiens, this MUST be the most accurate descendant of HsapDv:0000001 for life cycle with the following STRONGLY RECOMMENDED: -

    - - - - - - - - - - - - - - - - - - - - - - - - -
    ForUse
    Embryonic stageA term from the set of Carnegie stages 1-23
    (up to 8 weeks after conception; e.g. HsapDv:0000003)
    Fetal developmentA term from the set of 9 to 38 week post-fertilization human stages
    (9 weeks after conception and before birth; e.g. HsapDv:0000046)
    After birth for the
    first 12 months
    A term from the set of 1 to 12 month-old human stages
    (e.g. HsapDv:0000273)
    After the first 12
    months post-birth
    A term from the set of year-old human stages
    (e.g. HsapDv:0000246)
    -
    If organism_ontolology_term_id is - "NCBITaxon:10090" for Mus musculus, this MUST be the accurate descendant of MmusDv:0000001 for life cycle with the following STRONGLY RECOMMENDED: -

    - - - - - - - - - - - - - - - - -
    ForUse
    From the time of conception
    to 1 month after birth
    A term from the set of Theiler stages
    (e.g. MmusDv:0000003)
    From 2 months after birthA term from the set of month-old stages
    (e.g. MmusDv:0000062)
    -
    Otherwise, for all other organisms this MUST be the most accurate descendant of UBERON:0000105 for life cycle stage, excluding UBERON:0000071 for death stage. + categorical with str categories. If unavailable, this MUST be "unknown".

    + If organism_ontology_term_id is "NCBITaxon:9606" for Homo sapiens, this MUST be the most accurate descendant of HsapDv:0000001 for life cycle.

    + If organism_ontology_term_id is + "NCBITaxon:10090" for Mus musculus, this MUST be the most accurate descendant of MmusDv:0000001 for life cycle.

    + Otherwise, for all other organisms this MUST be the most accurate descendant of UBERON:0000105 for life cycle stage, excluding UBERON:0000071 for death stage. From 30498f06c53950baf5ef0b65f4f95d2977a6f37d Mon Sep 17 00:00:00 2001 From: Evan Molinelli Date: Mon, 18 Nov 2024 16:12:45 -0500 Subject: [PATCH 04/28] chore: remove seurat references in validator (#1113) Co-authored-by: Evan Molinelli --- .../schema_definitions/schema_definition.yaml | 2 - .../cellxgene_schema/validate.py | 79 ++----------------- cellxgene_schema_cli/tests/test_validate.py | 64 +-------------- 3 files changed, 9 insertions(+), 136 deletions(-) diff --git a/cellxgene_schema_cli/cellxgene_schema/schema_definitions/schema_definition.yaml b/cellxgene_schema_cli/cellxgene_schema/schema_definitions/schema_definition.yaml index bca3da68..28a3fad5 100644 --- a/cellxgene_schema_cli/cellxgene_schema/schema_definitions/schema_definition.yaml +++ b/cellxgene_schema_cli/cellxgene_schema/schema_definitions/schema_definition.yaml @@ -2,8 +2,6 @@ title: Corpora schema version 5.X.X type: anndata # If sparsity of any expression matrix is greater than this and not csr sparse matrix, then there will be warning. 
sparsity: 0.5 -# If the R array will exceed this number in size, then Seurat conversion will fail -max_size_for_seurat: 2147483647 # 2^31 - 1 (max value for 4-byte signed int) # Perform the checks for "raw" requirements IF: raw: obs: diff --git a/cellxgene_schema_cli/cellxgene_schema/validate.py b/cellxgene_schema_cli/cellxgene_schema/validate.py index 4aa81adc..6869a9df 100644 --- a/cellxgene_schema_cli/cellxgene_schema/validate.py +++ b/cellxgene_schema_cli/cellxgene_schema/validate.py @@ -54,7 +54,6 @@ def reset(self): self.is_valid = False self.h5ad_path = "" self._raw_layer_exists = None - self.is_seurat_convertible: bool = True self.is_spatial = None self.is_visium = None self.is_visium_and_is_single_true = None @@ -926,65 +925,6 @@ def _validate_sparsity(self): f"to use this type of matrix for the given sparsity." ) - def _validate_seurat_convertibility(self): - """ - Use length of component matrices to determine if the anndata object will be unable to be converted to Seurat by - virtue of the R language's array size limit (4-byte signed int length). Add warning for each matrix which is - too large. - rtype: None - """ - # Seurat conversion is not supported for Visium datasets. - if self._is_visium(): - self.warnings.append( - "Datasets with assay_ontology_term_id 'EFO:0010961' (Visium Spatial Gene Expression) are not compatible with Seurat." 
- ) - self.is_seurat_convertible = False - return - - to_validate = [(self.adata.X, "X")] - # check if there's raw data - if self.adata.raw: - to_validate.append((self.adata.raw.X, "raw.X")) - # Check length of component arrays - for matrix, matrix_name in to_validate: - matrix_format = get_matrix_format(self.adata, matrix) - if matrix_format in SPARSE_MATRIX_TYPES: - effective_r_array_size = self._count_matrix_nonzero(matrix_name, matrix) - is_sparse = True - elif matrix_format == "dense": - effective_r_array_size = max(matrix.shape) - is_sparse = False - else: - self.warnings.append( - f"Unable to verify seurat convertibility for matrix {matrix_name} " f"of type {type(matrix)}" - ) - continue - - if effective_r_array_size > self.schema_def["max_size_for_seurat"]: - if is_sparse: - self.warnings.append( - f"This dataset cannot be converted to the .rds (Seurat v4) format. " - f"{effective_r_array_size} nonzero elements in matrix {matrix_name} exceed the " - f"limitations in the R dgCMatrix sparse matrix class (2^31 - 1 nonzero " - f"elements)." - ) - else: - self.warnings.append( - f"This dataset cannot be converted to the .rds (Seurat v4) format. " - f"{effective_r_array_size} elements in at least one dimension of matrix " - f"{matrix_name} exceed the limitations in the R dgCMatrix sparse matrix class " - f"(2^31 - 1 nonzero elements)." - ) - - self.is_seurat_convertible = False - - if self.adata.raw and self.adata.raw.X.shape[1] != self.adata.raw.var.shape[0]: - self.errors.append( - "This dataset has a mismatch between 1) the number of features in raw.X and 2) the number of features " - "in raw.var. These counts must be identical." 
- ) - self.is_seurat_convertible = False - def _validate_obsm(self): """ Validates the embedding dictionary -- it checks that all values of adata.obsm are numpy arrays with the correct @@ -1887,10 +1827,6 @@ def _deep_check(self): # Checks spatial self._check_spatial() - # Checks Seurat convertibility - logger.debug("Validating Seurat convertibility...") - self._validate_seurat_convertibility() - # Checks each component for component_name, component_def in self.schema_def["components"].items(): logger.debug(f"Validating component: {component_name}") @@ -1976,7 +1912,7 @@ def validate( add_labels_file: str = None, ignore_labels: bool = False, verbose: bool = False, -) -> (bool, list, bool): +) -> (bool, list): from .write_labels import AnnDataLabelAppender """ @@ -1985,8 +1921,7 @@ def validate( :param Union[str, bytes, os.PathLike] h5ad_path: Path to h5ad file to validate :param str add_labels_file: Path to new h5ad file with ontology/gene labels added - :return (True, [], ) if successful validation, (False, [list_of_errors], ) otherwise; last bool is for - seurat convertibility + :return (True, []) if successful validation, (False, [list_of_errors]) otherwise :rtype tuple """ @@ -2004,7 +1939,7 @@ def validate( # Stop if validation was unsuccessful if not validator.is_valid: - return False, validator.errors, validator.is_seurat_convertible + return False, validator.errors if add_labels_file: label_start = datetime.now() @@ -2015,10 +1950,6 @@ def validate( f"{writer.was_writing_successful}" ) - return ( - validator.is_valid and writer.was_writing_successful, - validator.errors + writer.errors, - validator.is_seurat_convertible, - ) + return (validator.is_valid and writer.was_writing_successful, validator.errors + writer.errors) - return True, validator.errors, validator.is_seurat_convertible + return True, validator.errors diff --git a/cellxgene_schema_cli/tests/test_validate.py b/cellxgene_schema_cli/tests/test_validate.py index cead4b33..819cde43 100644 --- 
a/cellxgene_schema_cli/tests/test_validate.py +++ b/cellxgene_schema_cli/tests/test_validate.py @@ -27,10 +27,8 @@ adata_visium, adata_with_labels, good_obs, - good_obsm, good_uns, good_uns_with_visium_spatial, - good_var, h5ad_invalid, h5ad_valid, visium_library_id, @@ -297,7 +295,7 @@ def test__validate_with_h5ad_valid_and_labels(self): with tempfile.TemporaryDirectory() as temp_dir: labels_path = "/".join([temp_dir, "labels.h5ad"]) - success, errors, is_seurat_convertible = validate(h5ad_valid, labels_path) + success, errors = validate(h5ad_valid, labels_path) import anndata as ad @@ -306,36 +304,32 @@ def test__validate_with_h5ad_valid_and_labels(self): assert adata.raw.X.has_canonical_format assert success assert not errors - assert is_seurat_convertible assert os.path.exists(labels_path) expected_hash = "55fbc095218a01cad33390f534d6690af0ecd6593f27d7cd4d26e91072ea8835" original_hash = self.hash_file(h5ad_valid) assert original_hash != expected_hash, "Writing labels did not change the dataset from the original." 
def test__validate_with_h5ad_valid_and_without_labels(self): - success, errors, is_seurat_convertible = validate(h5ad_valid) + success, errors = validate(h5ad_valid) assert success assert not errors - assert is_seurat_convertible def test__validate_with_h5ad_invalid_and_with_labels(self): with tempfile.TemporaryDirectory() as temp_dir: labels_path = "/".join([temp_dir, "labels.h5ad"]) - success, errors, is_seurat_convertible = validate(h5ad_invalid, labels_path) + success, errors = validate(h5ad_invalid, labels_path) assert not success assert errors - assert is_seurat_convertible assert not os.path.exists(labels_path) def test__validate_with_h5ad_invalid_and_without_labels(self): - success, errors, is_seurat_convertible = validate(h5ad_invalid) + success, errors = validate(h5ad_invalid) assert not success assert errors - assert is_seurat_convertible class TestCheckSpatial: @@ -1001,56 +995,6 @@ def test__validate_cell_type_ontology_term_id_error(self, cell_type_ontology_ter ) -class TestSeuratConvertibility: - def validation_helper(self, matrix, raw=None): - data = anndata.AnnData(X=matrix, obs=good_obs, uns=good_uns, obsm=good_obsm, var=good_var) - if raw: - data.raw = raw - self.validator: Validator = Validator() - self.validator._set_schema_def() - self.validator.schema_def["max_size_for_seurat"] = 2**3 - 1 # Reduce size required to fail (faster tests) - self.validator.adata = data - - def test_determine_seurat_convertibility(self): - # Sparse matrix with too many nonzero values is not Seurat-convertible - sparse_matrix_too_large = sparse.csr_matrix(np.ones((good_obs.shape[0], good_var.shape[0]), dtype=np.float32)) - self.validation_helper(sparse_matrix_too_large) - self.validator._validate_seurat_convertibility() - assert len(self.validator.warnings) == 1 - assert not self.validator.is_seurat_convertible - - # Reducing nonzero count by 1, to within limit, makes it Seurat-convertible - sparse_matrix_with_zero = sparse.csr_matrix(np.ones((good_obs.shape[0], 
good_var.shape[0]), dtype=np.float32)) - sparse_matrix_with_zero[0, 0] = 0 - self.validation_helper(sparse_matrix_with_zero) - self.validator._validate_seurat_convertibility() - assert len(self.validator.warnings) == 0 - assert self.validator.is_seurat_convertible - - # Dense matrices with a dimension that exceeds limit will fail -- zeros are irrelevant - dense_matrix_with_zero = np.zeros((good_obs.shape[0], good_var.shape[0]), dtype=np.float32) - self.validation_helper(dense_matrix_with_zero) - self.validator.schema_def["max_size_for_seurat"] = 2**2 - 1 - self.validator._validate_seurat_convertibility() - assert len(self.validator.warnings) == 1 - assert not self.validator.is_seurat_convertible - - # Dense matrices with dimensions in bounds but total count over will succeed - dense_matrix = np.ones((good_obs.shape[0], good_var.shape[0]), dtype=np.float32) - self.validation_helper(dense_matrix) - self.validator.schema_def["max_size_for_seurat"] = 2**3 - 1 - self.validator._validate_seurat_convertibility() - assert len(self.validator.warnings) == 0 - assert self.validator.is_seurat_convertible - - # Visium datasets are not Seurat-convertible - self.validation_helper(sparse_matrix_with_zero) - self.validator.adata.obs = adata_visium.obs.copy() - self.validator._validate_seurat_convertibility() - assert len(self.validator.warnings) == 1 - assert not self.validator.is_seurat_convertible - - class TestValidatorValidateDataFrame: @pytest.mark.parametrize("_type", [np.int64, np.int32, int, np.float64, np.float32, float, str]) def test_succeed_categorical_types(self, tmp_path, _type, valid_adata): From dd97d55250a2bad3b57e179dccbfa79d3cc51651 Mon Sep 17 00:00:00 2001 From: Joyce Yan <5653616+joyceyan@users.noreply.github.com> Date: Mon, 18 Nov 2024 16:26:44 -0800 Subject: [PATCH 05/28] chore: bump cog version for schema 5.3.0 (#1116) --- cellxgene_schema_cli/requirements.txt | 2 +- scripts/schema_bump_dry_run_ontologies/requirements.txt | 2 +- 2 files changed, 2 
insertions(+), 2 deletions(-) diff --git a/cellxgene_schema_cli/requirements.txt b/cellxgene_schema_cli/requirements.txt index c2bbdc73..d8b2bcdd 100644 --- a/cellxgene_schema_cli/requirements.txt +++ b/cellxgene_schema_cli/requirements.txt @@ -1,5 +1,5 @@ anndata>=0.8,<0.11 -cellxgene-ontology-guide==1.2.0 # update before a schema migration +cellxgene-ontology-guide==1.3.0 # update before a schema migration click<9 Cython<4 numpy<2 diff --git a/scripts/schema_bump_dry_run_ontologies/requirements.txt b/scripts/schema_bump_dry_run_ontologies/requirements.txt index cb7c918d..373412b0 100644 --- a/scripts/schema_bump_dry_run_ontologies/requirements.txt +++ b/scripts/schema_bump_dry_run_ontologies/requirements.txt @@ -1,2 +1,2 @@ requests<3 -cellxgene-ontology-guide==1.2.0 +cellxgene-ontology-guide==1.3.0 From 48cb1754fe01f38cf8c79cbc979d877a1998f3d0 Mon Sep 17 00:00:00 2001 From: Brian Raymor Date: Tue, 19 Nov 2024 11:07:18 -0800 Subject: [PATCH 06/28] Added genetic ancestry (#1117) --- schema/drafts/5.3.0.md | 177 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 177 insertions(+) diff --git a/schema/drafts/5.3.0.md b/schema/drafts/5.3.0.md index cbbdb46a..a65f74a8 100644 --- a/schema/drafts/5.3.0.md +++ b/schema/drafts/5.3.0.md @@ -559,6 +559,177 @@ Curators MUST annotate the following columns in the `obs` dataframe:
    +If organism_ontology_term_id is "NCBITaxon:9606" for Homo sapiens, then for each observation for the following fields, either all their values MUST be float("nan") or the sum of their values MUST be 1.0: + +* genetic_ancestry_African +* genetic_ancestry_East_Asian +* genetic_ancestry_European +* genetic_ancestry_Indigenous_American +* genetic_ancestry_Oceanian +* genetic_ancestry_South_Asian + +### genetic_ancestry_African + + + + + + + + + + + + + + + + +
    Keygenetic_ancestry_African
    AnnotatorCurator MUST annotate.
    Value + str or float. All observations with the same donor_id MUST contain the same value.

    + If organism_ontology_term_id is NOT + "NCBITaxon:9606" for Homo sapiens, then the + value MUST be "na".

    If + organism_ontology_term_id is + "NCBITaxon:9606" for Homo sapiens, then the value MUST be a float("nan") if unavailable; otherwise, the value MUST be the genetic ancestry percentage of "HANCESTRO:0010" for African expressed as a float greater than or equal to 0.0 and less than or equal to 1.0. +
    +
    + +### genetic_ancestry_East_Asian + + + + + + + + + + + + + + + + +
    Keygenetic_ancestry_East_Asian
    AnnotatorCurator MUST annotate.
    Value + str or float. All observations with the same donor_id MUST contain the same value.

    + If organism_ontology_term_id is NOT + "NCBITaxon:9606" for Homo sapiens, then the + value MUST be "na".

    If + organism_ontology_term_id is + "NCBITaxon:9606" for Homo sapiens, then the value MUST be a float("nan") if unavailable; otherwise, the value MUST be the genetic ancestry percentage of "HANCESTRO:0009" for East Asian expressed as a float greater than or equal to 0.0 and less than or equal to 1.0. +
    +
    + +### genetic_ancestry_European + + + + + + + + + + + + + + + + +
    Keygenetic_ancestry_European
    AnnotatorCurator MUST annotate.
    Value + str or float. All observations with the same donor_id MUST contain the same value.

    + If organism_ontology_term_id is NOT + "NCBITaxon:9606" for Homo sapiens, then the + value MUST be "na".

    If + organism_ontology_term_id is + "NCBITaxon:9606" for Homo sapiens, then the value MUST be a float("nan") if unavailable; otherwise, the value MUST be the genetic ancestry percentage of "HANCESTRO:0005" for European expressed as a float greater than or equal to 0.0 and less than or equal to 1.0. +
    +
    + +### genetic_ancestry_Indigenous_American + + + + + + + + + + + + + + + + +
    Keygenetic_ancestry_Indigenous_American
    AnnotatorCurator MUST annotate.
    Value + str or float. All observations with the same donor_id MUST contain the same value.

    + If organism_ontology_term_id is NOT + "NCBITaxon:9606" for Homo sapiens, then the + value MUST be "na".

    If + organism_ontology_term_id is + "NCBITaxon:9606" for Homo sapiens, then the value MUST be a float("nan") if unavailable; otherwise, the value MUST be the genetic ancestry percentage of "HANCESTRO:0013" for Indigenous American expressed as a float greater than or equal to 0.0 and less than or equal to 1.0. +
    +
    + +### genetic_ancestry_Oceanian + + + + + + + + + + + + + + + + +
    Keygenetic_ancestry_Oceanian
    AnnotatorCurator MUST annotate.
    Value + str or float. All observations with the same donor_id MUST contain the same value.

    + If organism_ontology_term_id is NOT + "NCBITaxon:9606" for Homo sapiens, then the + value MUST be "na".

    If + organism_ontology_term_id is + "NCBITaxon:9606" for Homo sapiens, then the value MUST be a float("nan") if unavailable; otherwise, the value MUST be the genetic ancestry percentage of "HANCESTRO:0017" for Oceanian expressed as a float greater than or equal to 0.0 and less than or equal to 1.0. +
    +
    + +### genetic_ancestry_South_Asian + + + + + + + + + + + + + + + + +
    Keygenetic_ancestry_South_Asian
    AnnotatorCurator MUST annotate.
    Value + str or float. All observations with the same donor_id MUST contain the same value.

    + If organism_ontology_term_id is NOT + "NCBITaxon:9606" for Homo sapiens, then the + value MUST be "na".

    If + organism_ontology_term_id is + "NCBITaxon:9606" for Homo sapiens, then the value MUST be a float("nan") if unavailable; otherwise, the value MUST be the genetic ancestry percentage of "HANCESTRO:0006" for South Asian expressed as a float greater than or equal to 0.0 and less than or equal to 1.0. +
    +
    + ### in_tissue @@ -1904,6 +2075,12 @@ When a dataset is uploaded, CELLxGENE Discover MUST automatically add the `schem * Added ranges for _Visium CytAssist Spatial Gene Expression, 6.5mm_ and _Visium CytAssist Spatial Gene Expression, 11mm_ * Updated the requirements for `assay_ontology_term_id` to include descendants of _Visium Spatial Gene Expression_. All observations must contain the same value. Also updated recommended values for assays. * Updated the requirements for `cell_type_ontology_term_id` to include descendants of _Visium Spatial Gene Expression_. + * Added genetic_ancestry_African + * Added genetic_ancestry_East_Asian + * Added genetic_ancestry_European + * Added genetic_ancestry_Indigenous_American + * Added genetic_ancestry_Oceanian + * Added genetic_ancestry_South_Asian * Updated the requirements for `in_tissue` to include descendants of _Visium Spatial Gene Expression_. * obsm (Embeddings) * Updated the requirements for `spatial` to include descendants of _Visium Spatial Gene Expression_ and to prohibit 'Not a Number' values. From bb662b5a4227722e23725a3b849afbc6007261ea Mon Sep 17 00:00:00 2001 From: Trent Smith <1429913+Bento007@users.noreply.github.com> Date: Tue, 19 Nov 2024 12:03:13 -0800 Subject: [PATCH 07/28] release: 5.2.2 (#1118) --- .bumpversion.cfg | 2 +- cellxgene_schema_cli/cellxgene_schema/__init__.py | 2 +- cellxgene_schema_cli/setup.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/.bumpversion.cfg b/.bumpversion.cfg index 13a12d63..87d0d865 100644 --- a/.bumpversion.cfg +++ b/.bumpversion.cfg @@ -1,5 +1,5 @@ [bumpversion] -current_version = 5.2.1 +current_version = 5.2.2 commit = True parse = (?P\d+)\.(?P\d+)\.(?P\d+)(?:-(?Prc)\.(?P\d+))? 
serialize = diff --git a/cellxgene_schema_cli/cellxgene_schema/__init__.py b/cellxgene_schema_cli/cellxgene_schema/__init__.py index 98886d26..15cf1350 100644 --- a/cellxgene_schema_cli/cellxgene_schema/__init__.py +++ b/cellxgene_schema_cli/cellxgene_schema/__init__.py @@ -1 +1 @@ -__version__ = "5.2.1" +__version__ = "5.2.2" diff --git a/cellxgene_schema_cli/setup.py b/cellxgene_schema_cli/setup.py index ce67b058..db4ff43c 100644 --- a/cellxgene_schema_cli/setup.py +++ b/cellxgene_schema_cli/setup.py @@ -5,7 +5,7 @@ setup( name="cellxgene-schema", - version="5.2.1", + version="5.2.2", url="https://github.com/chanzuckerberg/single-cell-curation", license="MIT", author="Chan Zuckerberg Initiative", From 34e76db0419d9c6961e1ba767f3cdbd8aa67102a Mon Sep 17 00:00:00 2001 From: Joyce Yan <5653616+joyceyan@users.noreply.github.com> Date: Tue, 19 Nov 2024 15:47:14 -0800 Subject: [PATCH 08/28] feat: update "is visium" definition for cell_type_ontology_term_id (#1115) --- .../cellxgene_schema/validate.py | 39 ++++++++++-- cellxgene_schema_cli/tests/test_validate.py | 60 ++++++++++++++++--- 2 files changed, 88 insertions(+), 11 deletions(-) diff --git a/cellxgene_schema_cli/cellxgene_schema/validate.py b/cellxgene_schema_cli/cellxgene_schema/validate.py index 6869a9df..98ce9e7d 100644 --- a/cellxgene_schema_cli/cellxgene_schema/validate.py +++ b/cellxgene_schema_cli/cellxgene_schema/validate.py @@ -21,7 +21,7 @@ logger = logging.getLogger(__name__) -ONTOLOGY_PARSER = OntologyParser(schema_version=f"v{schema.get_current_schema_version()}") +ONTOLOGY_PARSER = OntologyParser(schema_version="v5.3.0") ASSAY_VISIUM = "EFO:0010961" ASSAY_SLIDE_SEQV2 = "EFO:0030062" @@ -29,7 +29,7 @@ VISIUM_AND_IS_SINGLE_TRUE_MATRIX_SIZE = 4992 SPATIAL_HIRES_IMAGE_MAX_DIMENSION_SIZE = 2000 -ERROR_SUFFIX_VISIUM_AND_IS_SINGLE_TRUE = "obs['assay_ontology_term_id'] 'EFO:0010961' (Visium Spatial Gene Expression) and uns['spatial']['is_single'] is True" +ERROR_SUFFIX_VISIUM_AND_IS_SINGLE_TRUE = 
"descendants of obs['assay_ontology_term_id'] 'EFO:0010961' (Visium Spatial Gene Expression) and uns['spatial']['is_single'] is True" ERROR_SUFFIX_VISIUM_AND_IS_SINGLE_TRUE_FORBIDDEN = f"is only allowed for {ERROR_SUFFIX_VISIUM_AND_IS_SINGLE_TRUE}" ERROR_SUFFIX_VISIUM_AND_IS_SINGLE_TRUE_REQUIRED = f"is required for {ERROR_SUFFIX_VISIUM_AND_IS_SINGLE_TRUE}" ERROR_SUFFIX_VISIUM_AND_IS_SINGLE_TRUE_IN_TISSUE_0 = f"{ERROR_SUFFIX_VISIUM_AND_IS_SINGLE_TRUE} and in_tissue is 0" @@ -1475,12 +1475,12 @@ def _validate_spatial_cell_type_ontology_term_id(self): # Exit if: # - not Visium and is_single is True as no further checks are necessary # - in_tissue is not specified as checks are dependent on this value - if not self._is_visium_and_is_single_true() or "in_tissue" not in self.adata.obs: + if not self._is_visium_including_descendants() and self._is_single() or "in_tissue" not in self.adata.obs: return # Validate cell type: must be "unknown" if Visium and is_single is True and in_tissue is 0. if ( - (self.adata.obs["assay_ontology_term_id"] == ASSAY_VISIUM) + self._is_visium_including_descendants() & (self.adata.obs["in_tissue"] == 0) & (self.adata.obs["cell_type_ontology_term_id"] != "unknown") ).any(): @@ -1760,6 +1760,37 @@ def _is_visium(self) -> bool: self.is_visium = assay_ontology_term_id is not None and (assay_ontology_term_id == ASSAY_VISIUM).any() return self.is_visium + def _is_visium_including_descendants(self) -> bool: + """ + Determine if the assay_ontology_term_id is Visium (descendant of EFO:0010961). + + :return True if assay_ontology_term_id is Visium, False otherwise. 
+ :rtype bool + """ + if self.is_visium is None: + assay_ontology_term_id = self.adata.obs.get("assay_ontology_term_id") + + if assay_ontology_term_id is not None: + # Convert to a regular Series if it's Categorical + assay_ontology_term_id = pd.Series(assay_ontology_term_id) + + # Check if any term is a descendant of ASSAY_VISIUM + try: + visium_results = assay_ontology_term_id.apply( + lambda term: ASSAY_VISIUM + in list(ONTOLOGY_PARSER.get_lowest_common_ancestors(ASSAY_VISIUM, term)) + ) + self.is_visium = visium_results.astype(bool).any() + except KeyError as e: + # This generally means the assay_ontology_term_id is invalid, but we want the error to be raised + # by our explicit validator checks, not this implicit one. + logger.warning(f"KeyError processing assay_ontology_term_id ontology: {e}") + self.is_visium = False + else: + self.is_visium = False + + return self.is_visium + def _validate_spatial_image_shape(self, image_name: str, image: np.ndarray, max_dimension: int = None): """ Validate the spatial image is of shape (,,3 or 4) and has a max dimension, if specified. A spatial image diff --git a/cellxgene_schema_cli/tests/test_validate.py b/cellxgene_schema_cli/tests/test_validate.py index 819cde43..801bc7cc 100644 --- a/cellxgene_schema_cli/tests/test_validate.py +++ b/cellxgene_schema_cli/tests/test_validate.py @@ -333,6 +333,31 @@ def test__validate_with_h5ad_invalid_and_without_labels(self): class TestCheckSpatial: + @pytest.mark.parametrize( + "assay_ontology_term_id, expected_is_visium", + [ + # Parent term for Visium Spatial Gene Expression. 
This term and all its descendants are Visium + ("EFO:0010961", True), + # Visium Spatial Gene Expression V1 + ("EFO:0022857", True), + # Visium CytAssist Spatial Gene Expression V2 + ("EFO:0022858", True), + # Visium CytAssist Spatial Gene Expression, 11mm + ("EFO:0022860", True), + # Visium CytAssist Spatial Gene Expression, 6.5mm + ("EFO:0022859", True), + # Random other EFO term + ("EFO:0003740", False), + ], + ) + def test__is_visium_descendant(self, assay_ontology_term_id, expected_is_visium): + validator: Validator = Validator() + validator._set_schema_def() + validator.adata = adata_visium.copy() + validator.adata.obs["assay_ontology_term_id"] = assay_ontology_term_id + + assert validator._is_visium_including_descendants() == expected_is_visium + def test__validate_spatial_visium_ok(self): validator: Validator = Validator() validator._set_schema_def() @@ -958,33 +983,54 @@ def test__validate_tissue_position_int_max_error(self, tissue_position_name, max assert f"obs['{tissue_position_name}'] must be {error_message_token}" in validator.errors[0] @pytest.mark.parametrize( - "cell_type_ontology_term_id, in_tissue", - [("unknown", 0), (["unknown", "CL:0000066"], [0, 1]), ("CL:0000066", 1)], + "cell_type_ontology_term_id, in_tissue, assay_ontology_term_id", + [ + # MUST be unknown when in_tissue = 0 and assay_ontology_term_id = Visium Spatial Gene Expression + ("unknown", 0, "EFO:0010961"), + # MUST be unknown when in_tissue = 0 and assay_ontology_term_id = Visium CytAssist Spatial Gene Expression, 11mm + ("unknown", 0, "EFO:0022860"), + # MUST be unknown when in_tissue = 0 and assay_ontology_term_id = Visium Spatial Gene Expression V1 + # valid CL term is ok when in_tissue = 1 and assay_ontology_term_id = Visium CytAssist Spatial Gene Expression, 11mm + (["unknown", "CL:0000066"], [0, 1], ["EFO:0022857", "EFO:0022860"]), + # normal CL term for in_tissue = 1 and assay_ontology_term_id = 10x 3' v2 + ("CL:0000066", 1, "EFO:0009899"), + ], ) - def 
test__validate_cell_type_ontology_term_id_ok(self, cell_type_ontology_term_id, in_tissue): + def test__validate_cell_type_ontology_term_id_ok( + self, cell_type_ontology_term_id, in_tissue, assay_ontology_term_id + ): validator: Validator = Validator() validator._set_schema_def() validator.adata = adata_visium.copy() validator.adata.obs.cell_type_ontology_term_id = cell_type_ontology_term_id validator.adata.obs.in_tissue = in_tissue + validator.adata.obs.assay_ontology_term_id = assay_ontology_term_id # Confirm cell type is valid. validator._validate_spatial_cell_type_ontology_term_id() assert not validator.errors @pytest.mark.parametrize( - "cell_type_ontology_term_id, in_tissue", + "cell_type_ontology_term_id, in_tissue, assay_ontology_term_id", [ - ("CL:0000066", 0), - (["CL:0000066", "unknown"], [0, 1]), + # MUST be unknown when in_tissue = 0 and assay_ontology_term_id = Visium Spatial Gene Expression + ("CL:0000066", 0, "EFO:0010961"), + (["CL:0000066", "unknown"], [0, 1], ["EFO:0010961", "EFO:0010961"]), + # MUST be unknown when in_tissue = 0 and assay_ontology_term_id = Visium CytAssist Spatial Gene Expression, 11mm + ("CL:0000066", 0, "EFO:0022860"), + # MUST be unknown when in_tissue = 0 and assay_ontology_term_id = Visium Spatial Gene Expression V1 + ("CL:0000066", 0, "EFO:0022857"), ], ) - def test__validate_cell_type_ontology_term_id_error(self, cell_type_ontology_term_id, in_tissue): + def test__validate_cell_type_ontology_term_id_error( + self, cell_type_ontology_term_id, in_tissue, assay_ontology_term_id + ): validator: Validator = Validator() validator._set_schema_def() validator.adata = adata_visium.copy() validator.adata.obs.cell_type_ontology_term_id = cell_type_ontology_term_id validator.adata.obs.in_tissue = in_tissue + validator.adata.obs.assay_ontology_term_id = assay_ontology_term_id # Confirm errors. 
validator._validate_spatial_cell_type_ontology_term_id() From 5e0a05dc7274ccf7655d6e5db66bcf53c489cc0d Mon Sep 17 00:00:00 2001 From: Evan Molinelli Date: Thu, 21 Nov 2024 10:10:12 -0500 Subject: [PATCH 09/28] chore: forbid any NaN in spatial embeddings # (#1119) Co-authored-by: Evan Molinelli --- .../cellxgene_schema/validate.py | 9 +++++++-- .../tests/test_schema_compliance.py | 20 ++++++++++++++----- cellxgene_schema_cli/tests/test_validate.py | 12 +++++++++++ pyproject.toml | 5 +++++ 4 files changed, 39 insertions(+), 7 deletions(-) diff --git a/cellxgene_schema_cli/cellxgene_schema/validate.py b/cellxgene_schema_cli/cellxgene_schema/validate.py index 98ce9e7d..084e9769 100644 --- a/cellxgene_schema_cli/cellxgene_schema/validate.py +++ b/cellxgene_schema_cli/cellxgene_schema/validate.py @@ -944,6 +944,7 @@ def _validate_obsm(self): issue_list = self.errors regex_pattern = r"^[a-zA-Z][a-zA-Z0-9_.-]*$" + key_is_spatial = key.lower() == "spatial" unknown_key = False # an unknown key does not match 'spatial' or 'X_{suffix}' if key.startswith("X_"): @@ -954,7 +955,7 @@ def _validate_obsm(self): self.errors.append( f"Suffix for embedding key in 'adata.obsm' {key} does not match the regex pattern {regex_pattern}." ) - elif key.lower() != "spatial": + elif not key_is_spatial: if not re.match(regex_pattern, key): self.errors.append( f"Embedding key in 'adata.obsm' {key} does not match the regex pattern {regex_pattern}." 
@@ -1002,7 +1003,11 @@ def _validate_obsm(self): # Check for inf/NaN values only if the dtype is numeric if np.isinf(value).any(): issue_list.append(f"adata.obsm['{key}'] contains positive infinity or negative infinity values.") - if np.all(np.isnan(value)): + + # spatial embeddings can't have any NaN; other embeddings can't be all NaNs + if key_is_spatial and np.any(np.isnan(value)): + issue_list.append("adata.obs['spatial] contains at least one NaN value.") + elif np.all(np.isnan(value)): issue_list.append(f"adata.obsm['{key}'] contains all NaN values.") if self._is_supported_spatial_assay() is False and obsm_with_x_prefix == 0: diff --git a/cellxgene_schema_cli/tests/test_schema_compliance.py b/cellxgene_schema_cli/tests/test_schema_compliance.py index 0d3d5d4b..3646bc43 100644 --- a/cellxgene_schema_cli/tests/test_schema_compliance.py +++ b/cellxgene_schema_cli/tests/test_schema_compliance.py @@ -2141,20 +2141,30 @@ def test_obsm_values_str(self, validator_with_visium_assay, key): @pytest.mark.parametrize("key", ["X_umap", "spatial"]) def test_obsm_values_nan(self, validator_with_visium_assay, key): """ - values in obsm cannot all be NaN + test obsm NaN restrictions for different embedding types. 
+ feature embeddings: X_* cannot be all NaN + spatial emeddings: 'spatial' cannot have any NaNs """ validator = validator_with_visium_assay obsm = validator.adata.obsm - # It's okay if only one value is NaN + + # Check embedding has any NaN obsm[key][0:100, 1] = numpy.nan validator.validate_adata() - assert validator.errors == [] - # It's not okay if all values are NaN + if key != "spatial": + assert validator.errors == [] + else: + assert validator.errors == ["ERROR: adata.obs['spatial] contains at least one NaN value."] + + # Check embedding has all NaNs all_nan = numpy.full(obsm[key].shape, numpy.nan) obsm[key] = all_nan validator.validate_adata() - assert validator.errors == [f"ERROR: adata.obsm['{key}'] contains all NaN values."] + if key != "spatial": + assert validator.errors == [f"ERROR: adata.obsm['{key}'] contains all NaN values."] + else: + assert validator.errors == ["ERROR: adata.obs['spatial] contains at least one NaN value."] def test_obsm_values_no_X_embedding__non_spatial_dataset(self, validator_with_adata): validator = validator_with_adata diff --git a/cellxgene_schema_cli/tests/test_validate.py b/cellxgene_schema_cli/tests/test_validate.py index 801bc7cc..accc6b86 100644 --- a/cellxgene_schema_cli/tests/test_validate.py +++ b/cellxgene_schema_cli/tests/test_validate.py @@ -1040,6 +1040,18 @@ def test__validate_cell_type_ontology_term_id_error( in validator.errors[0] ) + def test__validate_embeddings_non_nans(self): + validator: Validator = Validator() + validator._set_schema_def() + validator.adata = adata_visium.copy() + validator.visium_and_is_single_true_matrix_size = 2 + + # invalidate spatial embeddings with NaN value + validator.adata.obsm["spatial"][0, 1] = np.nan + # Confirm spatial is valid. 
+ validator.validate_adata() + assert validator.errors == ["ERROR: adata.obs['spatial] contains at least one NaN value."] + class TestValidatorValidateDataFrame: @pytest.mark.parametrize("_type", [np.int64, np.int32, int, np.float64, np.float32, float, str]) diff --git a/pyproject.toml b/pyproject.toml index 1dd902b4..01f9c970 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -41,3 +41,8 @@ show_error_codes = true ignore_missing_imports = true warn_unreachable = true warn_unused_configs = true + +[tool.pytest.ini_options] +pythonpath = [ + "cellxgene_schema_cli" +] \ No newline at end of file From 0392f2b0c25bd78884abbb455eee623325bd37a3 Mon Sep 17 00:00:00 2001 From: Brian Raymor Date: Fri, 22 Nov 2024 14:43:39 -0800 Subject: [PATCH 10/28] Added c. elegans (#1126) --- schema/drafts/5.2.1-experimental.md | 210 ++++++++++++++++++++-------- 1 file changed, 151 insertions(+), 59 deletions(-) diff --git a/schema/drafts/5.2.1-experimental.md b/schema/drafts/5.2.1-experimental.md index 4275dfe6..0190c884 100644 --- a/schema/drafts/5.2.1-experimental.md +++ b/schema/drafts/5.2.1-experimental.md @@ -8,7 +8,7 @@ Version: 5.2.1-experimental The key words "MUST", "MUST NOT", "REQUIRED", "SHALL", "SHALL NOT", "SHOULD", "SHOULD NOT", "RECOMMENDED", "NOT RECOMMENDED" "MAY", and "OPTIONAL" in this document are to be interpreted as described in [BCP 14](https://tools.ietf.org/html/bcp14), [RFC2119](https://www.rfc-editor.org/rfc/rfc2119.txt), and [RFC8174](https://www.rfc-editor.org/rfc/rfc8174.txt) when, and only when, they appear in all capitals, as shown here. -This draft is limited to **additions** or **modifications** to [schema 5.2.0](https://github.com/chanzuckerberg/single-cell-curation/blob/main/schema/5.2.0/schema.md). If a 5.2.0 reference does not appear in this document, then no schema change is required. 
The following **temporary** constraints for *Danio rerio* and *Drosophila melanogaster* are specified: +This draft is limited to **additions** or **modifications** to [schema 5.2.0](https://github.com/chanzuckerberg/single-cell-curation/blob/main/schema/5.2.0/schema.md). If a 5.2.0 reference does not appear in this document, then no schema change is required. The following **temporary** constraints are specified: * The `organism_ontology_term_id` MUST be the same for all observations. * The `tissue_type` MUST be `'tissue'` for all observations. @@ -24,6 +24,8 @@ The following ontology dependencies are *pinned* for this version of the schema. | Ontology | OBO Prefix | Release | Download | |:--|:--|:--|:--| +| [C. elegans Development Ontology] | WBls | [ 2024-09-26 Wormbase WS295](https://github.com/obophenotype/c-elegans-development-ontology/blob/vWS295) | [wbls.owl] | +| [C. elegans Gross Anatomy Ontology] | WBbt | [2024-09-24 Wormbase WS295](https://github.com/obophenotype/c-elegans-gross-anatomy-ontology/blob/v2024-09-24) | [wbbt.owl] | | [Cell Ontology] | CL | [2024-08-16] | [cl.owl]| | [Drosophila Anatomy Ontology] | FBbt | [2024-08-08](https://github.com/FlyBase/drosophila-anatomy-developmental-ontology/releases/tag/v2024-08-08) | [fbbt.owl] | | [Drosophila Development Ontology] | FBdv | [2024-08-07](https://github.com/FlyBase/drosophila-developmental-ontology/releases/tag/v2024-08-07) | [fbdv.owl] | @@ -38,6 +40,11 @@ The following ontology dependencies are *pinned* for this version of the schema. | [Zebrafish Anatomy Ontology] | ZFA
    ZFS | [2022-12-09] | [zfa.owl] | | | | | | +[C. elegans Development Ontology]: https://obofoundry.org/ontology/wbls.html +[wbls.owl]: https://github.com/obophenotype/c-elegans-development-ontology/blob/vWS295/wbls.owl +[C. elegans Gross Anatomy Ontology]: https://obofoundry.org/ontology/wbbt.html + +[wbbt.owl]: https://github.com/obophenotype/c-elegans-gross-anatomy-ontology/blob/v2024-09-24/wbbt.owl [Cell Ontology]: http://obofoundry.org/ontology/cl.html [2024-08-16]: https://github.com/obophenotype/cell-ontology/releases/tag/v2024-08-16 [cl.owl]: https://github.com/obophenotype/cell-ontology/releases/download/v2024-08-16/cl.owl @@ -97,8 +104,9 @@ The following gene annotation dependencies are *pinned* for this version of the | NCBITaxon:9606
    for Homo sapiens | [GENCODE (Human)] | Human reference GRCh38.p14
    (GENCODE v44/Ensembl 110) | [gencode.v44.primary_assembly.annotation.gtf] | | NCBITaxon:10090
    for Mus musculus | [GENCODE (Mouse)] | Mouse reference GRCm39
    (GENCODE vM33/Ensembl 110) | [gencode.vM33.primary_assembly.annotation.gtf] | | NCBITaxon:2697049
    for SARS-CoV-2 | [ENSEMBL (COVID-19)] | SARS-CoV-2 reference (ENSEMBL assembly: ASM985889v3) | [Sars\_cov\_2.ASM985889v3.101.gtf] | -| NCBITaxon:7955
    for Danio rerio | [ENSEMBL (Zebrafish)] | GRCz11.112 (Ensembl 112) | [Danio_rerio.GRCz11.112.gtf] | -| "NCBITaxon:7227"
    for Drosophila melanogaster| [ENSEMBL (Fruit fly)] | BDGP6.46 (Ensembl 112) | [Drosophila_melanogaster.BDGP6.46.112.gtf] | +| "NCBITaxon:6239"
    for Caenorhabditis elegans | [ENSEMBL (Caenorhabditis elegans)] | WBcel235 (GCA_000002985.3)
    Ensembl 113 | [Caenorhabditis_elegans.WBcel235.113.gtf] | +| NCBITaxon:7955
    for Danio rerio | [ENSEMBL (Zebrafish)] | GRCz11 (GCA_000002035.4)
    Ensembl 113 | [Danio_rerio.GRCz11.113.gtf] | +| "NCBITaxon:7227"
    for Drosophila melanogaster| [ENSEMBL (Fruit fly)] | BDGP6.46 (GCA_000001215.4)
    Ensembl 113 | [Drosophila_melanogaster.BDGP6.46.113.gtf] | | | [ThermoFisher ERCC Spike-Ins] | ThermoFisher ERCC RNA Spike-In Control Mixes (Cat # 4456740, 4456739) | [cms_095047.txt] | [RNA Spike-In Control Mixes]: https://www.thermofisher.com/document-connect/document-connect.html?url=https%3A%2F%2Fassets.thermofisher.com%2FTFS-Assets%2FLSG%2Fmanuals%2Fcms_086340.pdf&title=VXNlciBHdWlkZTogRVJDQyBSTkEgU3Bpa2UtSW4gQ29udHJvbCBNaXhlcyAoRW5nbGlzaCAp @@ -112,11 +120,14 @@ The following gene annotation dependencies are *pinned* for this version of the [ENSEMBL (COVID-19)]: https://covid-19.ensembl.org/index.html [Sars\_cov\_2.ASM985889v3.101.gtf]: https://ftp.ensemblgenomes.org/pub/viruses/gtf/sars_cov_2/Sars_cov_2.ASM985889v3.101.gtf.gz +[ENSEMBL (Caenorhabditis elegans)]: https://useast.ensembl.org/Caenorhabditis_elegans/Info/Index +[Caenorhabditis_elegans.WBcel235.113.gtf]: https://ftp.ensembl.org/pub/release-113/gtf/caenorhabditis_elegans/Caenorhabditis_elegans.WBcel235.113.gtf.gz + [ENSEMBL (Zebrafish)]: https://useast.ensembl.org/Danio_rerio/Info/Index -[Danio_rerio.GRCz11.112.gtf]: https://ftp.ensembl.org/pub/release-112/gtf/danio_rerio/Danio_rerio.GRCz11.112.gtf.gz +[Danio_rerio.GRCz11.113.gtf]: https://ftp.ensembl.org/pub/release-113/gtf/danio_rerio/Danio_rerio.GRCz11.113.gtf.gz [ENSEMBL (Fruit fly)]: https://www.ensembl.org/Drosophila_melanogaster/Info/Index -[Drosophila_melanogaster.BDGP6.46.112.gtf]: https://ftp.ensembl.org/pub/release-112/gtf/drosophila_melanogaster/Drosophila_melanogaster.BDGP6.46.112.gtf.gz +[Drosophila_melanogaster.BDGP6.46.113.gtf]: https://ftp.ensembl.org/pub/release-113/gtf/drosophila_melanogaster/Drosophila_melanogaster.BDGP6.46.113.gtf.gz [ThermoFisher ERCC Spike-Ins]: https://www.thermofisher.com/order/catalog/product/4456740#/4456740 [cms_095047.txt]: https://assets.thermofisher.com/TFS-Assets/LSG/manuals/cms_095047.txt @@ -128,27 +139,57 @@ The following gene annotation dependencies are *pinned* for this version of the ### 
development_stage_ontology_term_id
    - - - - - - - - - - - + + + + + + + + + + +
    Keydevelopment_stage_ontology_term_id
    AnnotatorCurator MUST annotate.
    Value - categorical with str categories. If unavailable, this MUST be "unknown".

    - If organism_ontolology_term_id is "NCBITaxon:7955" for Danio rerio, then this MUST be the most accurate descendant of ZFS:0100000 for zebrafish stage and MUST NOT be ZFS:0000000 for Unknown.

    If organism_ontolology_term_id is "NCBITaxon:7227" for Drosophila melanogaster, then this MUST be the most accurate FBdv term. -

    Otherwise, for all other organisms this MUST be the most accurate descendant of UBERON:0000105 for life cycle stage, excluding UBERON:0000071 for death stage. -
    Keydevelopment_stage_ontology_term_id
    AnnotatorCurator MUST annotate.
    Value + categorical with str categories. If unavailable, this MUST be "unknown".

    + + + + + + + + + + + + + + + + + + + + + +
    For organism_ontology_term_idValue
    + "NCBITaxon:6239"
    for Caenorhabditis elegans +
    + MUST be the most accurate descendant of WBls:0000075
    for worm life stage +
    + "NCBITaxon:7955"
    for Danio rerio +
    + MUST be the most accurate descendant of ZFS:0100000
    for zebrafish stage and MUST NOT be ZFS:0000000 for Unknown +
    + "NCBITaxon:7227"
    for Drosophila melanogaster +
    + MUST be the most accurate FBdv term +
    +

    ---- - ### organism_cell_type_ontology_term_id @@ -163,7 +204,15 @@ The following gene annotation dependencies are *pinned* for this version of the
    Value - categorical with str categories.

    + categorical with str categories. This MUST be "unknown" when: +
      +
    • + no appropriate term can be found (e.g. the cell type is unknown) +
    • +
    • + assay_ontology_term_id is "EFO:0010961" for Visium Spatial Gene Expression, uns['spatial']['is_single'] is True, and the corresponding value of in_tissue is 0 +
    • +
    @@ -172,40 +221,27 @@ The following gene annotation dependencies are *pinned* for this version of the + - + -
    For organism_ontolology_term_id
    - "NCBITaxon:7955"
    for Danio rerio + "NCBITaxon:6239"
    for Caenorhabditis elegans
    - MUST be either the most accurate descendant of ZFA:0009000 for cell
    or "unknown" when: -
      -
    • - no appropriate term can be found (e.g. the cell type is unknown) -
    • -
    • - assay_ontology_term_id is "EFO:0010961" for
      Visium Spatial Gene Expression, uns['spatial']['is_single'] is True,
      and the corresponding value of in_tissue is 0 -
    • -
    + MUST be the most accurate descendant of WBbt:0004017 for Cell
    - "NCBITaxon:7227"
    for Drosophila melanogaster + "NCBITaxon:7955"
    for Danio rerio
    MUST be either the most accurate descendant of FBbt:00007002 for cell
    or "unknown" when: -
      -
    • - no appropriate term can be found (e.g. the cell type is unknown) -
    • -
    • - assay_ontology_term_id is "EFO:0010961" for
      Visium Spatial Gene Expression, uns['spatial']['is_single'] is True,
      and the corresponding value of in_tissue is 0 -
    • -
    +
    + MUST be the most accurate descendant of ZFA:0009000 for cell
    - All other values of
    organism_ontology_term_id + "NCBITaxon:7227"
    for Drosophila melanogaster +
    MUST be the most accurate descendant of FBbt:00007002 for cell MUST be "na"
    @@ -230,7 +266,12 @@ The following gene annotation dependencies are *pinned* for this version of the
    Value - categorical with str categories. This MUST be a descendant of NCBITaxon:33208 for Metazoa.

    If organism_ontology_term_id is "NCBITaxon:7955" for Danio rerio or "NCBITaxon:7227" for Drosophila melanogaster, then all observations MUST contain the same value. + categorical with str categories. This MUST be a descendant of NCBITaxon:33208 for Metazoa.

    All observations MUST contain the same value when the organism_ontology_term_id is: +
    @@ -261,6 +302,14 @@ The following gene annotation dependencies are *pinned* for this version of the + + + "NCBITaxon:6239"
    for Caenorhabditis elegans + + + MUST be the most accurate descendant of WBbt:0005766 for Anatomy + + "NCBITaxon:7955"
    for Danio rerio @@ -277,12 +326,6 @@ The following gene annotation dependencies are *pinned* for this version of the MUST be the most accurate descendant of FBbt:10000000 for
    anatomical entity and MUST NOT be FBbt:00007002
    for cell or any of its descendants. - - - All other values of
    organism_ontology_term_id - - MUST be "na" - @@ -292,6 +335,27 @@ The following gene annotation dependencies are *pinned* for this version of the --- +### sex_ontology_term_id + + + + + + + + + + + + + + +
    Keysex_ontology_term_id
    AnnotatorCurator MUST annotate.
    Valuecategorical with str categories. If unavailable, this MUST be "unknown".

    If organism_ontology_term_id is "NCBITaxon:6239" for Caenorhabditis elegans, this MUST be PATO:0000384 for male or PATO:0001340 for hermaphrodite.

    Otherwise, this MUST be a descendant of PATO:0001894 for phenotypic sex. +
    +
    + +--- + ### tissue_type @@ -306,12 +370,18 @@ The following gene annotation dependencies are *pinned* for this version of the
    Value - categorical with str categories.

    If organism_ontology_term_id is "NCBITaxon:7955" for Danio rerio or "NCBITaxon:7227" for Drosophila melanogaster, then the value MUST be "tissue".

    Otherwise, the value MUST be "tissue", "organoid", or "cell culture". + categorical with str categories.

    The value MUST be "tissue" when the organism_ontology_term_id is: + Otherwise, the value MUST be "tissue", "organoid", or "cell culture".

    + --- ## var and raw.var (Gene Metadata) @@ -355,6 +425,12 @@ The following gene annotation dependencies are *pinned* for this version of the "NCBITaxon:2697049" + + Caenorhabditis elegans + + "NCBITaxon:6239" + + Danio rerio @@ -388,18 +464,34 @@ The following gene annotation dependencies are *pinned* for this version of the * General Requirements * Updated requirements for supported organisms * Required Ontologies + * Added C. elegans Development Ontology (WBls) release 2024-09-26 Wormbase WS295 + * Added C. elegans Gross Anatomy Ontology (WBbt) release 2024-09-24 Wormbase WS295 * Added Drosophila Anatomy Ontology (FBbt) release 2024-08-08 * Added Drosophila Development Ontology (FBdv) release 2024-08-07 * Added Zebrafish Anatomy Ontology (ZFA+ZFS) release 2022-12-09 * Required Gene Annotations * Refactored table to include NCBI Taxon for supported organisms - * Added *Danio rerio* Reference GRCz11.112 (Ensembl 112) - * Added *Drosophila melanogaster* Reference BDGP6.46 (Ensembl 112) + * Added *Caenorhabditis elegans* WBcel235 (GCA_000002985.3) Ensembl 113 + * Added *Danio rerio* GRCz11 (GCA_000002035.4) Ensembl 113 + * Added *Drosophila melanogaster* BDGP6.46 (GCA_000001215.4) Ensembl 113 * obs (Cell metadata) - * Updated `development_stage_ontology_term_id` for *Danio rerio* and *Drosophila melanogaster* + * Updated `development_stage_ontology_term_id` to include: + * *Caenorhabditis elegans* + * *Danio rerio* + * *Drosophila melanogaster* * Added `organism_cell_type_ontology_term_id` - * Updated `organism_ontology_term_id` for *Danio rerio* and *Drosophila melanogaster* to require all observations to contain the same value + * Updated `organism_ontology_term_id` to require all observations to contain the same value for: + * *Caenorhabditis elegans* + * *Danio rerio* + * *Drosophila melanogaster* * Added `organism_tissue_ontology_term_id` - * Updated `tissue_type` to require `"tissue"` for *Danio rerio* and *Drosophila melanogaster* + * Updated 
`sex_ontology_term_id` for *Caenorhabditis elegans* + * Updated `tissue_type` to require `"tissue"` for: + * *Caenorhabditis elegans* + * *Danio rerio* + * *Drosophila melanogaster* * var and raw.var (Gene Metadata) - * Updated `feature_reference` for *Danio rerio* and *Drosophila melanogaster* \ No newline at end of file + * Updated `feature_reference` to include: + * *Caenorhabditis elegans* + * *Danio rerio* + * *Drosophila melanogaster* \ No newline at end of file From 0c9f9af72e41d25744cf5963f351169c13680c3c Mon Sep 17 00:00:00 2001 From: Evan Molinelli Date: Mon, 25 Nov 2024 09:50:07 -0500 Subject: [PATCH 11/28] feat: update validation for obs['in_tissue'] to include descendants of Visiium (#1124) Co-authored-by: Evan Molinelli --- .../cellxgene_schema/utils.py | 14 +++ .../cellxgene_schema/validate.py | 88 +++++++++++-------- .../tests/test_schema_compliance.py | 21 +++++ 3 files changed, 86 insertions(+), 37 deletions(-) diff --git a/cellxgene_schema_cli/cellxgene_schema/utils.py b/cellxgene_schema_cli/cellxgene_schema/utils.py index fb8f58f4..e2b558f7 100644 --- a/cellxgene_schema_cli/cellxgene_schema/utils.py +++ b/cellxgene_schema_cli/cellxgene_schema/utils.py @@ -2,10 +2,12 @@ import os import sys from base64 import b85encode +from functools import lru_cache from typing import Dict, List, Union import anndata as ad import numpy as np +from cellxgene_ontology_guide.ontology_parser import OntologyParser from scipy import sparse from xxhash import xxh3_64_intdigest @@ -151,3 +153,15 @@ def get_hash_digest_column(dataframe): .astype(np.uint64) .apply(lambda v: b85encode(v.to_bytes(8, "big")).decode("ascii")) ) + + +@lru_cache() +def is_ontological_descendant_of(onto: OntologyParser, term: str, target: str, include_self: bool = True) -> bool: + """ + Determines if :term is an ontological descendant of :target and whether to include :term==:target. + + This function is cached and is safe to call many times. 
+ + #TODO:[EM] needs testing + """ + return term in set(onto.get_term_descendants(target, include_self)) diff --git a/cellxgene_schema_cli/cellxgene_schema/validate.py b/cellxgene_schema_cli/cellxgene_schema/validate.py index 084e9769..f7892e6b 100644 --- a/cellxgene_schema_cli/cellxgene_schema/validate.py +++ b/cellxgene_schema_cli/cellxgene_schema/validate.py @@ -17,7 +17,7 @@ from scipy import sparse from . import gencode, schema -from .utils import SPARSE_MATRIX_TYPES, get_matrix_format, getattr_anndata, read_h5ad +from .utils import SPARSE_MATRIX_TYPES, get_matrix_format, getattr_anndata, is_ontological_descendant_of, read_h5ad logger = logging.getLogger(__name__) @@ -211,7 +211,7 @@ def _validate_curie_ancestors( is_valid_term_id = ONTOLOGY_PARSER.is_valid_term_id(term_id) is_valid_ancestor_id = ONTOLOGY_PARSER.is_valid_term_id(ancestor) if is_valid_term_id & is_valid_ancestor_id: - is_descendant = ancestor in ONTOLOGY_PARSER.get_term_ancestors(term_id) + is_descendant = ancestor in ONTOLOGY_PARSER.get_term_ancestors(term_id, inclusive) checks.append(is_descendant) if True not in checks: @@ -1477,18 +1477,25 @@ def _validate_spatial_cell_type_ontology_term_id(self): :rtype none """ - # Exit if: - # - not Visium and is_single is True as no further checks are necessary - # - in_tissue is not specified as checks are dependent on this value - if not self._is_visium_including_descendants() and self._is_single() or "in_tissue" not in self.adata.obs: + self._is_visium_including_descendants() + self._is_single() + self._is_visium_and_is_single_true() + + # skip checks if not a valid spatial assay with a corresponding "in_tissue" column + if not self.is_visium_and_is_single_true: + # not a valid spatial assay + return + elif self.is_visium_and_is_single_true and "in_tissue" not in self.adata.obs.columns: + # valid spatial assay, but missing "in_tissue" column return - # Validate cell type: must be "unknown" if Visium and is_single is True and in_tissue is 0. 
- if ( - self._is_visium_including_descendants() - & (self.adata.obs["in_tissue"] == 0) - & (self.adata.obs["cell_type_ontology_term_id"] != "unknown") - ).any(): + # Validate all out of tissue (in_tissue==0) spatial spots have unknown cell ontology term + is_spatial = self.adata.obs["assay_ontology_term_id"].apply( + lambda assay: is_ontological_descendant_of(ONTOLOGY_PARSER, assay, ASSAY_VISIUM, True) + ) + is_not_tissue = self.adata.obs["in_tissue"] == 0 + is_not_unknown = self.adata.obs["cell_type_ontology_term_id"] != "unknown" + if (is_spatial & is_not_tissue & is_not_unknown).any(): self.errors.append( f"obs['cell_type_ontology_term_id'] must be 'unknown' when {ERROR_SUFFIX_VISIUM_AND_IS_SINGLE_TRUE_IN_TISSUE_0}." ) @@ -1500,11 +1507,21 @@ def _validate_spatial_tissue_position(self, tissue_position_name: str, min: int, :rtype none """ + # check for visium status and then is visium and single + # techdebt: the following lines are order dependent. Violates idempotence. + self._is_visium_including_descendants() + self._is_single() + self._is_visium_and_is_single_true() + # Tissue position is foribidden if assay is not Visium and is_single is True. 
if tissue_position_name in self.adata.obs and ( - not self._is_visium_and_is_single_true() + not (self.is_visium_and_is_single_true) or ( - ~(self.adata.obs["assay_ontology_term_id"] == ASSAY_VISIUM) + ~( + self.adata.obs["assay_ontology_term_id"].apply( + lambda t: is_ontological_descendant_of(ONTOLOGY_PARSER, t, ASSAY_VISIUM, True) + ) + ) & (self.adata.obs[tissue_position_name].notnull()) ).any() ): @@ -1521,7 +1538,11 @@ def _validate_spatial_tissue_position(self, tissue_position_name: str, min: int, if ( tissue_position_name not in self.adata.obs or ( - (self.adata.obs["assay_ontology_term_id"] == ASSAY_VISIUM) + ( + self.adata.obs["assay_ontology_term_id"].apply( + lambda t: is_ontological_descendant_of(ONTOLOGY_PARSER, t, ASSAY_VISIUM, True) + ) + ) & (self.adata.obs[tissue_position_name].isnull()) ).any() ): @@ -1767,34 +1788,27 @@ def _is_visium(self) -> bool: def _is_visium_including_descendants(self) -> bool: """ - Determine if the assay_ontology_term_id is Visium (descendant of EFO:0010961). + Determine if the assay_ontology_term_id is Visium (inclusive descendant of EFO:0010961). + Returns True if ANY assay_ontology_term_id is a Visium descendant :return True if assay_ontology_term_id is Visium, False otherwise. 
:rtype bool """ - if self.is_visium is None: - assay_ontology_term_id = self.adata.obs.get("assay_ontology_term_id") - - if assay_ontology_term_id is not None: - # Convert to a regular Series if it's Categorical - assay_ontology_term_id = pd.Series(assay_ontology_term_id) + _assay_key = "assay_ontology_term_id" + includes_and_visium = False - # Check if any term is a descendant of ASSAY_VISIUM - try: - visium_results = assay_ontology_term_id.apply( - lambda term: ASSAY_VISIUM - in list(ONTOLOGY_PARSER.get_lowest_common_ancestors(ASSAY_VISIUM, term)) - ) - self.is_visium = visium_results.astype(bool).any() - except KeyError as e: - # This generally means the assay_ontology_term_id is invalid, but we want the error to be raised - # by our explicit validator checks, not this implicit one. - logger.warning(f"KeyError processing assay_ontology_term_id ontology: {e}") - self.is_visium = False - else: - self.is_visium = False + # only compute if not already stored + if self.is_visium is None and _assay_key in self.adata.obs.columns: + # check if any assay_ontology_term_ids are descendants of VISIUM + includes_and_visium = ( + self.adata.obs[_assay_key] + .apply(lambda assay: is_ontological_descendant_of(ONTOLOGY_PARSER, assay, ASSAY_VISIUM, True)) + .any() + ) - return self.is_visium + # save state and return + self.is_visium = includes_and_visium + return includes_and_visium def _validate_spatial_image_shape(self, image_name: str, image: np.ndarray, max_dimension: int = None): """ diff --git a/cellxgene_schema_cli/tests/test_schema_compliance.py b/cellxgene_schema_cli/tests/test_schema_compliance.py index 3646bc43..7268f332 100644 --- a/cellxgene_schema_cli/tests/test_schema_compliance.py +++ b/cellxgene_schema_cli/tests/test_schema_compliance.py @@ -477,6 +477,27 @@ def test_column_presence_assay(self, validator_with_adata): "to missing dependent column in adata.obs.", ] + @pytest.mark.parametrize( + "assay_ontology_term_id, is_descendant", + [("EFO:0010961", True), 
("EFO:0022858", True), ("EFO:0030029", False), ("EFO:0002697", False)], + ) + def test_column_presence_in_tissue(self, validator_with_visium_assay, assay_ontology_term_id, is_descendant): + """ + Spatial assays that are descendants of visium must have a valid "in_tissue" column. + """ + validator: Validator = validator_with_visium_assay + + # reset and test + validator.reset() + validator.adata.obs["assay_ontology_term_id"] = assay_ontology_term_id + validator._validate_spatial_tissue_position("in_tissue", 0, 1) + if is_descendant: + assert validator.errors == [] + else: + assert validator.errors == [ + "obs['in_tissue'] is only allowed for descendants of obs['assay_ontology_term_id'] 'EFO:0010961' (Visium Spatial Gene Expression) and uns['spatial']['is_single'] is True." + ] + @pytest.mark.parametrize("reserved_column", schema_def["components"]["obs"]["reserved_columns"]) def test_obs_reserved_columns_presence(self, validator_with_adata, reserved_column): """ From 478648ef138d4d6d26ee43bd68c84414fe2292fc Mon Sep 17 00:00:00 2001 From: Evan Molinelli Date: Tue, 26 Nov 2024 11:41:54 -0500 Subject: [PATCH 12/28] feat: update validation for uns['spatial'] (#1129) Co-authored-by: Evan Molinelli Co-authored-by: Nayib Gloria <55710092+nayib-jose-gloria@users.noreply.github.com> --- .../cellxgene_schema/validate.py | 42 ++-- .../tests/test_schema_compliance.py | 11 +- cellxgene_schema_cli/tests/test_validate.py | 201 +++++++++++++----- 3 files changed, 179 insertions(+), 75 deletions(-) diff --git a/cellxgene_schema_cli/cellxgene_schema/validate.py b/cellxgene_schema_cli/cellxgene_schema/validate.py index f7892e6b..024a51ec 100644 --- a/cellxgene_schema_cli/cellxgene_schema/validate.py +++ b/cellxgene_schema_cli/cellxgene_schema/validate.py @@ -28,8 +28,15 @@ VISIUM_AND_IS_SINGLE_TRUE_MATRIX_SIZE = 4992 SPATIAL_HIRES_IMAGE_MAX_DIMENSION_SIZE = 2000 +SPATIAL_HIRES_IMAGE_MAX_DIMENSION_SIZE_VISIUM_11MM = 4000 -ERROR_SUFFIX_VISIUM_AND_IS_SINGLE_TRUE = "descendants of 
obs['assay_ontology_term_id'] 'EFO:0010961' (Visium Spatial Gene Expression) and uns['spatial']['is_single'] is True" +CONDITION_IS_VISIUM = "a descendant of 'EFO:0010961' (Visium Spatial Gene Expression)" +CONDITION_IS_SEQV2 = f"'{ASSAY_SLIDE_SEQV2}' (Slide-seqV2)" + + +ERROR_SUFFIX_SPATIAL = f"obs['assay_ontology_term_id'] is either {CONDITION_IS_VISIUM} or {CONDITION_IS_SEQV2}" +ERROR_SUFFIX_VISIUM = f"obs['assay_ontology_term_id'] is {CONDITION_IS_VISIUM}" +ERROR_SUFFIX_VISIUM_AND_IS_SINGLE_TRUE = f"{ERROR_SUFFIX_VISIUM} and uns['spatial']['is_single'] is True" ERROR_SUFFIX_VISIUM_AND_IS_SINGLE_TRUE_FORBIDDEN = f"is only allowed for {ERROR_SUFFIX_VISIUM_AND_IS_SINGLE_TRUE}" ERROR_SUFFIX_VISIUM_AND_IS_SINGLE_TRUE_REQUIRED = f"is required for {ERROR_SUFFIX_VISIUM_AND_IS_SINGLE_TRUE}" ERROR_SUFFIX_VISIUM_AND_IS_SINGLE_TRUE_IN_TISSUE_0 = f"{ERROR_SUFFIX_VISIUM_AND_IS_SINGLE_TRUE} and in_tissue is 0" @@ -95,9 +102,11 @@ def _is_supported_spatial_assay(self) -> bool: """ if self.is_spatial is None: try: - self.is_spatial = False - if self.adata.obs.assay_ontology_term_id.isin([ASSAY_VISIUM, ASSAY_SLIDE_SEQV2]).any(): - self.is_spatial = True + _spatial = ( + self._is_visium_including_descendants() + or self.adata.obs.assay_ontology_term_id.isin([ASSAY_SLIDE_SEQV2]).any() + ) + self.is_spatial = bool(_spatial) except AttributeError: # specific error reporting will occur downstream in the validation self.is_spatial = False @@ -1466,10 +1475,7 @@ def _validate_spatial_assay_ontology_term_id(self): # Validate assay ontology term ids are identical. term_count = obs["assay_ontology_term_id"].nunique() if term_count > 1: - self.errors.append( - "When obs['assay_ontology_term_id'] is either 'EFO:0010961' (Visium Spatial Gene Expression) or " - "'EFO:0030062' (Slide-seqV2), all observations must contain the same value." 
- ) + self.errors.append(f"When {ERROR_SUFFIX_SPATIAL}" ", all observations must contain the same value.") def _validate_spatial_cell_type_ontology_term_id(self): """ @@ -1599,10 +1605,7 @@ def _check_spatial_uns(self): uns_spatial = self.adata.uns.get("spatial") is_supported_spatial_assay = self._is_supported_spatial_assay() if uns_spatial is not None and not is_supported_spatial_assay: - self.errors.append( - "uns['spatial'] is only allowed for obs['assay_ontology_term_id'] values " - "'EFO:0010961' (Visium Spatial Gene Expression) and 'EFO:0030062' (Slide-seqV2)." - ) + self.errors.append(f"uns['spatial'] is only allowed when {ERROR_SUFFIX_SPATIAL}") return # Exit if we aren't dealing with a supported spatial assay as no further checks are necessary. @@ -1611,10 +1614,7 @@ def _check_spatial_uns(self): # spatial is required for supported spatial assays. if not isinstance(uns_spatial, dict): - self.errors.append( - "A dict in uns['spatial'] is required for obs['assay_ontology_term_id'] values " - "'EFO:0010961' (Visium Spatial Gene Expression) and 'EFO:0030062' (Slide-seqV2)." - ) + self.errors.append("A dict in uns['spatial'] is required when " f"{ERROR_SUFFIX_SPATIAL}.") return # is_single is required. @@ -1693,7 +1693,11 @@ def _check_spatial_uns(self): self.errors.append("uns['spatial'][library_id]['images'] must contain the key 'hires'.") # hires is specified: proceed with validation of hires. else: - self._validate_spatial_image_shape("hires", uns_images["hires"], SPATIAL_HIRES_IMAGE_MAX_DIMENSION_SIZE) + _assay_term = self.adata.obs["assay_ontology_term_id"].values[0] + _max_size = SPATIAL_HIRES_IMAGE_MAX_DIMENSION_SIZE + if is_ontological_descendant_of(ONTOLOGY_PARSER, _assay_term, "EFO:0022860", True): + _max_size = SPATIAL_HIRES_IMAGE_MAX_DIMENSION_SIZE_VISIUM_11MM + self._validate_spatial_image_shape("hires", uns_images["hires"], _max_size) # fullres is optional. 
uns_fullres = uns_images.get("fullres") @@ -1802,12 +1806,12 @@ def _is_visium_including_descendants(self) -> bool: # check if any assay_ontology_term_ids are descendants of VISIUM includes_and_visium = ( self.adata.obs[_assay_key] + .astype("string") .apply(lambda assay: is_ontological_descendant_of(ONTOLOGY_PARSER, assay, ASSAY_VISIUM, True)) .any() ) + self.is_visium = includes_and_visium - # save state and return - self.is_visium = includes_and_visium return includes_and_visium def _validate_spatial_image_shape(self, image_name: str, image: np.ndarray, max_dimension: int = None): diff --git a/cellxgene_schema_cli/tests/test_schema_compliance.py b/cellxgene_schema_cli/tests/test_schema_compliance.py index 7268f332..425086fc 100644 --- a/cellxgene_schema_cli/tests/test_schema_compliance.py +++ b/cellxgene_schema_cli/tests/test_schema_compliance.py @@ -4,6 +4,7 @@ import tempfile import unittest +from copy import deepcopy import anndata import fixtures.examples_validate as examples @@ -495,7 +496,7 @@ def test_column_presence_in_tissue(self, validator_with_visium_assay, assay_onto assert validator.errors == [] else: assert validator.errors == [ - "obs['in_tissue'] is only allowed for descendants of obs['assay_ontology_term_id'] 'EFO:0010961' (Visium Spatial Gene Expression) and uns['spatial']['is_single'] is True." + "obs['in_tissue'] is only allowed for obs['assay_ontology_term_id'] is a descendant of 'EFO:0010961' (Visium Spatial Gene Expression) and uns['spatial']['is_single'] is True." ] @pytest.mark.parametrize("reserved_column", schema_def["components"]["obs"]["reserved_columns"]) @@ -1673,11 +1674,16 @@ def test_should_warn_for_low_gene_count(self, validator_with_adata): Raise a warning if there are too few genes """ validator = validator_with_adata + # NOTE:[EM] changing the schema def here is stateful and results in unpredictable test results. + # Reset after mutating. 
+ _old_schema = deepcopy(validator.schema_def.copy()) + validator.schema_def["components"]["var"]["warn_if_less_than_rows"] = 100 validator.validate_adata() assert validator.warnings == [ "WARNING: Dataframe 'var' only has 4 rows. Features SHOULD NOT be filtered from expression matrix." ] + validator.schema_def = _old_schema @pytest.mark.parametrize( "df,column", @@ -2198,7 +2204,6 @@ def test_obsm_values_no_X_embedding__non_spatial_dataset(self, validator_with_ad ] assert validator.is_spatial is False assert validator.warnings == [ - "WARNING: Dataframe 'var' only has 4 rows. Features SHOULD NOT be filtered from expression matrix.", "WARNING: Embedding key in 'adata.obsm' harmony is not 'spatial' nor does it start with 'X_'. " "Thus, it will not be available in Explorer", "WARNING: Validation of raw layer was not performed due to current errors, try again after fixing current errors.", @@ -2248,7 +2253,6 @@ def test_obsm_values_warn_start_with_X(self, validator_with_adata): validator.adata.obsm["harmony"] = pd.DataFrame(validator.adata.obsm["X_umap"], index=validator.adata.obs_names) validator.validate_adata() assert validator.warnings == [ - "WARNING: Dataframe 'var' only has 4 rows. Features SHOULD NOT be filtered from expression matrix.", "WARNING: Embedding key in 'adata.obsm' harmony is not 'spatial' nor does it start with 'X_'. " "Thus, it will not be available in Explorer", "WARNING: Validation of raw layer was not performed due to current errors, try again after fixing current errors.", @@ -2282,7 +2286,6 @@ def test_obsm_values_key_start_with_number(self, validator_with_adata): "'pandas.core.frame.DataFrame'>').", ] assert validator.warnings == [ - "WARNING: Dataframe 'var' only has 4 rows. Features SHOULD NOT be filtered from expression matrix.", "WARNING: Embedding key in 'adata.obsm' 3D is not 'spatial' nor does it start with 'X_'. 
" "Thus, it will not be available in Explorer", "WARNING: Validation of raw layer was not performed due to current errors, try again after fixing current errors.", diff --git a/cellxgene_schema_cli/tests/test_validate.py b/cellxgene_schema_cli/tests/test_validate.py index accc6b86..9ab30177 100644 --- a/cellxgene_schema_cli/tests/test_validate.py +++ b/cellxgene_schema_cli/tests/test_validate.py @@ -15,6 +15,8 @@ ERROR_SUFFIX_VISIUM_AND_IS_SINGLE_TRUE_FORBIDDEN, ERROR_SUFFIX_VISIUM_AND_IS_SINGLE_TRUE_IN_TISSUE_0, ERROR_SUFFIX_VISIUM_AND_IS_SINGLE_TRUE_REQUIRED, + SPATIAL_HIRES_IMAGE_MAX_DIMENSION_SIZE, + SPATIAL_HIRES_IMAGE_MAX_DIMENSION_SIZE_VISIUM_11MM, Validator, validate, ) @@ -423,10 +425,9 @@ def test__validate_spatial_type_error(self, spatial): # Confirm key type dict is required. validator.validate_adata() - assert validator.errors assert ( - "A dict in uns['spatial'] is required for obs['assay_ontology_term_id'] values 'EFO:0010961' (Visium Spatial Gene Expression) and 'EFO:0030062' (Slide-seqV2)." - in validator.errors[0] + validator.errors[0] + == "ERROR: A dict in uns['spatial'] is required when obs['assay_ontology_term_id'] is either a descendant of 'EFO:0010961' (Visium Spatial Gene Expression) or 'EFO:0030062' (Slide-seqV2)." ) def test__validate_spatial_is_single_false_ok(self): @@ -448,25 +449,42 @@ def test__validate_spatial_forbidden_if_not_visium_or_slide_seqv2(self): # Confirm spatial is not allowed for 10x 3' v2. validator._check_spatial_uns() - assert len(validator.errors) == 1 - assert ( - "uns['spatial'] is only allowed for obs['assay_ontology_term_id'] values " - "'EFO:0010961' (Visium Spatial Gene Expression) and 'EFO:0030062' (Slide-seqV2)." 
in validator.errors[0] - ) + assert validator.errors == [ + "uns['spatial'] is only allowed when obs['assay_ontology_term_id'] is either " + "a descendant of 'EFO:0010961' (Visium Spatial Gene Expression) or 'EFO:0030062' (Slide-seqV2)" + ] - def test__validate_spatial_required_if_visium(self): + @pytest.mark.parametrize( + "assay_ontology_term_id, is_descendant", + [("EFO:0010961", True), ("EFO:0022858", True), ("EFO:0030029", False), ("EFO:0002697", False)], + ) + def test__validate_spatial_required_if_visium(self, assay_ontology_term_id, is_descendant): validator: Validator = Validator() validator._set_schema_def() validator.adata = adata_visium.copy() - validator.adata.uns = good_uns.copy() + validator.adata.obs["assay_ontology_term_id"] = assay_ontology_term_id - # Confirm spatial is required for Visium. - validator._check_spatial_uns() - assert len(validator.errors) == 1 - assert ( - "A dict in uns['spatial'] is required for obs['assay_ontology_term_id'] values " - "'EFO:0010961' (Visium Spatial Gene Expression) and 'EFO:0030062' (Slide-seqV2)." in validator.errors[0] - ) + if is_descendant: + # check pass if 'spatial' included + validator.adata.uns = good_uns_with_visium_spatial.copy() + validator._check_spatial_uns() + assert len(validator.errors) == 0 + validator.reset() + + # check fail if 'spatial' not included + validator.adata.uns = good_uns.copy() + validator._check_spatial_uns() + assert validator.errors == [ + "A dict in uns['spatial'] is required when obs['assay_ontology_term_id'] is " + "either a descendant of 'EFO:0010961' (Visium Spatial Gene Expression) or 'EFO:0030062' (Slide-seqV2)." 
+ ] + validator.reset() + else: + # check fail if 'spatial' included + validator.adata.uns = good_uns_with_visium_spatial.copy() + validator._check_spatial_uns() + assert len(validator.errors) == 1 + validator.reset() def test__validate_spatial_required_if_slide_seqV2(self): validator: Validator = Validator() @@ -476,11 +494,9 @@ def test__validate_spatial_required_if_slide_seqV2(self): # Confirm spatial is required for Slide-seqV2. validator._check_spatial_uns() - assert len(validator.errors) == 1 - assert ( - "A dict in uns['spatial'] is required for obs['assay_ontology_term_id'] values " - "'EFO:0010961' (Visium Spatial Gene Expression) and 'EFO:0030062' (Slide-seqV2)." in validator.errors[0] - ) + assert validator.errors == [ + "A dict in uns['spatial'] is required when obs['assay_ontology_term_id'] is either a descendant of 'EFO:0010961' (Visium Spatial Gene Expression) or 'EFO:0030062' (Slide-seqV2)." + ] def test__validate_spatial_allowed_keys_error(self): validator: Validator = Validator() @@ -496,16 +512,26 @@ def test__validate_spatial_allowed_keys_error(self): "More than two top-level keys detected:" in validator.errors[0] ) - def test__validate_is_single_required_visium_error(self): + @pytest.mark.parametrize( + "assay_ontology_term_id, is_descendant", + [("EFO:0010961", True), ("EFO:0022858", True), ("EFO:0030029", False), ("EFO:0002697", False)], + ) + def test__validate_is_single_required_visium_error(self, assay_ontology_term_id, is_descendant): validator: Validator = Validator() validator._set_schema_def() validator.adata = adata_visium.copy() + validator.adata.obs["assay_ontology_term_id"] = assay_ontology_term_id validator.adata.uns["spatial"].pop("is_single") - - # Confirm is_single is identified as required. validator._check_spatial_uns() - assert validator.errors - assert "uns['spatial'] must contain the key 'is_single'." 
in validator.errors[0] + + if is_descendant: + # if spatial, MUST specify `is_single` + assert "uns['spatial'] must contain the key 'is_single'." in validator.errors[0] + else: + # if not spatial, MUST NOT speciffy `is_single` + assert validator.errors == [ + "uns['spatial'] is only allowed when obs['assay_ontology_term_id'] is either a descendant of 'EFO:0010961' (Visium Spatial Gene Expression) or 'EFO:0030062' (Slide-seqV2)" + ] def test__validate_is_single_required_slide_seqV2_error(self): validator: Validator = Validator() @@ -560,19 +586,36 @@ def test__validate_library_id_forbidden_if_visium_or_is_single_false(self): assert len(validator.errors) == 1 assert f"uns['spatial'][library_id] {ERROR_SUFFIX_VISIUM_AND_IS_SINGLE_TRUE_FORBIDDEN}." in validator.errors[0] - def test__validate_library_id_required_if_visium(self): + @pytest.mark.parametrize( + "assay_ontology_term_id, is_descendant", + [("EFO:0010961", True), ("EFO:0022858", True), ("EFO:0030029", False), ("EFO:0002697", False)], + ) + def test__validate_library_id_required_if_visium(self, assay_ontology_term_id, is_descendant): validator: Validator = Validator() validator._set_schema_def() validator.adata = adata_visium.copy() - validator.adata.uns["spatial"].pop(visium_library_id) - # Confirm library_id is identified as required. - validator._check_spatial_uns() - assert validator.errors - assert ( - f"uns['spatial'] must contain at least one key representing the library_id when {ERROR_SUFFIX_VISIUM_AND_IS_SINGLE_TRUE}." 
- in validator.errors[0] - ) + validator.adata.obs["assay_ontology_term_id"] = assay_ontology_term_id + if is_descendant: + # if spatial, `library_id` must exist + validator._check_spatial_uns() + assert len(validator.errors) == 0 + validator.reset() + + # if spatial, but missing from `uns` + validator.adata.uns["spatial"].pop(visium_library_id) + validator._check_spatial_uns() + assert validator.errors == [ + f"uns['spatial'] must contain at least one key representing the library_id when {ERROR_SUFFIX_VISIUM_AND_IS_SINGLE_TRUE}." + ] + else: + # if not spatial, MUST NOT define `library_id` + validator.adata.uns["spatial"][visium_library_id] = {"images": []} + validator._check_spatial_uns() + # Report the most general top level error + assert validator.errors == [ + "uns['spatial'] is only allowed when obs['assay_ontology_term_id'] is either a descendant of 'EFO:0010961' (Visium Spatial Gene Expression) or 'EFO:0030062' (Slide-seqV2)" + ] @pytest.mark.parametrize("library_id", [None, "invalid", 1, 1.0, True]) def test__validate_library_id_type_error(self, library_id): @@ -610,7 +653,11 @@ def test__validate_images_required_error(self): assert validator.errors assert "uns['spatial'][library_id] must contain the key 'images'." 
in validator.errors[0] - def test__validate_images_allowed_keys_error(self): + @pytest.mark.parametrize( + "assay_ontology_term_id, is_descendant", + [("EFO:0010961", True), ("EFO:0022858", True), ("EFO:0030029", False), ("EFO:0002697", False)], + ) + def test__validate_images_allowed_keys_error(self, assay_ontology_term_id, is_descendant): validator: Validator = Validator() validator._set_schema_def() validator.adata = adata_visium.copy() @@ -730,34 +777,84 @@ def test__validate_images_image_is_shape_error(self, image_name): "for example) or 4 (RGBA color model for example) for its last dimension" in validator.errors[0] ) - def test__validate_images_hires_max_dimension_greater_than_error(self): + @pytest.mark.parametrize( + "assay_ontology_term_id, hi_res_size, image_max", + [ + ("EFO:0022858", 2001, SPATIAL_HIRES_IMAGE_MAX_DIMENSION_SIZE), + ("EFO:0022860", 4001, SPATIAL_HIRES_IMAGE_MAX_DIMENSION_SIZE_VISIUM_11MM), + ], + ) + def test__validate_images_hires_max_dimension_greater_than_error( + self, assay_ontology_term_id, hi_res_size, image_max + ): validator: Validator = Validator() validator._set_schema_def() validator.adata = adata_visium.copy() - validator.adata.uns["spatial"][visium_library_id]["images"]["hires"] = np.zeros((1, 2001, 3), dtype=np.uint8) + validator.adata.obs["assay_ontology_term_id"] = assay_ontology_term_id + validator.adata.uns["spatial"][visium_library_id]["images"]["hires"] = np.zeros( + (1, hi_res_size, 3), dtype=np.uint8 + ) # Confirm hires is identified as invalid. validator._check_spatial_uns() - assert validator.errors - assert ( - "The largest dimension of uns['spatial'][library_id]['images']['hires'] must be 2000 pixels" - in validator.errors[0] - ) + assert validator.errors == [ + f"The largest dimension of uns['spatial'][library_id]['images']['hires'] must be {image_max} pixels, it has a largest dimension of {hi_res_size} pixels." 
+ ] - def test__validate_images_hires_max_dimension_less_than_error(self): + @pytest.mark.parametrize( + "assay_ontology_term_id, hi_res_size, size_requirement", + [ + ("EFO:0022858", SPATIAL_HIRES_IMAGE_MAX_DIMENSION_SIZE, SPATIAL_HIRES_IMAGE_MAX_DIMENSION_SIZE), + ("EFO:0022858", SPATIAL_HIRES_IMAGE_MAX_DIMENSION_SIZE_VISIUM_11MM, SPATIAL_HIRES_IMAGE_MAX_DIMENSION_SIZE), + ("EFO:0022860", SPATIAL_HIRES_IMAGE_MAX_DIMENSION_SIZE, SPATIAL_HIRES_IMAGE_MAX_DIMENSION_SIZE_VISIUM_11MM), + ( + "EFO:0022860", + SPATIAL_HIRES_IMAGE_MAX_DIMENSION_SIZE_VISIUM_11MM, + SPATIAL_HIRES_IMAGE_MAX_DIMENSION_SIZE_VISIUM_11MM, + ), + ], + ) + def test__validate_images_hires_max_dimension(self, assay_ontology_term_id, hi_res_size, size_requirement): validator: Validator = Validator() validator._set_schema_def() validator.adata = adata_visium.copy() - validator.adata.uns["spatial"][visium_library_id]["images"]["hires"] = np.zeros((1, 1999, 3), dtype=np.uint8) + validator.adata.obs["assay_ontology_term_id"] = assay_ontology_term_id + validator.adata.uns["spatial"][visium_library_id]["images"]["hires"] = np.zeros( + (1, hi_res_size, 3), dtype=np.uint8 + ) # Confirm hires is identified as invalid. + validator.reset() validator._check_spatial_uns() - assert validator.errors - assert ( - "The largest dimension of uns['spatial'][library_id]['images']['hires'] must be 2000 pixels" - in validator.errors[0] + if hi_res_size == size_requirement: + assert validator.errors == [] + else: + assert validator.errors == [ + f"The largest dimension of uns['spatial'][library_id]['images']['hires'] must be {size_requirement} pixels, it has a largest dimension of {hi_res_size} pixels." 
+ ] + + @pytest.mark.parametrize( + "assay_ontology_term_id, hi_res_size, image_max", + [ + ("EFO:0022858", 1999, SPATIAL_HIRES_IMAGE_MAX_DIMENSION_SIZE), + ("EFO:0022860", 3999, SPATIAL_HIRES_IMAGE_MAX_DIMENSION_SIZE_VISIUM_11MM), + ], + ) + def test__validate_images_hires_max_dimension_less_than_error(self, assay_ontology_term_id, hi_res_size, image_max): + validator: Validator = Validator() + validator._set_schema_def() + validator.adata = adata_visium.copy() + validator.adata.obs["assay_ontology_term_id"] = assay_ontology_term_id + validator.adata.uns["spatial"][visium_library_id]["images"]["hires"] = np.zeros( + (1, hi_res_size, 3), dtype=np.uint8 ) + # Confirm hires is identified as invalid. + validator._check_spatial_uns() + assert validator.errors == [ + f"The largest dimension of uns['spatial'][library_id]['images']['hires'] must be {image_max} pixels, it has a largest dimension of {hi_res_size} pixels." + ] + def test__validate_scalefactors_required_error(self): validator: Validator = Validator() validator._set_schema_def() @@ -861,8 +958,8 @@ def test__validate_assay_type_ontology_term_id_not_unique_error(self): validator._validate_spatial_assay_ontology_term_id() assert validator.errors assert ( - "When obs['assay_ontology_term_id'] is either 'EFO:0010961' (Visium Spatial Gene Expression) or " - "'EFO:0030062' (Slide-seqV2), all observations must contain the same value." + "When obs['assay_ontology_term_id'] is either a descendant" + " of 'EFO:0010961' (Visium Spatial Gene Expression) or 'EFO:0030062' (Slide-seqV2), all observations must contain the same value." 
) in validator.errors[0] def test__validate_assay_type_ontology_term_id_not_unique_ok(self, valid_adata): From 7f840ce1796bc8286ed888bb6351e35fef646467 Mon Sep 17 00:00:00 2001 From: Joyce Yan <5653616+joyceyan@users.noreply.github.com> Date: Wed, 27 Nov 2024 10:55:01 -0800 Subject: [PATCH 13/28] feat: add genetic ancestry fields for schema 5.3 (#1132) --- .../schema_definitions/schema_definition.yaml | 12 ++ .../cellxgene_schema/validate.py | 107 ++++++++++++++++ .../tests/fixtures/examples_validate.py | 90 +++++++++++++ .../tests/fixtures/h5ads/example_valid.h5ad | Bin 575888 -> 593864 bytes .../tests/test_schema_compliance.py | 118 ++++++++++++++++++ 5 files changed, 327 insertions(+) diff --git a/cellxgene_schema_cli/cellxgene_schema/schema_definitions/schema_definition.yaml b/cellxgene_schema_cli/cellxgene_schema/schema_definitions/schema_definition.yaml index 28a3fad5..d14a0442 100644 --- a/cellxgene_schema_cli/cellxgene_schema/schema_definitions/schema_definition.yaml +++ b/cellxgene_schema_cli/cellxgene_schema/schema_definitions/schema_definition.yaml @@ -582,3 +582,15 @@ components: - "cell culture" - "organoid" - "tissue" + genetic_ancestry_African: + type: genetic_ancestry_value + genetic_ancestry_East_Asian: + type: genetic_ancestry_value + genetic_ancestry_European: + type: genetic_ancestry_value + genetic_ancestry_Indigenous_American: + type: genetic_ancestry_value + genetic_ancestry_Oceanian: + type: genetic_ancestry_value + genetic_ancestry_South_Asian: + type: genetic_ancestry_value diff --git a/cellxgene_schema_cli/cellxgene_schema/validate.py b/cellxgene_schema_cli/cellxgene_schema/validate.py index 024a51ec..01a2d140 100644 --- a/cellxgene_schema_cli/cellxgene_schema/validate.py +++ b/cellxgene_schema_cli/cellxgene_schema/validate.py @@ -416,6 +416,109 @@ def _count_matrix_nonzero(self, matrix_name: str, matrix: Union[np.ndarray, spar self.number_non_zero[matrix_name] = nnz return nnz + def _validate_genetic_ancestry(self): + """ + Performs 
row-based validation of the genetic_ancestry_X fields. This ensures that a valid row must be: + - all float('nan') if organism is not homo sapiens or info is unavailable + - sum to 1.0 + + Additionally, verifies that all rows with the same donor_id must have the same genetic ancestry values + """ + ancestry_columns = [ + "genetic_ancestry_African", + "genetic_ancestry_East_Asian", + "genetic_ancestry_European", + "genetic_ancestry_Indigenous_American", + "genetic_ancestry_Oceanian", + "genetic_ancestry_South_Asian", + ] + + organism_column = "organism_ontology_term_id" + donor_id_column = "donor_id" + + # Skip any additional validation if the genetic ancestry or organism columns are not present + # An error for missing columns will be raised at a different point + required_columns = ancestry_columns + [organism_column, donor_id_column] + for column in required_columns: + if column not in self.adata.obs.columns: + return + + donor_id_to_ancestry_values = dict() + + def is_valid_row(row): + ancestry_values = row[ancestry_columns] + + # If ancestry values are different for the same donor id, then this row is invalid + donor_id = row[donor_id_column] + if donor_id in donor_id_to_ancestry_values: + if not donor_id_to_ancestry_values[donor_id].equals(ancestry_values): + return False + else: + donor_id_to_ancestry_values[donor_id] = ancestry_values + + # All values are NaN. 
This is always valid, regardless of organism + if ancestry_values.isna().all(): + return True + + # If any values are NaN, and we didn't return in the earlier all NaN check, then + # this is invalid + if ancestry_values.isna().any(): + return False + + # If organism is not homo sapiens, and we didn't return in the earlier all NaN check, + # then this row is invalid + if row[organism_column] != "NCBITaxon:9606": + return False + + # The sum of genetic ancestry values should be approximately 1.0 + if ( + ancestry_values.apply(lambda x: isinstance(x, (float, int))).all() + and abs(ancestry_values.sum() - 1.0) <= 1e-6 + ): + return True + + return False + + invalid_rows = ~self.adata.obs.apply(is_valid_row, axis=1) + + if invalid_rows.any(): + invalid_indices = self.adata.obs.index[invalid_rows].tolist() + self.errors.append( + f"obs rows with indices {invalid_indices} have invalid genetic_ancestry_* values. All " + f"observations with the same donor_id must contain the same genetic_ancestry_* values. If " + f"organism_ontolology_term_id is NOT 'NCBITaxon:9606' for Homo sapiens, then all genetic" + f"ancestry values MUST be float('nan'). 
If organism_ontolology_term_id is 'NCBITaxon:9606' " + f"for Homo sapiens, then the value MUST be a float('nan') if unavailable; otherwise, the " + f"sum of all genetic_ancestry_* fields must be equal to 1.0" + ) + + def _validate_individual_genetic_ancestry_value(self, column: pd.Series, column_name: str): + """ + The following fields are valid for genetic_ancestry_value columns: + - float values between 0 and 1 + - float('nan') + """ + if column.dtype != float: + self.errors.append(f"Column '{column_name}' in obs must be float, not '{column.dtype.name}'.") + return + + def is_individual_value_valid(value): + if isinstance(value, (float, int)) and 0 <= value <= 1: + return True + # Ensures only float('nan') or numpy.nan is valid, None is invalid + if isinstance(value, float) and pd.isna(value): + return True + return False + + # Identify invalid values + invalid_values = column[~column.map(is_individual_value_valid)] + + if not invalid_values.empty: + self.errors.append( + f"Column '{column_name}' in obs contains invalid values: {invalid_values.to_list()}. " + f"Valid values are floats between 0 and 1 or float('nan')." + ) + def _validate_column_feature_is_filtered(self, column: pd.Series, column_name: str, df_name: str): """ Validates the "is_feature_filtered" in adata.var. 
This column must be bool, and for genes that are set to @@ -505,6 +608,9 @@ def _validate_column(self, column: pd.Series, column_name: str, df_name: str, co if column_def.get("type") == "feature_is_filtered": self._validate_column_feature_is_filtered(column, column_name, df_name) + if column_def.get("type") == "genetic_ancestry_value": + self._validate_individual_genetic_ancestry_value(column, column_name) + if "enum" in column_def: bad_enums = [v for v in column.drop_duplicates() if v not in column_def["enum"]] if bad_enums: @@ -781,6 +887,7 @@ def _validate_dataframe(self, df_name: str): f"Column '{column_name}' in dataframe '{df_name}' contains a category '{category}' with " f"zero observations. These categories will be removed when `--add-labels` flag is present." ) + self._validate_genetic_ancestry() categorical_types = {type(x) for x in column.dtype.categories.values} # Check for columns that have illegal categories, which are not supported by anndata 0.8.0 # TODO: check if this can be removed after upgading to anndata 0.10.0 diff --git a/cellxgene_schema_cli/tests/fixtures/examples_validate.py b/cellxgene_schema_cli/tests/fixtures/examples_validate.py index 470c165c..accbecfc 100644 --- a/cellxgene_schema_cli/tests/fixtures/examples_validate.py +++ b/cellxgene_schema_cli/tests/fixtures/examples_validate.py @@ -48,6 +48,12 @@ "HsapDv:0000003", "donor_1", "nucleus", + float("nan"), + float("nan"), + float("nan"), + float("nan"), + float("nan"), + float("nan"), ], [ "CL:0000192", @@ -62,6 +68,12 @@ "MmusDv:0000003", "donor_2", "na", + float("nan"), + float("nan"), + float("nan"), + float("nan"), + float("nan"), + float("nan"), ], ], index=["X", "Y"], @@ -78,6 +90,12 @@ "development_stage_ontology_term_id", "donor_id", "suspension_type", + "genetic_ancestry_African", + "genetic_ancestry_East_Asian", + "genetic_ancestry_European", + "genetic_ancestry_Indigenous_American", + "genetic_ancestry_Oceanian", + "genetic_ancestry_South_Asian", ], ) @@ -144,6 +162,12 @@ 
"donor_1", "na", 0, + float("nan"), + float("nan"), + float("nan"), + float("nan"), + float("nan"), + float("nan"), ], [ 2, @@ -161,6 +185,12 @@ "donor_2", "na", 1, + float("nan"), + float("nan"), + float("nan"), + float("nan"), + float("nan"), + float("nan"), ], ], index=["X", "Y"], @@ -180,6 +210,12 @@ "donor_id", "suspension_type", "in_tissue", + "genetic_ancestry_African", + "genetic_ancestry_East_Asian", + "genetic_ancestry_European", + "genetic_ancestry_Indigenous_American", + "genetic_ancestry_Oceanian", + "genetic_ancestry_South_Asian", ], ) @@ -203,6 +239,12 @@ "HsapDv:0000003", "donor_1", "na", + float("nan"), + float("nan"), + float("nan"), + float("nan"), + float("nan"), + float("nan"), ], [ "CL:0000192", @@ -217,6 +259,12 @@ "MmusDv:0000003", "donor_2", "na", + float("nan"), + float("nan"), + float("nan"), + float("nan"), + float("nan"), + float("nan"), ], ], index=["X", "Y"], @@ -233,6 +281,12 @@ "development_stage_ontology_term_id", "donor_id", "suspension_type", + "genetic_ancestry_African", + "genetic_ancestry_East_Asian", + "genetic_ancestry_European", + "genetic_ancestry_Indigenous_American", + "genetic_ancestry_Oceanian", + "genetic_ancestry_South_Asian", ], ) @@ -255,6 +309,12 @@ "HsapDv:0000003", "donor_1", "na", + float("nan"), + float("nan"), + float("nan"), + float("nan"), + float("nan"), + float("nan"), ], [ "CL:0000192", @@ -269,6 +329,12 @@ "MmusDv:0000003", "donor_2", "na", + float("nan"), + float("nan"), + float("nan"), + float("nan"), + float("nan"), + float("nan"), ], ], index=["X", "Y"], @@ -285,6 +351,12 @@ "development_stage_ontology_term_id", "donor_id", "suspension_type", + "genetic_ancestry_African", + "genetic_ancestry_East_Asian", + "genetic_ancestry_European", + "genetic_ancestry_Indigenous_American", + "genetic_ancestry_Oceanian", + "genetic_ancestry_South_Asian", ], ) @@ -493,6 +565,12 @@ "tissue:1", "sre:1", "development_stage:1", + float("nan"), + float("nan"), + float("nan"), + float("nan"), + float("nan"), + 
float("nan"), ], [ "cell_type:1", @@ -503,6 +581,12 @@ "tissue:1", "sre:1", "development_stage:1", + float("nan"), + float("nan"), + float("nan"), + float("nan"), + float("nan"), + float("nan"), ], ], index=["X", "Y"], @@ -515,6 +599,12 @@ "tissue_ontology_term_id", "self_reported_ethnicity_ontology_term_id", "development_stage_ontology_term_id", + "genetic_ancestry_African", + "genetic_ancestry_East_Asian", + "genetic_ancestry_European", + "genetic_ancestry_Indigenous_American", + "genetic_ancestry_Oceanian", + "genetic_ancestry_South_Asian", ], ) diff --git a/cellxgene_schema_cli/tests/fixtures/h5ads/example_valid.h5ad b/cellxgene_schema_cli/tests/fixtures/h5ads/example_valid.h5ad index ec5f0aee29d9a739fe0048b38b37689e2ed5f6b0..a1b121bdf605779d0a34649907a8467a31fa1076 100644 GIT binary patch delta 41271 zcmeHwd3aP+()YdH^zCjIP1urj3c-vu3^;St-5tj_wB>;J>PtPd~g5prk@Au z{OZ&>r_NTl>U8Ja{Raj%KR9TAlC}9tXVtL1)&{Gy{)Ek9by%HM3*()RE1e2`8mpQz zpGm;nSf9ciRp%>8se$)foCm(u(#It%w(B_}zvW}bb;p=`jdDrfhsybWa_BjE-`lI2 z4nFNT8KGKkeI?reO0-%rPqo}Ju2l7pE47TBt6KcGzHS*DQL#tU3s~^~?)b0Tz|?S8 z6ONN`EaEFn`9mZ;Si)CHcpzd@Jxeq=ND>$&;gN{1#D7BmT7S`dmiZ3s(}|BP{u3Wr z_5O{_?BXApY18|6^IqHLh!P=Boii)k4Ka0EkI1p)# z@|QkfJG~M1_tX;9e5miKMX594R8P&N&Vma_DrRE|`b0BPRqk*=)-P*l z*80txii2HQ06D$1A?iXX>7^yBiy(_eZ!uKuwMVH-pp1reDKt@DAvk+!nQ9RnrUJ{L zozQX^+8fad$nTAkE1{&fmJAJjvqmh;}ev%Bj^9nocuCdK@8r#{S`1d7f5e~HrT)I>1=N`e;RC6ii^q(15 z8r7b5u~}&`^4R+fR6>J`i1{Yx_H7x`dOLz%lQdH#AD3a+E zI3LrxxiRXSiC8xW2A+l3+`-w(3`L>cDk`8IY}Sx=C~Y>8PaDvjUx1i4iO8Q1zYNnR z;Xg`gyB(5;t!BrF-Bb+y(G*Q%Q0D+;kZeF`7qQ8#8&*=gX~ zhnep%GD_bKm8&*z!+Xq+Fy!1HhRPKfxcnpLmvgh@m4qik1$_oCKF0iN7F1fF4wZ8} z!#L?#?8h89dW|;QlA^*_?^|LaG8y;tQOHWxR;&A=CRtkwZSPy6oed9gnc}LZC@Ae^ z?*jL^HAjUTb&khos>gG7vV^aZ@Nfx_K%8JIKSat;lJbX!;tb{LP=+#0!toOBPZ*AS zG}kn9yNQz60L12g4V3U8310=Txiy!~No>eX(Oi+{(%q3g2(}?*1~8r9AxH%$x`d0b{ZO2VThJVwG} z3B!f4+Sh7-IGc?l+#fDxYfB@|m693k4<$J`)cr9j+9vyM7DM~4nDfcJ_lM(HEYZxS z$A-CBp)PhYN1LJc_vc=x&9E#sXYZE8_eglJg!f7KE(!04gX6VFLQ>~a%d%~F;u>H76 z+90(TC9aK@$@2O~!g1 
zxma%;cyf^+4<$tULk$rZoF$R~k&}@mLjFRmX7OZgpgI6{APXE*v3&Dn)G!-Uho)df zS3%JftuKt7q9v+>DV?bffx|?S;0%$W;GBx&YRDln47Ly%4)sJvzy%~wIvuNZPs3_! zDOF6VR!SYERP+o?wNNULQW4WJwUbglN;Oidl2RR%N|=mp;0x1nw%mCL+bE-$Qb{u~ zb(B(tlxm?=HKii*F?AYK>PWaa1C<&Do;)O@p@hg7s39^I+81Ltzs|#MWUdnuV$8nV2e|l#fz7DOFCX21?aZ>ijHpM&F-_ z&S>;(Om&@wDc@|YIcpZyTs|A+if5r*Ev2d{)k3+aFjt)c7iQs7&4ZZPNb(_<$V}Km zWEMO|WHy{7G6$SgF6CL(lL^eYcksIM6k((5fj5bM|jr-R+M>N+Y7q+D z1!ctAPmT0~7l>5_=ZbKg?}o@_xIzn;<0Si*p+5IO88Xy^P(%DS;MGp8M^|H8<*RUQJ`Rsz z{-I*bKEDc^a#Jds0=~G_81NMm^sUB>{gf(Sjf2=ssXF3yP^y(uNhO$yD8^I)rLr-l zJ_AL?sL1c3su&e{7ETd*4lWY$!_X2W4KSa`aoAOYBF{q;krPm}290BnHE0}P0M8m^ zy$B^lUZR{#^<_9qq!A*oN74kjL{7pEBCo)4BCkR_k=J184M<*xA|h|VVIs|B_cPT~ z;9QI350FFTP1r)@k5EtKEx17BZAe^)i>r1$uBnoBxTadyW9N>p!$F8HC8M%V8w`1+ z)b;hq+ey61>yg(;yh`GAP^y7a?lMfZQL4BM>l<2%^&O>D5v5uvx4IN_BQ_xK4D!^! z!o^b5;sfxMA^8wWi2My|hi64TS7XxUV+iqWT1e zZp6HQLJ<-;k8G$V{&_e<{0rdRgyb{GK?2R2F*|#WW{0~rp;i6@>WS;U5xIG5@oYfd zjJEMBNZd@Lb0hY8&1P+|+6Gm~P}|`Yk#9h~5y`itOkbFBBi8dBlo9KDc#Oyoa1IF+ zmt(C-H=!yWkbe_0T8L4N42al*sneA5>8TQ2mfg#7{Zx;Jlf3` z=~gYuzyCJv3(FO$7}1DmJz^K!d51RLTD*(mbf`ONb3C^b3vi^Hrot7wv^Ok!Gf&NaCon_ z^w2rmG$Amcs;SrAj%aWk(IS)5GCWwneg;ak2mObiX)~sepK-0*<955VG7wv#>1mq_ z9oTbE+oI5c-Ek1dR%2v`GGaP(6eA9^D4LEp|pK5oLS z*)y@R^vrZTeTKxlaZr13dslD^S8{t1jL@^C_Ha27=>_d~qb|K6@&J-PT&^$YMRB=k zMloEjAEQ_}Lj~fXok+Z14vG(IhaDboN*WHzAYEZxD;MrTfx+OUE)RjBM3T6fp{g3v(j-WQb@HZ zs63=O)KpdzmljULp=x~G>^!JNgS%SGgFMip(a$UaRBRONA~Krw8>8!R=rHa_5=SSK zh78=qD1X`(pnLUd&y0dASO4S~bRv(@>BC0rYA#oIG}5RJ8Y3~wdl;AW zG*)0bOVk|^8sssfl^D7PGj#^Kl(ag~rBsig`g0(W$Xu>u9*Y$4D9>lKfK^_oqeJ&$ zb?1*@yNlu65v`Baqby-%^X?PM7IC>{Ty8m|6;OY#en3pa3K91U;bJ`-3hx(cuF)AW z*Be8@yHHU!m2BL&d8MM<06Xr(de*|>`%v$7toM3uqm=bVcaoOt25x^NqfLx9GrCbn z&^XMVz=N|z=R?GByD?ffv*0!^ww=)}P!ox*RxqkOaLzGA-EqL_Opf`Lp)Mce;$B?3 zeB}){Y*cQAGxwvdz3spb=MeSw1NDw%_16dToyqDQu;T&D*m>X#f?WsHu0zy2A^)tc zpXT-8;@u6A&tp0xE6bzof&Afi7y6E8-bKTJzGH_i#-5g*o|>ZEMZg6KZ)%B-qdZiO zXWCk1_i2r=;Yn>lKv_CM7Rnxlb5EkO 
z9pHHimGwP&nX>)h#X5Y>RPTW?&uer0iw@HXicm9nVyml(fx^F0G|;~UQqbDGTRZTR zb%ZsdLGwQ`!rsSX^*xKn(T~^`n}=qj-nBSm&Tcj-nmvHmWhqoWWSiun2Zzw*7EG&n z5>F;58voHp^sjgx7eix~gg#Hg+uCv%=GW%;6@?Q`!tOo%$SpV?c;s3YziE9{A2+S9 zq>y5^jmkCt9VckkYG35ll@01kT9dW0QG@t9w1VEo za11x8+6bPPao(a~$ICcx#V_ef`WqUx@#+%MHGD>dd>@;Fn6`n)r)RktUnAjU3FGrf z!S8_gzoLy%{nf8%uUfAZRZYb#%&Tu2t@a=nMDSOHba@2yIHj$%C?{dtDXk!|K+-}6 z5}##(fh<138+GT4Ap@DA7r=G4xb(gLoHuC@i{Ij_6k8zY9UR4^w=Y}$S@70r?F}n-2i7+`<^-1i7?YAoZ?clUw{#`RB}+c# zm=ow;Hh&%S*`G9@_2?_1%FxqND1(P)f7UW*6(n0Juv z&ZFMV;JJW$N1wk;@0sx8XWCk8%jX)5KBLVGnESb=N?PFX=i11jVtdZPtnThP{ik zc_rKCJL;S6RU!KuZBMT69cGO;3JL6!Fs&M@ROC;=^k5U7Ue$Dh*0^~3(^m#!l|_HT z8^rr*$ZQpa%+`D{Ti^3+eXq~fk33s#Q2!&&mg9@i*`hBX#HPU6>fqVBNVC;(*=#L@ z9=mY1d|zs?eV0}se2^s}i;TXT2&f&{-yv!jhNxYnt6d6BKcm{`{p~-K+R>L-?K&vC zglZRKljcRX%%ryaV2yo-y7OSWeTcQ?(q&37=dDrK%v*!Jf-8F1gI>WZlfErK^DuY( zTh|w@_1h>K*iFT{rsc4si#=Iw^VfH=M_JGipVI8>D+=9QG++V){u=Y?+eG@b{UcN^ zIJ3;pHstCIRYZM3)mW}kuT4?1`Ehft?nw|Y`21nPJfnuvF1$=dh1CXbfOfn64r@bq zJ51T9y=<-hiif(du@_3PPO(C~-SFoYcWBeB`@7jifUmat6T8|Ms)12jL(iYJ*7INU zo(}NW>9a2Ur1e;>SSCSLn!4Mk1PYXfER!<*krWfHcQi6#tiEBf|4LdnCGC<`f!M#> zTwQ&U+)Je^ENZpoi%x7Xb>b-caGx)?kM@^7s}-mfm;F%3_-PWB&_9>uOFv1vJYW9_ zQs5Uzm*t24;%H{zCr6hFN67jYFG6|8-jXoV7{hK+Wy_hD6lMn9Yi z=(8V21$gy^PXTT}j8hP11v>YC80Q}MK23l4RzJ5NP6ayme^?*CO6UF$=jspt{-+}{ z(+EJqtF{CveMIZ|fAyaShWO7S#M>0&if^WnYT$oX--wX<=%DFRsy%qnbkqMsRhgGn z^>Ui2sxALnfRGbXiwF?5;&_QDr-Kf8@i)gkH07uWp5iFp>DFk#>UbTVH(+Cb9XgC} z1fvsm*uf@1Y5WtN%nocZpkf@o33oks6o+DS_&b6BXr`uwbVSV5wvd_H4r_bZMZb z9?~9rGyi)P=-mHdoO@h}o%_FY|A%t~I`@C){txE}@bkCx`5&$k=-mI|T>YKT|IYnS zPyg^mK%M))bN`2P1UmPB=l&1p2;lk)m+PgsOY1+J3h4VUj0$w_|8TB;^gqKm_tf*w zUlVoi|8R}~+n+F+;4mxDx&OmB_qzQH=j!M4hj5NS=kvew`5(>^!1Wha4~W)(I2F+M zUlk6ho&Nv-_UHc+T7Lik3=rf!XiJD`n9qODQow6nDQIPUzP{;GdLx6g zU@tR@L5;L8tPpo|>_mCGfJ)#yFs;9!!1#NYUnA~g%D)ZM{Y?0FOzUq*Fyt!{>n|)Y z@D9Y(u&AG|9K)-~y9OAy!dt(wjSbvdunR`?wBscii|r8qv29*oabw7>SWf(fk+KlF z_Oeg39z92YDMYvWJBAf3T3u?Zdaizg;#`5$@_?=#+^0zjJb$_k?mqud9e2*YJ 
z2&E6(rUou5xedN_X|c`@qn6?WiJHrLbfFzaCTlC54U4#uVyM}x&C^T93x#Rmgu=zU zS&nbAf}^k5f>)v3S+zK};(QwmG?I$=q1VcYifF{HG+`Q1Gp>{{-Hm0=C-ZN{^tW1O zj8|^)OEE-)+a!VQ5~kZA%oS{u@Fodwmhg=dzKNcY<_gOtfh`g)lJGJKFPHEN3FC!) zA?>Y_hiJ7VP%L4()WfXs8VS=ifaZL<5XX$^!W=WMSSK;oOSn|RWfC4M;cN-#Ncg%? zT%pkIPG*hhHXbvkJ0Hz>qJ$?&m~QVe%jZg%u4fA7SILZLnla7)EHeYsbTN{-09~A9#&pM$8Pk13W=vMajOR<3t_~9U)V@M5 zFAxmE^zs5Trkj||m>w%;yi~%41{Rk0P6_W0V!eH3PmqBrMcFIieG`j*t)DHxU`#jgm@6c%89OBGlrULQvwSxRcNe(AxR$DiU>KPG4%l1)rOh}( z!jTg0DPer@g_NfUC)HOWhLE1L<_74=Y{pR%j+QVzdCl_lq&8!EvYT-n!iVOsw}ff16xH4;vi zFkO*o9#W5lQzV=!;WWe*<_gj!feZ;}O4uvmED2vL;gJ#^Mej^8D==CT7z4+~*hkpC zX{p{c#R6we*kWy2?hL&1x#~r_`86}mt#ko(EZ+8NgZjVVZLgYfCv3VWL$O0UUiM0N zh5sIjlgh-{v$9g$X?V$0`}4NAE}5xHH#kSdyF=A`DAa>ctj&wJEMEa}TW*g8T*~SMmM0UT>OL8Em98P=O&( zG!Ba=Q8^dhFpem$OGb)vHRS(-jA7t8j$}AAO~PTxNbxEoK%Hvur+Mh2-D@CcJQhxd zb}Ee5A<`ARnVA_GcwMyT-^fa#mRxv6wBGxa)HEf{NV+{rIwVfO(iw2(1C+^x`WL8! z9-LAyL|#Hx7FFm{uO)(g@}wvudD^nFlu@8g#KNPY{zDWQV~C_=D){CSDw|D<#f2{| z!P$wmrQ;33*U=33(^9fhGn8>~j!KV*s=uMo1RA1#T3TwVTbXFAk_>l-TbV>Ux#+S< zQq6<9DY-N*aa~f4 zk47?0$%h&rRxp!SR%TjiT8c7@RvRwtbZ@#c8}eVm(sMZ9ot}>Qa~Y(%)3e;lJSfV= z3JMJMGjY+*=ki%;DJjYVRy)(Fs81#<4h{0deuB^_68C9QW{&N#d~pk_Me{t9B~2K!t2ZtmOovh3UWBJE!@ zx&vxRWG9R4V&@)`}j8OyZAQj{j43|hFwL#h3`$l$#AI$ zg7)%w=apozncOZL@oq{+TE zuzfww_VxLIeLcbU72l;o_VvY}eSOJfUtebX+Q{~`iN@5huP52QzCtaz@Xabna6h7b zea%SX26~luYcre!&Ukg+t;_*zP@9KpnW~f_Voz{dd==Sx$*uK8Y_Vq8k`gQyIp0P@F``W_x^{mOhz8|o!e`Wjn0o&IPsdl!n ze+$^xkEk*iUPm6ZujhjH^<%cLf9I9O_Vpho`}zsn*MD+8+t*JSuzfwx_Vt3HKH1mL zO!oD2lYRXnXkWkND%ie$#Y;iAuV1r$ZR6^6``R9~uitPdbo=^k(7t|WtVP|vejl{2 zKd^oMk+Kc@imz%R1EAa2izfT}FRod)uRjIt>(3_p`U~6Fe{&7GeZ8dDVAt&{zOv=c z?`-|y^h-9cwPUBh9roZQ`q{7ACR>kU9s2FZqK=*VRW^wN?!qd?m7^i2YkcF45ZevM&f#6k&3pL( z&*?kF!CY_SVjKhZ(P6x}Cx^Udp>MYF_G&&YtDvY(h>^1_KIF2B%0*x$un4>a7J-?- zRMuQx1acx@M2d(xHWs0x5D<}~z#>uva9S#VM#6uR@Shn6W8?pl1m2bKUnKmVgj*zhR>JQ~_^&!P#l{~< z0v}5FZxa4U!mScMC*hAJ{CC2%{>38tha~Wcg#RhwPbGX_!WSg`nS?(_91F@-_;Cq8A>rSJ z;tJ(hC_{Nt!cR%KPQvvPepk@uL!p#ysCE-6v_)Q7_F^C6aQc>Ot 
zGIad5gx`S?>+y^%BQ*^VeJsJS7thG(yh9Ir@#Sa~_S)FL)5BhkU3v<8?JTK>y$%k0 z@vw{v>tSzKqaq4>yK>ox|Q9rm*)4c6&MOy^_P;2zGV#uovH-D8k;J9QO8trczX@ zH^;Pn^e7f8@6o(aaMsp~U+WaZ{<9v);vpHm)(PK^Mz3{>>ulaY$6A;12~W!$YbH9Kz0xZvR82+JD6Q=J}4Z*5fKKmNgT)yrO|NVOZWl@g~K?I z#drKtAUnd)OAlnPF$J>89LTykko5!t*%S_BQ#p_|&c`T_O=mqgkj>ygHj}h=;p?9C zSepacEDmI^fowJhvN<$aE`0AA#*#)LJI+W`AUmD| z*$EuTP7DOHlQ@v|>47XXPO``8$AptjVeAwRW2bT$i^pSuFm^hJu`{S07rxOK35T)y zMiRr=nHYGayy3~2Rpy`;ffF}*Z*!djB;!zkK6D~AFFpOQq zVeDcKWAT-2;+SwLuXa6*E#xp3kG|-baG4PY(lMcN?3Ja5vG}SsIwo9c3S(CV!r0Xu z#ujrJtDkjo7`rAA#^O7GDU7|rXqk=)*9ODbbsWa7=XJ$lY^f=XE#okD1Lt!XyO9Bh zv70!I-E62&VeE~jF!m->7+W3;W4CY>9L8?tg`kJAH**-fjjPkc*zLhE_7?7h9>!J# z!`MoFCEDB+#=_En+OD_O8F6DEO6#kKWmfc|Vf}G!D4afrk)iKNj^!-P&%xi0;2S-f zpQHFhQ4-F*i|2C!@3oIel}u%{qE3n6BkuAx7Vrh&x#eFioFmCd%n!ur z!o4NjN5}Xy6$5N{bB6d|z4*F#{cR4_ygEU+du zK=W-L5fK6lc@Y8&90-981w;@p^1EYw98DMkJ){P%FtF(HA_*^+@Dd3xm2jbii$pXb z8dxR?ESK;K39pp!DhaQaaIu6-L^L53SR)BsFX0;`yjH^NB)ne2r4lZa7u5zyV55XL zNqDn_ZXqF@}Tf%cBJXgZ=BwQfj`4V2hxNE>LEEEwVeNk)SFA|Jj zROnyScHz)S|02O=`XWJN-%kG`0pF}GzNpnhM&pZGd<{JXnq9dD{fmTdT!a2aLU+Sy zq%UfF@E5gL1S8ffxqdxjjo^qik}J|9)}HLWQ^eYfCH084H%F{}49AncsKp;$=%Ft~ ztWg}X;)}NFi`p1d#M+NNT#i^{Ibw}t1@(wEJ{Ym~=ZMv%N378Jv@M(8f;>4 zY_;9==ezI=1BS+U#bCe^88~363>>g*1`b#@0|zXpfdiJ)z-T%7m$X8Gfax>@gcB+9 z1GeAD57>SK2W-EAU1CugI6=aRf#UA| zE-^YF@EQqUFJa*!8S)j1aFIj>!bK8zy;Om4kwku(lrKCakuN+XfrW>}7{_0@NJ2ok zNCFENN#Js+!7UQrDq-OviSoij(lO1yaFK+-Em8$yjfnh8BVWv+2rLA)NcmO?cQIkA zTd|oKgf$7%Bgni+9TIjLSSTp=xxm6h5?E|=#CrRRFeE}i7!rYn;SpFECV_?F5?FXh z0t*jGVBsM(KW9AOlcgRA7fBQlE|S2)MG{zeNCFEFNnqh22`oIM<_aUU5-ySu5H6Cy z!bK8TxJUvE4@qF*AqgxzB!Pv86o~J~N+XahVc{Z)3WSR!u<(!s79Nto!b1|+@Q{M> z{UoVDpM-NIJXyk1BrH56(V*~<1ZEEj*Pmd_kSY)^lE@b>lEA`65?FXh0t*jGVBsMN ztb0hb{slw0NCFENNnqh332eAXf$|Hb4lI)JVhJxn91`Czl>`bUTqNOT5?(Ih6%t-4 z;Z^h+WHKz+BmZ6R*e=-ELGkc!!&3OiWT||{-U^xnY9qDiL)BU(`1aT%)R6~@98u~h z$k}6$Qb+4x41=*c$YzkEgX2LYXKL*($UcnhmfgBY z9`dcek1%}@+TOP~AmWI9gVpy9pE3sCt(0Q?qQrQzr^2*{w1HON*LFDeXD#;WxQ|t% 
z^yBsfXnE8=)8hLPhS%B)0ySiW){qIu9<>hx_v7}NU48BJRKQPH_@K#_*H?6n44LTK z{x&#LYad*7+VN=JgE%UUwe|>W$3OHw9Bs8YtnPpEk8J2q%3PubGJWDuv*lxY7AZ>I zF*^)<(4HI6?AlPxM&cidS&P53``?V!CRi*H-@#|c?5nz+Zs+1{-%7O zIlgCU{3Tnw=CYK2&z1Oo;-2PxFI6%ow34x?gLCI47B=cQ{R2#X%D$@Vbo3o{Pucx- zPdO5_1C9N<_&?@(=1xH&& zqm7=)iqdLlnxG>^X?TvF?F#)AME`^w5+jEeB7JI&ff912fz$J|_5T%R=MOwYLe5aW zi-Ou$HEq{BRI0EJ`%%M?IlC^jL*w-I=qz@!c%Hw0g#D6wpw1Fu%kF`^Q0Zw9f6}p7 zJqV2R js%nW~`9Q5R^jZT8pB_MB;hQEjbP)6PGTnv3^icY5M?LWr delta 26319 zcmZ`?33yc1^?z@Mc{2lu32VYO32Rt0S!W_3iGYMfNLUjPNn}ezAVFDzK!QsJ6-}ga zKwAOVprs~Qo#2|dRM6TcwN_|dptcpPtzc~hm%smW?s<3S&H28LUp)7B@45G$yS(%6 z@7_DZ@dHEZ_YAI2bJw5nHH_Nru6xBvRfaF?LyPIb5fNVnN;PPxWF_bxBc2RETML?rHTq-O^aFzP7O)q}pioE!VTeWXo zs8eKI$fMeK%Wltxgi5VXJxK=Z2IgUe%Yj3P$snB`M%)Fl&hN*t6H`A^nL7Q7J&Eo< zk*bG1?KeTVrH+ukw)&eZ;4mgbOD`({&i1xa;4Fob`d}!jH--u+R7j!C z6xvLoMhZ3dwi4k~A1eSSDbAOKalSqnmzRWb;XW8wL2(s*tnTo_WNRSoqqwt^G442p zlBQtjB83VuWZnc9HltBnr&vSGo8izDD+E<-p7D`CBv}K!1zy#+G)d@_fdA2_Wh76= zdg^%m6iz89B3Wi*TNoA3S@7LhtJIZiM($0q5`3l`WCw>orG`!;mDVaMw+Iye>{40x}E)-jgP zRX7?Bo%9Z@EmPUCXCCEV(gRgwj_pzeUcr;$G-NxsCMGaZ7H{F`&8g1_R*xlFM2i58LEat=h zbZeCv1~X_aRq10@x78CIZnI7X*9)KZ1o zAdx;;;2{DJ6}XSUN%44rk&F>eJNw2f82tq9FZ2fpJdiMyV%rj(#ky7Wv>fz+1E4a; zN{>tRW9kq%N128=I-{$~l$?o~hCm((ESXVFrZo}5S6F3{q%14NRqpJ=yRiso-S-H* zTi|;Iz7KH<{^_Bsr*znlwkJBH_h57ioZu#%lc_Yx20(Hyy5|&_mTOJKChW?!0;$dh zK7vY4{HV%w0@bjk=O`~05wZ?o-eVyvkm1yQPUL-F;0WP}KWwGDmZ;vaoz_yS1ON0h zUnX$5z}E}BOyK44z!lb~JZ^RsH=zo)cARLfi=y#Q_p4i}ofwDB90HfdSJWq+)zNG6V#CKKr;$BI0fI(7q7O`OfRHP#dsf-qJaFXG5+9G zD-AYN=p==juEzI&Vxcu0&QfR^g;J(sYF#0wmQW}-)#?N<6Wbn^G(H9C( zK-zRHt^Qie1G{HfgU$ZXOo{l=kQfN5Gf@PfVW!m^*3Gn1@fpN$$V`P3BnE+b4Hj~C z1{P9y4Vs@a6GL?v!ofR?N#-y(Lt;22&q9#~#Uw_+E)pZ5jl?LpL}D~#%tkQ=Dp5eu zEG(#LHWs;)LPayMN4}ViJ<>`rr3e!)QmBMN!Pyw9r%(ljj#KC$hV~X?;3P$bDU?uz zp(+X$QmBzabrd>Hp%w}Sms(y3%&}78Ji($l*phKjGY227@o<>L1UNA?LKo-a znsDTNTob0v!_XoMmCVDEcg@F=cg{n-miegHLZP!1I!md^3-HOA4S5T&FN>gh0h(0| zhX~DqQ-tP%e<6x_kVj%ZR4&Bsu3CVFG%dtJ4lb~KaB?BGy9CS<%vK6{C8)RvDhVxy 
zLxiq{4nj-7Uy9;7C?K&EYDknpb1ABn!x=)jv|fZ2_)4*YvPD?za@d6txTbC+u@Wwk zSOpo2QB*)BiAp#`;s$8G4p+NhEXLLDYDl~mqi%$0B-X$V5^LcY3OKkJhk6MP;YkcY z7(-??lrKSr^-z5cmhg)uSi()vK~Xn@|2h;Kpn$|is3Eb5Y6_W~;T(w?NL`9z3zU+$ z1sX_fg%c#Ufmw!PJLI8&l%-fCHW1O1vTd&7CUqrBpRCZDbE5wHZ6U@6IaT#{? zJy1YmH&m0j7Y>oQ4?0NHgSi|<17wi6AIecc)iNC9eaq3HpF$hOKR|3JeeZINPFsP| zKujnd$|zzdMeK)Wj4&UBGZcS*Ihv8O5~oYb3e+fEi5fqL>Xn#Ow}Nb?q$UcT#*lds z%vIQFhahhimhlL5tR@n(3LEe!9HNxR;1s1i4*m+XViguyvE1qkGb@mMJqeW<2^Fib zw8jeTgM*ZQ5<})wU{+$1uL6_uE3wKjg{ml2L7{^b+DD=GO3c_!A>R#F%HB#0gfRd? z3RO_3j6(Y;R9}hZx87g{&0oQV8_>Gv$l#tZVKq9)=b?NxMnzyhi55x;na99?BZ?QG zfW(VXL*gZ9Ch;BPu0ioCl#*zL1`=&>g2ZcJu0`=WA;1Y@7Kt>gcccGHRZ{ZLM*trg;)VV60QmyN-eJR!04Hwsu zJE^vYL9hzr>M5?G3gh0UxPutCcRfa(q{J|V5~?v&MWMoKEbj1nEUu11XDHM{sYy3s z>UoMQz6sa&e}`{(a)h7ymf#I=?mGB%>WI}61dG3PnhMdEX4LjjE&umImC zEZ_^s*n|;D8!_VaYV`26o3N;V!6Ay>xe=TG`6g?ac^(osWA+O$jl@^5gT%k#7>Tcm zd56sZ(2VE-<7;s4{ujz?Fn{4@EVZ!)tGNgr6wyKvNn5BxDAZJiFW327tih00qq?>e z+_MEA>M}&oN^1Kkq||;Owe}k9#*ACAYkgZVq2d-S+JJ^zu;|TOFzFN~;cCR(iuSl6 zZ!11HouHD02li8>1s#OEU~WU?gFF&`sNRN=o#7CnE^rD3v~R_!p0pjO`eiU}J0^F7 zn(Y{0wGHE2C_VwsVT73oskNBg9g1ra^?+R@dO{nCUT_HooZXJL71W{GDYclpst(Og zg8g-vEg4Rb=nI!f^n(#QQ1pjV5(8itiGgs8L;%i_NP*;A@xe~pfkl?yiUT|dYB0ha z42MY!fm0-KA$S{#VUR~+I8>5IgZ(5%zzGr~;SvhhC=;$Y>KT|k+QbzeempFU;MmNu zK@5+9^4qOGeaHTfN*2pvAh27S`Nd4{B7p`t&P)lzd7(uw}PHUDcVVBx7 zfTq_xzUOyZ2`1#<d6{KGw_VmE zX!ZD0{4MvfiZ8O`r0a5*E4&*XOt)5|C-1o1S_wQ4#2;HQFM`$l3_kLdzoZgLd z$OUajJpr8QXO4RM;7m{5jq}07$bxE$^fL0n;d?R74<|@;=6qc^tt;odj8QkvcR8a3 zNWCBZ%YpkWKTh<5`!REO&fJ3)dUEDo3c>k(-Y_&aSc$%jaAr1cxhBJ*-Bur*-iPWj zYd>fs(I3u`7@!J*v-es1eZkCN7-!oMXluZfp>T%8FfggY;bcPqXRoS$1Xn+js~^SH zk7hK6Z6C`h9n3wLF9@k5GN6D&CTGuLd+Xl!;#++;ZfqWW-&+UapW^fw#~S0gkO`^+ zsM~{Yzs*NI3Amo6&W83qmM_tslg3qQY8u8>oOKdsoy@jP;SBkS7Iw{UtY8`(djKt& z4yQ=W;5;)q&ox|K$^&+FP;?UKYRg`$n>*8(1I>G}Gv{)Sd7NWD*Rp`oLhgYQg_3b| zPklPsa@mR-Z=~QNRRYxQ#agKKxL8#)2={?$&MZ}EFQBVR+h-*t+mE2y=A-QE)~_mG zyRLHi^-#PITd<7nSbJmxFOV{!T(fzE3h6k}RH^Siu 
z(PyvOv%^2!T)U^)mu{}xQ|wPStM(kmnCd+p7+$}}>^vM-sny@$Y`+=bSl$4xQk&mL zv=K6H_x80yd3m|UCP;k6lkCmT$;rwzHZ#b{$jQw#YT(R^m}d)^52CmQGJb_ww?e~3 z6x-m!2cEuGFefkD*bW_k#Hd=R{@BycL&elVbp%s0^1@+b2jxrl=H+DuvyEH9d_UyhF<)gCMLM3e7Msyp zyfEMm&tNDyY&{l@>#PhJpzWglUu}GJix#S%vSvhcbd5LtGG%(>5jH*LnRwHSV8;<_ zmFsMC>l zS`f811zz^}2m7mDjW_ck9C{i1HvBe>e8OsVx3pR?;7)5{)YMdmsV&FtSL_ML{3wew zziQ2OmFmH#(+lTG-*CigpzM>tVFC{pn6~qr`a>g2UbDQOXgdlbYu>V^#^z&b{7f3e zV;nwVP49fMjoVsgz&($8ilRk~h#&KjSflR7vKKi^9TdOqn69JZD;W(-f>wXmyFBv2 z-*Fy$g@CNT{9WwQiyhphC9jDt9fS4h(cj$8UE1ExU0Qa~@<*zF=bW`;VRskr8n~*% zS`=+TW_$~>94$zBhpown#NS&pXc~Nc!Z8hU;N{<2+uZwJXQO)QVas(Gl>mp|vqHre z$8t&fwHIA1rE@%;`O@d?*YSD@|FP4*VR@0&j276D;vJrR@$Go^n89r+-h~bAMNoxP zKibrgddkvHa$g@j#hGg$DcY0?bC8I+n zb`kZFGpDe#shfV=ZjFTeH$2nf=${>3JubehX)jk#mgun(?D&wo`r!^oS6|8g_!G9g z>`c7n_0aGMTAub-K&RH``n8xOvcWcS zJjd0J7F2xBEolAR(Sn3X`8j6`rox&we6uul@lJQO{{z17;w_0z&l!%w@KO1PHEx6+ z zYiPS04xYC<`F#VJqHJhxd&q>cw{Smd=2zCvX-#)>JiXy5r^fW0IdP7_d5DKO^v5@} zy{I0D=s3>AR6PR|{zhdP$bAaF<{9|gPaQLG77ySz_5faDW&P~n0Dglr@a%W&r;IPe z0M5pO^%Si5)*irb;|K7Pa{!Cj_afDr-Z}rDTv>|ny>sIqCptckHb(mmFu!+9%K7n= za)D}7#gE*k(>6oYZAyT|A8}GPy1nquyErNTDI5Zwn&>vAxx8@qoz^w(Iv3t{8Nd#q zJ(6mA9m|~(eykdg^J5itON-*$LB}k*9qr$9)6YYj+Z$W*TpQnxC2-_>e24hZ(C@9W zK0i+v-466?mUmrkNsx=uF4(aY#0m|A&R*$r_OJ1J(RuM1$LHI6jkamJyr`-F$9k=* zG{d;KTjBABR*b$EZK_*jCscG|yR}}ujhig&(rr-nBkoJ>@Oj(Z2mi$*5q+Ys!>aU1 z1fkjQosH{dv$J=&`=rm?+;$m~W9MsNsd@|IHraPwyrt1mSf=_kx*%GPW$ICISfW@rhH6w%3q^b3 zRE4-7WwY@*4ATjv#%nN4S3@*ji(xv^)VLDEbQMJ78!${2YrGo6xcu!LjkposeZ(^{ z+Slvh$YJjq_s*Wk0RG}x*hhOlvQzhsZ;Tfn3gEqtS4zE|-M$CuxynjG1^SeY;dbh| zveU+m=^THY)r`6o>T#@l%;R&PFIVBRMP8rVw-mz<)Y9To*G+wLxdT(5I5G8s6B8YF zV(K#|rg3*->N6+CKI7{%+5qY_r$U*)Q9#h74nqz?%iG5qOKh zw+Osd;B8S{%WvxK(Fn$57PVNns7~M=0^cg|Z35peFukx;`L*`z=O;SC#&-(5Q{Y_! 
z-(_R%Sm>?b95R={ZX4I?44s66M_}p~XMyyRablmqeu3#dC>r z#?#%Km5W=?h6}28w>XB%#)ejfLLJ5dCk(iDG)TgUXA?03J@_c67k(3LW2}xQzFhgE<-DkkJhi|AcXuL&u9)a48fXxC1|uBKmpq(DqRriFroDj@MCn3>-RyVk~t}z)YtNcRCxw zEqgRMFDuu`piO@|AW~6TA?&Cu9+K>=te}x?Z~o^7bAv_>9C`$^=dx)TXqyqDUJBs7 zUF?>Cna3%a;f%0x1(k?a&S;p0ND?)f6Hx!0Rz}FUh6~Ee$_pE_>|9}- zqO;lloUCB3QAFwiyjP`;N`hJ0K{~g>GnxRNTdAJS4(Ei7dEER^I6Kdn&qZa2GP8{Z zJclyqBxE7$hl9b8QDXOJE>4nCE<2o)8Oo)jtVhwj?98CC7>e(}YOduX@`5>GV~K5X zC_CG@j>4#k55rQsv>ZHoDdSplbF%V`az2vDp+;TLR%L~-68!Q5&8uAOq2;U|!UIcV zg>6W7F6yl0gq#pgk5zV=!His^f}fepOpLGO@Vys2 zU-2Y?;_3R8upX8?irX4NFWmAdZfj)FTNhs#e0c6Yo{s!5-CE%;%l5*kC#-Ey-{!f> zop9cQ37qpj$p0U&?_#D>o5ns zIyBZRHjVWXPh-8p(^xMdHP%Z=jqN35ZL|WthSUl~)SU*b7g#TXwZ2{iYpfT+8tXN% z#=Hi`X6XpM2-dhvR9G(X^#U&wc)7qU1YSv)x>Z-WN+?taTq*Dk0&8}!%hTHj8td%? zjcL0aoVT!<43p`ceLV>3VJYC=! zdVfM2pqZG)nu%$wnV812MS*(%LZ|Ef3yt;ug~oH`V>M3{IA7oe0xuM}MBq|^7YSUu zSVUYa@DhQq6L_hOwFlDAxyG6*YOEh~8`m1zk!S_&NHo?xq;Z`vz!caeux4Vq%QX>W zjHP=-2F=8D26`tr9g${YI$bj{jWrX~*k)qU$6OOJl}@eGOiX9cOiW{{+-ZPjVme(j zF^x47)0ml9%+F{dM(I?)+Lv`!pqZF1Kr=CoH51cVGck=d6Vq5RF|0>pO~eqZ`qkHQ zoCXXOEe;5rB5i7@&JZ|L;4Fc&Z5)07=Lm&dfkOg^1E4XZ9e`FPRGm!SN73957K z5>yvj`Vv7^m!NEd%9o(JQ5{53`IeTt1eL%mVs!~BQC))a(9*v+&e~|`!IP<;zg<=&&e_J+?UC7KPJyeP>DPbVDdbW8^Yu{z~nh4MxIl7{jbRL zAe;9Rc^=H&`ta}JYdo1rT593tz8p(^s6$K?5nD0v#8HYoPv#*}rOrB>_w?85 zSLC_U?oT4mH*ncZo>#}n^NmcN*KiSvJg?=Z=Mi~cM`0$^|3Ohulz^3XAP zwl8xk^1PkxSLC^t3}W(J$8#V=F(jSE2sNY z?ksk?53tkS8*{q&l_YYy``GF3XJvM}4^kZg^C8J*k}T&tp#aVRpI)qE7c9 z^>P5eo1~oXBQdA@C_CN9xCfNeeLUuLpJ1o^Bs<+kYDU03%ue?ywvCAuINDX04fcDnDg)BU6USksQ>2UH?E-ILTU%ITh>;_MyEKiO80)BQ6$-48jx za=IT;rv&f|P0Hzh!hNZn?rH8WcDiR6u+#ku52JFrpW3Y;r~6lSx_{&1mDByZtx8V! 
zGj_UX9ZvUL%;|p41u3Wd4?7n*-7naF<#hkaPWNB-%v4VIOUf58&$F7^vAn>|S5EgU zE=oDwfAbttPWNlpS5Egoc7Kx7{V$izPWPLb)4j+}_YxPOobI=_#pHCqqcA(&@9oma z>HfgAD5v{>{3t7@`=hd|6FFVD?+(nEhfU3eS6}zc_v=;iZFHq>g}YJjuj%;PajmF# zul6N)VRf=K0n|R8zNglYZ!2jSfFoz{c45;>-Ujmr*YPsH8>?w}IqYuqu5br0@P1xY zEg@bl(cV<6<E02sD_x#ugT_|)2{Ck1l6Zj7Tzc26~1^z(pyqy#Zrv(0!z<(C_ zLxDdM_+x=Tq4B3#tLMOJfzJs17lA(&_^$&0P2j%^T>F`bI4kfufj<}c9|HeG;5!B0 z8INm?UGWjdT>{@N@I3-g# zd3!2^fn*#q@;I{-m&N#O7cEY|D(!)~R zSM9wed&7$Ry6v?n9UOPEmv+Q`JxpB`_qBMLr?@X}#?elZ;=Vp!K{NOD^Ez8`-_E>@ zRNS`fZMLPZwfD-nfs%zfuXxi8)a(YJNxMY->M=DrJ<`z~beTf*G8l%Ed8eHStJ#XBE#fk$!QYk5c% z_r;4J>Z`hn`(DS~*S__kxNjMA-*V=@xZxJvaKyVFn)@!NZc*IV-f>g6b?l2CnZ$ip zG54+D{EGWlQl|v)Th5C6uI9c}-1kQAF6O>#7%=x$S37V+k8bOzoi^-!;=WbPeXF^6 z#eLWFh=+*#-o)JZW(W7(5aYfZxjPj1-DKw??rZO_Dehat+;@vTGZpv6+aJVzw{p)a z?u%DIi25t;i(iqZDW|wEeovZQk>bAi1xXx=B=hwLuF6UMO&hz@-8&(v(LRxL7D$EASG5uM>Ewz-0oL3w*uo;$=c%xxgy~UMcV@ zfhz>A6u9;V5wTj}8wFk?@LGY_30x&`bv&*$*2hN}Hwk>Rz#9bKDDWnM`8khGV*Qx& zkYoSr=UmSL?MO7%jznYacr@0INn`D}H0~=5>gU8X{{5X17#<+-K!F1SrwE)X@F0N) zJ22I+egM@Q(j%u=S_LM3Za&bv?=fB<&xTz z@5~%SZOSY1WuJuNr3ihE_%c>kY_J=zzL*VO&I#%yG{G*D*r0uXU$MdNT)tw1cts-0 z27B_FlG$J{UU@1u*qiNFY_Jcl-J#eQ1otCP@VjwUwPm)T%H=6`r;f@t6XM##|! z1sJ6$1gDSUk5;MQ`WdX^A^5BZuM*R*CE+hQk^V5wHe8`(@-OtE^#gsDS1zNsf-z!G zxvx($c^CS~)KMyxTnl|<>KK(uj)gulHC?5WTcM9k%}}Z2ROmyC7W%ARw|-4IzAm#M zm40wPn_H5@Kfj;LKgJ*8t_pLz$V<>imr$~F!3O3AlVQgYNy>LN;N0t`O@2hVu7xbwf_V;fh~ zoiMi1yC9&eqgRr?GF{S*ee7mBHrnl-M#{4edlwAX%JhCR@eZJgl3uR#(Y7WX@Twmt zYHj2~66jvI*9EOdygpaO!|?VI?{z@uPy=*n^fJ@A^PlEiRnYLLXDZ}h@OJWRy|jiU z{rrW$WQEp9-e=y+@t617Jl&!5-@Fx_o1Wnm<6HQ!%~KTZ$RUp6TOanq)9+cMVAhxT z<>jU)RVN)h?ZR&!KgHM0v6obNy`kjq-Z}23!#?<^zjtBOh+*nvDplJw91GXo7WXf3 z#2;Xdh`f8jJI>`ge*`*z?Oo|_vYXism(F`)94}GSFoF{Q>HPE{=a2 zMYSSoeK5WkGWZ9Qg0Hj5MKm>kH1{;Q5<9(xQA$}2#+RRZ`oRr8f1&I6S8z{Pe@S#U z(ZWPe*3EBVr*-vDf%fl%7MLh(%s! 
1 + (0.0, 0.0, 1.1, 0.0, 0.0, 0.0), + # One value is < 0.0 + (0.0, 0.0, -0.25, 1.0, 0.25, 0.0), + # Sum is > 1.0 + (0.0, 0.1, 1.0, 0.0, 0.0, 0.0), + # Sum is < 1.0 + (0.0, 0.25, 0.25, 0.25, 0.0, 0.0), + # Only all NaN is valid + (float("nan"), 0.0, 0.0, 0.0, 0.0, 0.0), + # Only all NaN is valid + (numpy.nan, 0.0, 0.0, 0.0, 0.0, 0.0), + ], + ) + def test_genetic_ancestry__invalid( + self, + validator_with_adata, + genetic_ancestry_African, + genetic_ancestry_East_Asian, + genetic_ancestry_European, + genetic_ancestry_Indigenous_American, + genetic_ancestry_Oceanian, + genetic_ancestry_South_Asian, + ): + validator = validator_with_adata + # Second organism in adata is not homo sapiens + validator.adata.obs["genetic_ancestry_African"] = [genetic_ancestry_African, float("nan")] + validator.adata.obs["genetic_ancestry_East_Asian"] = [genetic_ancestry_East_Asian, float("nan")] + validator.adata.obs["genetic_ancestry_European"] = [genetic_ancestry_European, float("nan")] + validator.adata.obs["genetic_ancestry_Indigenous_American"] = [ + genetic_ancestry_Indigenous_American, + float("nan"), + ] + validator.adata.obs["genetic_ancestry_Oceanian"] = [genetic_ancestry_Oceanian, float("nan")] + validator.adata.obs["genetic_ancestry_South_Asian"] = [genetic_ancestry_South_Asian, float("nan")] + validator.validate_adata() + assert len(validator.errors) > 0 + + def test_genetic_ancestry_same_donor_id(self, validator_with_adata): + """ + genetic_ancestry_X fields must be the same when the donor id is the same + """ + validator = validator_with_adata + original_donor_id_column = validator.adata.obs["donor_id"].copy() + + # Second row should have identical donor id + genetic ancestry values, so this should pass validation + validator.adata.obs.iloc[1] = validator.adata.obs.iloc[0].values + validator.validate_adata() + assert validator.errors == [] + + # Update the genetic ancestry values to be different. 
This should now fail validation + validator.adata.obs["genetic_ancestry_African"] = [1.0, 0.0] + validator.adata.obs["genetic_ancestry_East_Asian"] = [0.0, 1.0] + validator.adata.obs["genetic_ancestry_European"] = [0.0, 0.0] + validator.adata.obs["genetic_ancestry_Indigenous_American"] = [0.0, 0.0] + validator.adata.obs["genetic_ancestry_Oceanian"] = [0.0, 0.0] + validator.adata.obs["genetic_ancestry_South_Asian"] = [0.0, 0.0] + validator.validate_adata() + assert len(validator.errors) > 0 + + # Change the donor id back to two different donor id's. Now, this should pass validation + validator.adata.obs["donor_id"] = original_donor_id_column + validator.validate_adata() + assert validator.errors == [] + class TestVar: """ From 4befffc49c2b10aa75dcf394b1c29ea040d67c9b Mon Sep 17 00:00:00 2001 From: Brian Raymor Date: Mon, 2 Dec 2024 15:43:12 -0800 Subject: [PATCH 14/28] updated genetic ancestry values (#1141) --- schema/drafts/5.3.0.md | 32 +++++++++++++------------------- 1 file changed, 13 insertions(+), 19 deletions(-) diff --git a/schema/drafts/5.3.0.md b/schema/drafts/5.3.0.md index a65f74a8..ba1cd0ab 100644 --- a/schema/drafts/5.3.0.md +++ b/schema/drafts/5.3.0.md @@ -583,10 +583,9 @@ If organism_ontolology_term_id is "NCBITaxon:9606" for Value - str or float. All observations with the same donor_id MUST contain the same value.

    + float. All observations with the same donor_id MUST contain the same value.

    If organism_ontolology_term_id is NOT - "NCBITaxon:9606" for Homo sapiens, then the - value MUST be "na".

    If + "NCBITaxon:9606" for Homo sapiens, then the value MUST be a float("nan").

    If organism_ontolology_term_id is "NCBITaxon:9606" for Homo sapiens, then the value MUST be a float("nan") if unavailable; otherwise, the value MUST be the genetic ancestry percentage of "HANCESTRO:0010" for African expressed as a float greater than or equal to 0.0 and less than or equal to 1.0 @@ -610,10 +609,9 @@ If organism_ontolology_term_id is "NCBITaxon:9606" for Value - str or float. All observations with the same donor_id MUST contain the same value.

    - If organism_ontolology_term_id is NOT - "NCBITaxon:9606" for Homo sapiens, then the - value MUST be "na".

    If + float. All observations with the same donor_id MUST contain the same value.

    + If organism_ontolology_term_id is NOT + "NCBITaxon:9606" for Homo sapiens, then the value MUST be a float("nan").

    If organism_ontolology_term_id is "NCBITaxon:9606" for Homo sapiens, then the value MUST be a float("nan") if unavailable; otherwise, the value MUST be the genetic ancestry percentage of "HANCESTRO:0009" for East Asian expressed as a float greater than or equal to 0.0 and less than or equal to 1.0 @@ -637,10 +635,9 @@ If organism_ontolology_term_id is "NCBITaxon:9606" for Value - str or float. All observations with the same donor_id MUST contain the same value.

    + float. All observations with the same donor_id MUST contain the same value.

    If organism_ontolology_term_id is NOT - "NCBITaxon:9606" for Homo sapiens, then the - value MUST be "na".

    If + "NCBITaxon:9606" for Homo sapiens, then the value MUST be a float("nan").

    If organism_ontolology_term_id is "NCBITaxon:9606" for Homo sapiens, then the value MUST be a float("nan") if unavailable; otherwise, the value MUST be the genetic ancestry percentage of "HANCESTRO:0005" for European expressed as a float greater than or equal to 0.0 and less than or equal to 1.0 @@ -664,10 +661,9 @@ If organism_ontolology_term_id is "NCBITaxon:9606" for Value - str or float. All observations with the same donor_id MUST contain the same value.

    + float. All observations with the same donor_id MUST contain the same value.

    If organism_ontolology_term_id is NOT - "NCBITaxon:9606" for Homo sapiens, then the - value MUST be "na".

    If + "NCBITaxon:9606" for Homo sapiens, then the value MUST be a float("nan").

    If organism_ontolology_term_id is "NCBITaxon:9606" for Homo sapiens, then the value MUST be a float("nan") if unavailable; otherwise, the value MUST be the genetic ancestry percentage of "HANCESTRO:0013" for Indigenous American expressed as a float greater than or equal to 0.0 and less than or equal to 1.0 @@ -691,10 +687,9 @@ If organism_ontolology_term_id is "NCBITaxon:9606" for Value - str or float. All observations with the same donor_id MUST contain the same value.

    + float. All observations with the same donor_id MUST contain the same value.

    If organism_ontolology_term_id is NOT - "NCBITaxon:9606" for Homo sapiens, then the - value MUST be "na".

    If + "NCBITaxon:9606" for Homo sapiens, then the value MUST be a float("nan").

    If organism_ontolology_term_id is "NCBITaxon:9606" for Homo sapiens, then the value MUST be a float("nan") if unavailable; otherwise, the value MUST be the genetic ancestry percentage of "HANCESTRO:0017" for Oceanian expressed as a float greater than or equal to 0.0 and less than or equal to 1.0 @@ -718,10 +713,9 @@ If organism_ontolology_term_id is "NCBITaxon:9606" for Value - str or float. All observations with the same donor_id MUST contain the same value.

    + float. All observations with the same donor_id MUST contain the same value.

    If organism_ontolology_term_id is NOT - "NCBITaxon:9606" for Homo sapiens, then the - value MUST be "na".

    If + "NCBITaxon:9606" for Homo sapiens, then the value MUST be a float("nan").

    If organism_ontolology_term_id is "NCBITaxon:9606" for Homo sapiens, then the value MUST be a float("nan") if unavailable; otherwise, the value MUST be the genetic ancestry percentage of "HANCESTRO:0006" for South Asian expressed as a float greater than or equal to 0.0 and less than or equal to 1.0 From 0a166c41342d876263b0f3a17a8526448167bdda Mon Sep 17 00:00:00 2001 From: Evan Molinelli Date: Tue, 3 Dec 2024 10:05:19 -0500 Subject: [PATCH 15/28] feat: cellxgene-schema must update validation for X (Matrix Layers) for descendants of Visium (#1133) Co-authored-by: Evan Molinelli --- .../cellxgene_schema/validate.py | 64 +++++++++-- .../tests/test_schema_compliance.py | 107 +++++++++++++++--- cellxgene_schema_cli/tests/test_validate.py | 6 +- 3 files changed, 144 insertions(+), 33 deletions(-) diff --git a/cellxgene_schema_cli/cellxgene_schema/validate.py b/cellxgene_schema_cli/cellxgene_schema/validate.py index 01a2d140..7ce2d44a 100644 --- a/cellxgene_schema_cli/cellxgene_schema/validate.py +++ b/cellxgene_schema_cli/cellxgene_schema/validate.py @@ -24,19 +24,24 @@ ONTOLOGY_PARSER = OntologyParser(schema_version="v5.3.0") ASSAY_VISIUM = "EFO:0010961" +ASSAY_VISIUM_11M = "EFO:0022860" ASSAY_SLIDE_SEQV2 = "EFO:0030062" VISIUM_AND_IS_SINGLE_TRUE_MATRIX_SIZE = 4992 +VISIUM_11MM_AND_IS_SINGLE_TRUE_MATRIX_SIZE = 14336 SPATIAL_HIRES_IMAGE_MAX_DIMENSION_SIZE = 2000 SPATIAL_HIRES_IMAGE_MAX_DIMENSION_SIZE_VISIUM_11MM = 4000 CONDITION_IS_VISIUM = "a descendant of 'EFO:0010961' (Visium Spatial Gene Expression)" +CONDITION_IS_VISIUM_11M = f"'{ASSAY_VISIUM_11M} (Visium CytAssist Spatial Gene Expression, 11mm)" CONDITION_IS_SEQV2 = f"'{ASSAY_SLIDE_SEQV2}' (Slide-seqV2)" - ERROR_SUFFIX_SPATIAL = f"obs['assay_ontology_term_id'] is either {CONDITION_IS_VISIUM} or {CONDITION_IS_SEQV2}" ERROR_SUFFIX_VISIUM = f"obs['assay_ontology_term_id'] is {CONDITION_IS_VISIUM}" -ERROR_SUFFIX_VISIUM_AND_IS_SINGLE_TRUE = f"{ERROR_SUFFIX_VISIUM} and uns['spatial']['is_single'] is True" 
+ERROR_SUFFIX_VISIUM_11M = f"obs['assay_ontology_term_id'] is {CONDITION_IS_VISIUM_11M}" + +ERROR_SUFFIX_IS_SINGLE = "uns['spatial']['is_single'] is True" +ERROR_SUFFIX_VISIUM_AND_IS_SINGLE_TRUE = f"{ERROR_SUFFIX_VISIUM} and {ERROR_SUFFIX_IS_SINGLE}" ERROR_SUFFIX_VISIUM_AND_IS_SINGLE_TRUE_FORBIDDEN = f"is only allowed for {ERROR_SUFFIX_VISIUM_AND_IS_SINGLE_TRUE}" ERROR_SUFFIX_VISIUM_AND_IS_SINGLE_TRUE_REQUIRED = f"is required for {ERROR_SUFFIX_VISIUM_AND_IS_SINGLE_TRUE}" ERROR_SUFFIX_VISIUM_AND_IS_SINGLE_TRUE_IN_TISSUE_0 = f"{ERROR_SUFFIX_VISIUM_AND_IS_SINGLE_TRUE} and in_tissue is 0" @@ -49,7 +54,9 @@ def __init__(self, ignore_labels=False): self.schema_def = dict() self.schema_version: str = None self.ignore_labels = ignore_labels - self.visium_and_is_single_true_matrix_size = VISIUM_AND_IS_SINGLE_TRUE_MATRIX_SIZE + self._visium_and_is_single_true_matrix_size = None + self._hires_max_dimension_size = None + self._visium_error_suffix = None # Values will be instances of gencode.GeneChecker, # keys will be one of gencode.SupportedOrganisms @@ -77,6 +84,44 @@ def adata(self, adata: anndata.AnnData): self.reset() self._adata = adata + @property + def visium_and_is_single_true_matrix_size(self) -> Optional[int]: + """ + Returns the required matrix size based on assay type, if applicable, else returns None. 
+ """ + if self._visium_and_is_single_true_matrix_size is None: + # Visium 11M's raw matrix size is distinct from other visium assays + if bool( + self.adata.obs["assay_ontology_term_id"] + .apply(lambda t: is_ontological_descendant_of(ONTOLOGY_PARSER, t, ASSAY_VISIUM_11M, True)) + .any() + ): + self._visium_error_suffix = f"{ERROR_SUFFIX_VISIUM_11M} and {ERROR_SUFFIX_IS_SINGLE}" + self._visium_and_is_single_true_matrix_size = VISIUM_11MM_AND_IS_SINGLE_TRUE_MATRIX_SIZE + elif self._is_visium_including_descendants(): + self._visium_error_suffix = f"{ERROR_SUFFIX_VISIUM} and {ERROR_SUFFIX_IS_SINGLE}" + self._visium_and_is_single_true_matrix_size = VISIUM_AND_IS_SINGLE_TRUE_MATRIX_SIZE + return self._visium_and_is_single_true_matrix_size + + @property + def hires_max_dimension_size(self) -> Optional[int]: + """ + Returns the restricted hires image dimension based on assay type, if applicable, else returns None. + """ + if self._hires_max_dimension_size is None: + # Visium 11M's max dimension size is distinct from other visium assays + if bool( + self.adata.obs["assay_ontology_term_id"] + .apply(lambda t: is_ontological_descendant_of(ONTOLOGY_PARSER, t, ASSAY_VISIUM_11M, True)) + .any() + ): + self._visium_error_suffix = ERROR_SUFFIX_VISIUM_11M + self._hires_max_dimension_size = SPATIAL_HIRES_IMAGE_MAX_DIMENSION_SIZE_VISIUM_11MM + elif self._is_visium_including_descendants(): + self._visium_error_suffix = ERROR_SUFFIX_VISIUM + self._hires_max_dimension_size = SPATIAL_HIRES_IMAGE_MAX_DIMENSION_SIZE + return self._hires_max_dimension_size + def _is_single(self) -> bool | None: """ Determine value of uns.spatial.is_single. None if non-spatial. 
@@ -1228,7 +1273,7 @@ def _has_valid_raw(self, force: bool = False) -> bool: if is_visium_and_is_single_true and x.shape[0] != self.visium_and_is_single_true_matrix_size: self._raw_layer_exists = False self.errors.append( - f"When {ERROR_SUFFIX_VISIUM_AND_IS_SINGLE_TRUE}, the raw matrix must be the " + f"When {self._visium_error_suffix}, the raw matrix must be the " f"unfiltered feature-barcode matrix 'raw_feature_bc_matrix'. It must have exactly " f"{self.visium_and_is_single_true_matrix_size} rows. Raw matrix row count is " f"{x.shape[0]}." @@ -1800,10 +1845,7 @@ def _check_spatial_uns(self): self.errors.append("uns['spatial'][library_id]['images'] must contain the key 'hires'.") # hires is specified: proceed with validation of hires. else: - _assay_term = self.adata.obs["assay_ontology_term_id"].values[0] - _max_size = SPATIAL_HIRES_IMAGE_MAX_DIMENSION_SIZE - if is_ontological_descendant_of(ONTOLOGY_PARSER, _assay_term, "EFO:0022860", True): - _max_size = SPATIAL_HIRES_IMAGE_MAX_DIMENSION_SIZE_VISIUM_11MM + _max_size = self.hires_max_dimension_size self._validate_spatial_image_shape("hires", uns_images["hires"], _max_size) # fullres is optional. 
@@ -1906,20 +1948,18 @@ def _is_visium_including_descendants(self) -> bool: :rtype bool """ _assay_key = "assay_ontology_term_id" - includes_and_visium = False # only compute if not already stored if self.is_visium is None and _assay_key in self.adata.obs.columns: # check if any assay_ontology_term_ids are descendants of VISIUM - includes_and_visium = ( + self.is_visium = bool( self.adata.obs[_assay_key] .astype("string") .apply(lambda assay: is_ontological_descendant_of(ONTOLOGY_PARSER, assay, ASSAY_VISIUM, True)) .any() ) - self.is_visium = includes_and_visium - return includes_and_visium + return self.is_visium def _validate_spatial_image_shape(self, image_name: str, image: np.ndarray, max_dimension: int = None): """ diff --git a/cellxgene_schema_cli/tests/test_schema_compliance.py b/cellxgene_schema_cli/tests/test_schema_compliance.py index 7ee65a6d..88fe4e2c 100644 --- a/cellxgene_schema_cli/tests/test_schema_compliance.py +++ b/cellxgene_schema_cli/tests/test_schema_compliance.py @@ -15,11 +15,19 @@ from cellxgene_schema.schema import get_schema_definition from cellxgene_schema.utils import getattr_anndata from cellxgene_schema.validate import ( + ASSAY_VISIUM_11M, + ERROR_SUFFIX_IS_SINGLE, + ERROR_SUFFIX_VISIUM, + ERROR_SUFFIX_VISIUM_11M, ERROR_SUFFIX_VISIUM_AND_IS_SINGLE_TRUE, + SPATIAL_HIRES_IMAGE_MAX_DIMENSION_SIZE, + SPATIAL_HIRES_IMAGE_MAX_DIMENSION_SIZE_VISIUM_11MM, + VISIUM_11MM_AND_IS_SINGLE_TRUE_MATRIX_SIZE, VISIUM_AND_IS_SINGLE_TRUE_MATRIX_SIZE, Validator, ) from cellxgene_schema.write_labels import AnnDataLabelAppender +from fixtures.examples_validate import visium_library_id schema_def = get_schema_definition() @@ -77,7 +85,8 @@ def validator_with_spatial_and_is_single_false(validator) -> Validator: @pytest.fixture def validator_with_visium_assay(validator) -> Validator: validator.adata = examples.adata_visium.copy() - validator.visium_and_is_single_true_matrix_size = 2 + validator._visium_and_is_single_true_matrix_size = 2 + 
validator._hires_max_dimension_size = None return validator @@ -253,7 +262,7 @@ def test_raw_values__contains_zero_row_in_tissue_1_mixed_in_tissue_values(self, Raw Matrix contains a row with all zeros and in_tissue is 1, and there are also values with in_tissue 0. """ - validator = validator_with_visium_assay + validator: Validator = validator_with_visium_assay validator.adata.X[1] = numpy.zeros(validator.adata.var.shape[0], dtype=numpy.float32) validator.adata.raw.X[1] = numpy.zeros(validator.adata.var.shape[0], dtype=numpy.float32) validator.validate_adata() @@ -298,39 +307,101 @@ def test_raw_values__contains_some_zero_rows_in_tissue_0(self, validator_with_vi validator.validate_adata() assert validator.errors == [] - def test_raw_values__invalid_visium_and_is_single_true_row_length(self, validator_with_visium_assay): + @pytest.mark.parametrize( + "assay_ontology_term_id, req_matrix_size, image_size", + [ + ("EFO:0022858", VISIUM_AND_IS_SINGLE_TRUE_MATRIX_SIZE, SPATIAL_HIRES_IMAGE_MAX_DIMENSION_SIZE), + ( + "EFO:0022860", + VISIUM_11MM_AND_IS_SINGLE_TRUE_MATRIX_SIZE, + SPATIAL_HIRES_IMAGE_MAX_DIMENSION_SIZE_VISIUM_11MM, + ), + ], + ) + def test_raw_values__invalid_visium_and_is_single_true_row_length( + self, validator_with_visium_assay, assay_ontology_term_id, req_matrix_size, image_size + ): """ Dataset is visium and uns['is_single'] is True, but raw.X is the wrong length. """ - validator = validator_with_visium_assay - validator.visium_and_is_single_true_matrix_size = VISIUM_AND_IS_SINGLE_TRUE_MATRIX_SIZE + validator: Validator = validator_with_visium_assay + validator.adata.obs["assay_ontology_term_id"] = assay_ontology_term_id + + # hires image size must be present in order to validate the raw. 
+ validator._visium_and_is_single_true_matrix_size = None + validator._hires_max_dimension_size = image_size + validator.adata.uns["spatial"][visium_library_id]["images"]["hires"] = numpy.zeros( + (1, image_size, 3), dtype=numpy.uint8 + ) validator.validate_adata() - assert validator.errors == [ - f"ERROR: When {ERROR_SUFFIX_VISIUM_AND_IS_SINGLE_TRUE}, the raw matrix must be the " - "unfiltered feature-barcode matrix 'raw_feature_bc_matrix'. It must have exactly " - f"{validator.visium_and_is_single_true_matrix_size} rows. Raw matrix row count is 2.", - "ERROR: Raw data may be missing: data in 'raw.X' does not meet schema requirements.", - ] + if assay_ontology_term_id == ASSAY_VISIUM_11M: + _errors = [ + f"ERROR: When {ERROR_SUFFIX_VISIUM_11M} and {ERROR_SUFFIX_IS_SINGLE}, the raw matrix must be the " + "unfiltered feature-barcode matrix 'raw_feature_bc_matrix'. It must have exactly " + f"{validator.visium_and_is_single_true_matrix_size} rows. Raw matrix row count is 2.", + "ERROR: Raw data may be missing: data in 'raw.X' does not meet schema requirements.", + ] + else: + _errors = [ + f"ERROR: When {ERROR_SUFFIX_VISIUM} and {ERROR_SUFFIX_IS_SINGLE}, the raw matrix must be the " + "unfiltered feature-barcode matrix 'raw_feature_bc_matrix'. It must have exactly " + f"{validator.visium_and_is_single_true_matrix_size} rows. 
Raw matrix row count is 2.", + "ERROR: Raw data may be missing: data in 'raw.X' does not meet schema requirements.", + ] + + assert validator.errors == _errors - def test_raw_values__multiple_invalid_in_tissue_errors(self, validator_with_visium_assay): + @pytest.mark.parametrize( + "assay_ontology_term_id, req_matrix_size, image_size", + [ + ("EFO:0022858", VISIUM_AND_IS_SINGLE_TRUE_MATRIX_SIZE, SPATIAL_HIRES_IMAGE_MAX_DIMENSION_SIZE), + ( + "EFO:0022860", + VISIUM_11MM_AND_IS_SINGLE_TRUE_MATRIX_SIZE, + SPATIAL_HIRES_IMAGE_MAX_DIMENSION_SIZE_VISIUM_11MM, + ), + ], + ) + def test_raw_values__multiple_invalid_in_tissue_errors( + self, validator_with_visium_assay, assay_ontology_term_id, req_matrix_size, image_size + ): """ Dataset is visium and uns['is_single'] is True, in_tissue has both 0 and 1 values and there are issues validating rows of both in the matrix. """ validator = validator_with_visium_assay - validator.visium_and_is_single_true_matrix_size = VISIUM_AND_IS_SINGLE_TRUE_MATRIX_SIZE + + validator.adata.obs["assay_ontology_term_id"] = assay_ontology_term_id + # hires image size must be present in order to validate the raw. + validator._visium_and_is_single_true_matrix_size = None + validator._hires_max_dimension_size = image_size + validator.adata.uns["spatial"][visium_library_id]["images"]["hires"] = numpy.zeros( + (1, image_size, 3), dtype=numpy.uint8 + ) validator.adata.X = numpy.zeros( [validator.adata.obs.shape[0], validator.adata.var.shape[0]], dtype=numpy.float32 ) validator.adata.raw = validator.adata.copy() validator.adata.raw.var.drop("feature_is_filtered", axis=1, inplace=True) validator.validate_adata() - assert validator.errors == [ - f"ERROR: When {ERROR_SUFFIX_VISIUM_AND_IS_SINGLE_TRUE}, the raw matrix must be the " - "unfiltered feature-barcode matrix 'raw_feature_bc_matrix'. It must have exactly " - f"{validator.visium_and_is_single_true_matrix_size} rows. 
Raw matrix row count is 2.", + if assay_ontology_term_id == ASSAY_VISIUM_11M: + assert ( + validator.errors[0] + == f"ERROR: When {ERROR_SUFFIX_VISIUM_11M} and {ERROR_SUFFIX_IS_SINGLE}, the raw matrix must be the " + "unfiltered feature-barcode matrix 'raw_feature_bc_matrix'. It must have exactly " + f"{validator.visium_and_is_single_true_matrix_size} rows. Raw matrix row count is 2." + ) + else: + assert ( + validator.errors[0] + == f"ERROR: When {ERROR_SUFFIX_VISIUM} and {ERROR_SUFFIX_IS_SINGLE}, the raw matrix must be the " + "unfiltered feature-barcode matrix 'raw_feature_bc_matrix'. It must have exactly " + f"{validator.visium_and_is_single_true_matrix_size} rows. Raw matrix row count is 2." + ) + + assert validator.errors[1:] == [ "ERROR: If obs['in_tissue'] contains at least one value 0, then there must be at least " "one row with obs['in_tissue'] == 0 that has a non-zero value in the raw matrix.", "ERROR: Each observation with obs['in_tissue'] == 1 must have at least one " @@ -496,7 +567,7 @@ def test_column_presence_in_tissue(self, validator_with_visium_assay, assay_onto assert validator.errors == [] else: assert validator.errors == [ - "obs['in_tissue'] is only allowed for obs['assay_ontology_term_id'] is a descendant of 'EFO:0010961' (Visium Spatial Gene Expression) and uns['spatial']['is_single'] is True." + f"obs['in_tissue'] is only allowed for {ERROR_SUFFIX_VISIUM_AND_IS_SINGLE_TRUE}." 
] @pytest.mark.parametrize("reserved_column", schema_def["components"]["obs"]["reserved_columns"]) diff --git a/cellxgene_schema_cli/tests/test_validate.py b/cellxgene_schema_cli/tests/test_validate.py index 9ab30177..9ea024b1 100644 --- a/cellxgene_schema_cli/tests/test_validate.py +++ b/cellxgene_schema_cli/tests/test_validate.py @@ -364,7 +364,7 @@ def test__validate_spatial_visium_ok(self): validator: Validator = Validator() validator._set_schema_def() validator.adata = adata_visium.copy() - validator.visium_and_is_single_true_matrix_size = 2 + validator._visium_and_is_single_true_matrix_size = 2 # Confirm spatial is valid. validator.validate_adata() assert not validator.errors @@ -384,7 +384,7 @@ def test__validate_spatial_visium_dense_matrix_ok(self): validator: Validator = Validator() validator._set_schema_def() validator.adata = adata_visium.copy() - validator.visium_and_is_single_true_matrix_size = 2 + validator._visium_and_is_single_true_matrix_size = 2 validator.adata.X = validator.adata.X.toarray() validator.adata.raw = validator.adata.copy() validator.adata.raw.var.drop("feature_is_filtered", axis=1, inplace=True) @@ -1141,7 +1141,7 @@ def test__validate_embeddings_non_nans(self): validator: Validator = Validator() validator._set_schema_def() validator.adata = adata_visium.copy() - validator.visium_and_is_single_true_matrix_size = 2 + validator._visium_and_is_single_true_matrix_size = 2 # invalidate spatial embeddings with NaN value validator.adata.obsm["spatial"][0, 1] = np.nan From 5293502f26362d940ab9ef754b57ba1083f8c770 Mon Sep 17 00:00:00 2001 From: Evan Molinelli Date: Tue, 3 Dec 2024 13:47:01 -0500 Subject: [PATCH 16/28] fix: handle both string and category encoded "assay_ontology_term_id" (#1142) Co-authored-by: Evan Molinelli --- .../cellxgene_schema/validate.py | 18 ++++++++++-------- .../tests/test_schema_compliance.py | 17 +++++++++++++++++ 2 files changed, 27 insertions(+), 8 deletions(-) diff --git 
a/cellxgene_schema_cli/cellxgene_schema/validate.py b/cellxgene_schema_cli/cellxgene_schema/validate.py index 7ce2d44a..434983bc 100644 --- a/cellxgene_schema_cli/cellxgene_schema/validate.py +++ b/cellxgene_schema_cli/cellxgene_schema/validate.py @@ -1648,8 +1648,10 @@ def _validate_spatial_cell_type_ontology_term_id(self): return # Validate all out of tissue (in_tissue==0) spatial spots have unknown cell ontology term - is_spatial = self.adata.obs["assay_ontology_term_id"].apply( - lambda assay: is_ontological_descendant_of(ONTOLOGY_PARSER, assay, ASSAY_VISIUM, True) + is_spatial = ( + self.adata.obs["assay_ontology_term_id"] + .apply(lambda assay: is_ontological_descendant_of(ONTOLOGY_PARSER, assay, ASSAY_VISIUM, True)) + .astype(bool) ) is_not_tissue = self.adata.obs["in_tissue"] == 0 is_not_unknown = self.adata.obs["cell_type_ontology_term_id"] != "unknown" @@ -1676,9 +1678,9 @@ def _validate_spatial_tissue_position(self, tissue_position_name: str, min: int, not (self.is_visium_and_is_single_true) or ( ~( - self.adata.obs["assay_ontology_term_id"].apply( - lambda t: is_ontological_descendant_of(ONTOLOGY_PARSER, t, ASSAY_VISIUM, True) - ) + self.adata.obs["assay_ontology_term_id"] + .apply(lambda t: is_ontological_descendant_of(ONTOLOGY_PARSER, t, ASSAY_VISIUM, True)) + .astype(bool) ) & (self.adata.obs[tissue_position_name].notnull()) ).any() @@ -1697,9 +1699,9 @@ def _validate_spatial_tissue_position(self, tissue_position_name: str, min: int, tissue_position_name not in self.adata.obs or ( ( - self.adata.obs["assay_ontology_term_id"].apply( - lambda t: is_ontological_descendant_of(ONTOLOGY_PARSER, t, ASSAY_VISIUM, True) - ) + self.adata.obs["assay_ontology_term_id"] + .apply(lambda t: is_ontological_descendant_of(ONTOLOGY_PARSER, t, ASSAY_VISIUM, True)) + .astype(bool) ) & (self.adata.obs[tissue_position_name].isnull()) ).any() diff --git a/cellxgene_schema_cli/tests/test_schema_compliance.py b/cellxgene_schema_cli/tests/test_schema_compliance.py index 
88fe4e2c..616da096 100644 --- a/cellxgene_schema_cli/tests/test_schema_compliance.py +++ b/cellxgene_schema_cli/tests/test_schema_compliance.py @@ -562,6 +562,7 @@ def test_column_presence_in_tissue(self, validator_with_visium_assay, assay_onto # reset and test validator.reset() validator.adata.obs["assay_ontology_term_id"] = assay_ontology_term_id + validator.adata.obs["assay_ontology_term_id"] = validator.adata.obs["assay_ontology_term_id"].astype("category") validator._validate_spatial_tissue_position("in_tissue", 0, 1) if is_descendant: assert validator.errors == [] @@ -632,6 +633,22 @@ def test_assay_ontology_term_id(self, validator_with_adata, assay_ontology_term_ ] assert validator.errors == [self.get_format_error_message(error_message_suffix, error)] + def test_assay_ontology_term_id__as_categorical(self, validator_with_visium_assay): + """ + Formally, assay_ontology_term_id is expected to be a categorical variable of type string. However, it should work for categorical dtypes as well. 
+ """ + validator: Validator = validator_with_visium_assay + + # check encoding as string + validator._check_spatial_obs() + assert validator.errors == [] + validator.reset() + + # force encoding as 'categorical' + validator.adata.obs["assay_ontology_term_id"] = validator.adata.obs["assay_ontology_term_id"].astype("category") + validator._check_spatial_obs() + assert validator.errors == [] + def test_cell_type_ontology_term_id_invalid_term(self, validator_with_adata): validator = validator_with_adata validator.adata.obs.loc[validator.adata.obs.index[0], "cell_type_ontology_term_id"] = "EFO:0000001" From bc905f4cfa7eb70955e411eaf8126996f30e41dc Mon Sep 17 00:00:00 2001 From: Trent Smith <1429913+Bento007@users.noreply.github.com> Date: Tue, 3 Dec 2024 11:19:15 -0800 Subject: [PATCH 17/28] fix: clean up the cli (#1108) --- cellxgene_schema_cli/cellxgene_schema/cli.py | 37 +++++++++---------- .../cellxgene_schema/validate.py | 4 -- 2 files changed, 18 insertions(+), 23 deletions(-) diff --git a/cellxgene_schema_cli/cellxgene_schema/cli.py b/cellxgene_schema_cli/cellxgene_schema/cli.py index 33fce82b..1254a7ba 100644 --- a/cellxgene_schema_cli/cellxgene_schema/cli.py +++ b/cellxgene_schema_cli/cellxgene_schema/cli.py @@ -1,7 +1,10 @@ +import logging import sys import click +logger = logging.getLogger("cellxgene_schema") + @click.group( name="schema", @@ -9,11 +12,13 @@ short_help="Apply and validate the cellxgene data integration schema to an h5ad file.", context_settings=dict(max_content_width=85, help_option_names=["-h", "--help"]), ) -def schema_cli(): - pass +@click.option("-v", "--verbose", help="When present will set logging level to debug", is_flag=True) +def schema_cli(verbose): + logging.basicConfig(level=logging.ERROR) + logger.setLevel(logging.DEBUG if verbose else logging.INFO) -@click.command( +@schema_cli.command( name="validate", short_help="Check that an h5ad follows the cellxgene data integration schema.", help="Check that an h5ad follows the cellxgene 
data integration schema. If validation fails this command will " @@ -31,27 +36,25 @@ def schema_cli(): type=click.Path(exists=False, dir_okay=False, writable=True), ) @click.option("-i", "--ignore-labels", help="Ignore ontology labels when validating", is_flag=True) -@click.option("-v", "--verbose", help="When present will set logging level to debug", is_flag=True) -def schema_validate(h5ad_file, add_labels_file, ignore_labels, verbose): +def schema_validate(h5ad_file, add_labels_file, ignore_labels): # Imports are very slow so we defer loading until Click arg validation has passed - - print("Loading dependencies") + logger.info("Loading dependencies") try: import anndata # noqa: F401 except ImportError: raise click.ClickException("[cellxgene] cellxgene-schema requires anndata") from None - print("Loading validator modules") + logger.info("Loading validator modules") from .validate import validate - is_valid, _, _ = validate(h5ad_file, add_labels_file, ignore_labels=ignore_labels, verbose=verbose) + is_valid, _, _ = validate(h5ad_file, add_labels_file, ignore_labels=ignore_labels) if is_valid: sys.exit(0) else: sys.exit(1) -@click.command( +@schema_cli.command( name="remove-labels", short_help="Create a copy of an h5ad without portal-added labels", help="Create a copy of an h5ad without portal-added labels.", @@ -61,24 +64,24 @@ def schema_validate(h5ad_file, add_labels_file, ignore_labels, verbose): def remove_labels(input_file, output_file): from .remove_labels import AnnDataLabelRemover - print("Loading dependencies") + logger.info("Loading dependencies") try: import anndata # noqa: F401 except ImportError: raise click.ClickException("[cellxgene] cellxgene-schema requires anndata") from None - print(f"Loading h5ad from {input_file}") + logger.info(f"Loading h5ad from {input_file}") adata = anndata.read_h5ad(input_file) anndata_label_remover = AnnDataLabelRemover(adata) if not anndata_label_remover.schema_def: return - print("Removing labels") + 
logger.info("Removing labels") anndata_label_remover.remove_labels() - print(f"Labels have been removed. Writing to {output_file}") + logger.info(f"Labels have been removed. Writing to {output_file}") anndata_label_remover.adata.write(output_file, compression="gzip") -@click.command( +@schema_cli.command( name="migrate", short_help="Convert an h5ad to the latest schema version.", help="Convert an h5ad from the previous to latest minor schema version. No validation will be " @@ -94,9 +97,5 @@ def migrate(input_file, output_file, collection_id, dataset_id): migrate(input_file, output_file, collection_id, dataset_id) -schema_cli.add_command(schema_validate) -schema_cli.add_command(migrate) -schema_cli.add_command(remove_labels) - if __name__ == "__main__": schema_cli() diff --git a/cellxgene_schema_cli/cellxgene_schema/validate.py b/cellxgene_schema_cli/cellxgene_schema/validate.py index 434983bc..4f92ef05 100644 --- a/cellxgene_schema_cli/cellxgene_schema/validate.py +++ b/cellxgene_schema_cli/cellxgene_schema/validate.py @@ -2130,10 +2130,6 @@ def validate( # Perform validation start = datetime.now() - if verbose: - logging.basicConfig(level=logging.DEBUG) - else: - logging.basicConfig(level=logging.INFO, format="%(message)s") validator = Validator( ignore_labels=ignore_labels, ) From be904b30f768679601aa65e74f1a4eb5d9275fab Mon Sep 17 00:00:00 2001 From: Brian Raymor Date: Tue, 3 Dec 2024 13:48:47 -0800 Subject: [PATCH 18/28] updated sex_ontology_term_id (#1145) --- schema/drafts/5.3.0.md | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/schema/drafts/5.3.0.md b/schema/drafts/5.3.0.md index ba1cd0ab..7090d268 100644 --- a/schema/drafts/5.3.0.md +++ b/schema/drafts/5.3.0.md @@ -1011,7 +1011,12 @@ If organism_ontolology_term_id is "NCBITaxon:9606" for Value - categorical with str categories. This MUST be a descendant of PATO:0001894 for phenotypic sex or "unknown" if unavailable. + categorical with str categories. 
This MUST be "unknown" if unavailable; otherwise, this MUST be one of:

    + @@ -2061,7 +2066,7 @@ When a dataset is uploaded, CELLxGENE Discover MUST automatically add the `schem * Updated _Visium Spatial Gene Expression_ table row to _Descendants of Visium Spatial Gene Expression_ * Added matrix requirements for _Visium CytAssist Spatial Gene Expression, 11mm_. * obs (Cell metadata) - * Updated the requirements for `array_col`: + * Updated the requirements for `array_col`: * MUST be annotated if the `assay_ontology_term_id` is a descendant of _Visium Spatial Gene Expression_ * Added ranges for _Visium CytAssist Spatial Gene Expression, 6.5mm_ and _Visium CytAssist Spatial Gene Expression, 11mm_ * Updated the requirements for `array_row`: @@ -2076,6 +2081,7 @@ When a dataset is uploaded, CELLxGENE Discover MUST automatically add the `schem * Added genetic_ancestry_Oceanian * Added genetic_ancestry_South_Asian * Updated the requirements for `in_tissue` to include descendants of _Visium Spatial Gene Expression_. + * Updated the requirements for `sex_ontology_term_id` to limit values to female, hermaphrodite, male, or `"unknown"` * obsm (Embeddings) * Updated the requirements for `spatial` to include descendants of _Visium Spatial Gene Expression_ and to prohibit 'Not a Number' values. * Updated the requirements for `X_{suffix}` to include descendants of _Visium Spatial Gene Expression_. 
From 3d2fa6ddff1e2bcf2be99b04136fdfbed82fe2d8 Mon Sep 17 00:00:00 2001 From: Joyce Yan <5653616+joyceyan@users.noreply.github.com> Date: Thu, 5 Dec 2024 13:20:43 -0800 Subject: [PATCH 19/28] chore: add back return value from is_seurat_convertible (#1147) --- cellxgene_schema_cli/cellxgene_schema/validate.py | 11 ++++++----- cellxgene_schema_cli/tests/test_validate.py | 8 ++++---- 2 files changed, 10 insertions(+), 9 deletions(-) diff --git a/cellxgene_schema_cli/cellxgene_schema/validate.py b/cellxgene_schema_cli/cellxgene_schema/validate.py index 4f92ef05..60d3a6ba 100644 --- a/cellxgene_schema_cli/cellxgene_schema/validate.py +++ b/cellxgene_schema_cli/cellxgene_schema/validate.py @@ -2115,7 +2115,7 @@ def validate( add_labels_file: str = None, ignore_labels: bool = False, verbose: bool = False, -) -> (bool, list): +) -> (bool, list, bool): from .write_labels import AnnDataLabelAppender """ @@ -2124,7 +2124,8 @@ def validate( :param Union[str, bytes, os.PathLike] h5ad_path: Path to h5ad file to validate :param str add_labels_file: Path to new h5ad file with ontology/gene labels added - :return (True, []) if successful validation, (False, [list_of_errors]) otherwise + :return (True, [], False) if successful validation, (False, [list_of_errors], False) otherwise; + last bool is for seurat convertability which is deprecated / unused :rtype tuple """ @@ -2138,7 +2139,7 @@ def validate( # Stop if validation was unsuccessful if not validator.is_valid: - return False, validator.errors + return False, validator.errors, False if add_labels_file: label_start = datetime.now() @@ -2149,6 +2150,6 @@ def validate( f"{writer.was_writing_successful}" ) - return (validator.is_valid and writer.was_writing_successful, validator.errors + writer.errors) + return (validator.is_valid and writer.was_writing_successful, validator.errors + writer.errors, False) - return True, validator.errors + return True, validator.errors, False diff --git a/cellxgene_schema_cli/tests/test_validate.py 
b/cellxgene_schema_cli/tests/test_validate.py index 9ea024b1..b60f2a19 100644 --- a/cellxgene_schema_cli/tests/test_validate.py +++ b/cellxgene_schema_cli/tests/test_validate.py @@ -297,7 +297,7 @@ def test__validate_with_h5ad_valid_and_labels(self): with tempfile.TemporaryDirectory() as temp_dir: labels_path = "/".join([temp_dir, "labels.h5ad"]) - success, errors = validate(h5ad_valid, labels_path) + success, errors, _ = validate(h5ad_valid, labels_path) import anndata as ad @@ -312,7 +312,7 @@ def test__validate_with_h5ad_valid_and_labels(self): assert original_hash != expected_hash, "Writing labels did not change the dataset from the original." def test__validate_with_h5ad_valid_and_without_labels(self): - success, errors = validate(h5ad_valid) + success, errors, _ = validate(h5ad_valid) assert success assert not errors @@ -321,14 +321,14 @@ def test__validate_with_h5ad_invalid_and_with_labels(self): with tempfile.TemporaryDirectory() as temp_dir: labels_path = "/".join([temp_dir, "labels.h5ad"]) - success, errors = validate(h5ad_invalid, labels_path) + success, errors, _ = validate(h5ad_invalid, labels_path) assert not success assert errors assert not os.path.exists(labels_path) def test__validate_with_h5ad_invalid_and_without_labels(self): - success, errors = validate(h5ad_invalid) + success, errors, _ = validate(h5ad_invalid) assert not success assert errors From b517b86b5d07bcf9520196990c856137ab743a6c Mon Sep 17 00:00:00 2001 From: Evan Molinelli Date: Fri, 6 Dec 2024 09:59:57 -0500 Subject: [PATCH 20/28] feat: differential tissue position row/col max sizes for visium and visium 11 (#1143) Co-authored-by: Evan Molinelli --- .../cellxgene_schema/validate.py | 29 ++++++++- cellxgene_schema_cli/tests/test_validate.py | 65 ++++++++++++------- 2 files changed, 69 insertions(+), 25 deletions(-) diff --git a/cellxgene_schema_cli/cellxgene_schema/validate.py b/cellxgene_schema_cli/cellxgene_schema/validate.py index 60d3a6ba..5510536e 100644 --- 
a/cellxgene_schema_cli/cellxgene_schema/validate.py +++ b/cellxgene_schema_cli/cellxgene_schema/validate.py @@ -4,7 +4,7 @@ import os import re from datetime import datetime -from typing import Dict, List, Mapping, Optional, Union +from typing import Dict, List, Mapping, Optional, Tuple, Union import anndata import matplotlib.colors as mcolors @@ -29,6 +29,10 @@ VISIUM_AND_IS_SINGLE_TRUE_MATRIX_SIZE = 4992 VISIUM_11MM_AND_IS_SINGLE_TRUE_MATRIX_SIZE = 14336 +VISIUM_TISSUE_POSITION_MAX_ROW = 77 +VISIUM_TISSUE_POSITION_MAX_COL = 127 +VISIUM_11MM_TISSUE_POSITION_MAX_ROW = 127 +VISIUM_11MM_TISSUE_POSITION_MAX_COL = 223 SPATIAL_HIRES_IMAGE_MAX_DIMENSION_SIZE = 2000 SPATIAL_HIRES_IMAGE_MAX_DIMENSION_SIZE_VISIUM_11MM = 4000 @@ -57,6 +61,7 @@ def __init__(self, ignore_labels=False): self._visium_and_is_single_true_matrix_size = None self._hires_max_dimension_size = None self._visium_error_suffix = None + self._visium_tissue_position_max = None # Values will be instances of gencode.GeneChecker, # keys will be one of gencode.SupportedOrganisms @@ -122,6 +127,24 @@ def hires_max_dimension_size(self) -> Optional[int]: self._hires_max_dimension_size = SPATIAL_HIRES_IMAGE_MAX_DIMENSION_SIZE return self._hires_max_dimension_size + @property + def tissue_position_maxes(self) -> Tuple[int, int]: + if self._visium_tissue_position_max is None and self._is_visium_and_is_single_true: + # visium 11 has different requirements than other visium + if ( + self.adata.obs["assay_ontology_term_id"] + .apply(lambda t: is_ontological_descendant_of(ONTOLOGY_PARSER, t, ASSAY_VISIUM_11M, True)) + .astype(bool) + .any() + ): + self._visium_tissue_position_max = ( + VISIUM_11MM_TISSUE_POSITION_MAX_ROW, + VISIUM_11MM_TISSUE_POSITION_MAX_COL, + ) + else: + self._visium_tissue_position_max = (VISIUM_TISSUE_POSITION_MAX_ROW, VISIUM_TISSUE_POSITION_MAX_COL) + return self._visium_tissue_position_max + def _is_single(self) -> bool | None: """ Determine value of uns.spatial.is_single. None if non-spatial. 
@@ -1732,8 +1755,8 @@ def _validate_spatial_tissue_positions(self): :rtype none """ - self._validate_spatial_tissue_position("array_col", 0, 127) - self._validate_spatial_tissue_position("array_row", 0, 77) + self._validate_spatial_tissue_position("array_col", 0, self.tissue_position_maxes[1]) + self._validate_spatial_tissue_position("array_row", 0, self.tissue_position_maxes[0]) self._validate_spatial_tissue_position("in_tissue", 0, 1) def _check_spatial_uns(self): diff --git a/cellxgene_schema_cli/tests/test_validate.py b/cellxgene_schema_cli/tests/test_validate.py index b60f2a19..cd7652bf 100644 --- a/cellxgene_schema_cli/tests/test_validate.py +++ b/cellxgene_schema_cli/tests/test_validate.py @@ -1,5 +1,6 @@ import hashlib import os +import re import tempfile from typing import Union from unittest import mock @@ -1011,21 +1012,32 @@ def test__validate_tissue_position_required(self, tissue_position_name): validator.adata = adata_visium.copy() validator.adata.obs.pop(tissue_position_name) + # check visium + validator.adata.obs["assay_ontology_term_id"] = "EFO:0010961" validator._check_spatial_obs() assert validator.errors assert ( f"obs['{tissue_position_name}'] {ERROR_SUFFIX_VISIUM_AND_IS_SINGLE_TRUE_REQUIRED}." in validator.errors[0] ) + validator.reset() + + # check visium descendant + validator.adata.obs["assay_ontology_term_id"] = "EFO:0022860" + validator._check_spatial_obs() + assert validator.errors + assert ( + f"obs['{tissue_position_name}'] {ERROR_SUFFIX_VISIUM_AND_IS_SINGLE_TRUE_REQUIRED}." 
in validator.errors[0] + ) + validator.reset() - @pytest.mark.parametrize("assay_ontology_term_id", ["EFO:0010961", "EFO:0030062"]) + @pytest.mark.parametrize("assay_ontology_term_id", ["EFO:0010961", "EFO:0030062", "EFO:0022860"]) def test__validate_tissue_position_not_required(self, assay_ontology_term_id): validator: Validator = Validator() validator._set_schema_def() validator.adata = adata_slide_seqv2.copy() validator.adata.obs["assay_ontology_term_id"] = assay_ontology_term_id - validator.adata.uns["spatial"]["is_single"] = False + validator.adata.uns["spatial"]["is_single"] = False # setting to false removes the requirement validator.adata.obs["is_primary_data"] = False - validator._check_spatial_obs() assert not validator.errors @@ -1041,43 +1053,52 @@ def test__validate_tissue_position_int_error(self, tissue_position_name): assert validator.errors assert f"obs['{tissue_position_name}'] must be of int type" in validator.errors[0] - @pytest.mark.parametrize( - "tissue_position_name, min, error_message_token", - [ - ("array_col", 0, "between 0 and 127"), - ("array_row", 0, "between 0 and 77"), - ("in_tissue", 0, "0 or 1"), - ], - ) - def test__validate_tissue_position_int_min_error(self, tissue_position_name, min, error_message_token): + @pytest.mark.parametrize("assay_ontology_term_id", ["EFO:0010961", "EFO:0022860", "EFO:0022859"]) + @pytest.mark.parametrize("tissue_position_name, min", [("array_col", 0), ("array_row", 0), ("in_tissue", 0)]) + def test__validate_tissue_position_int_min_error(self, assay_ontology_term_id, tissue_position_name, min): validator: Validator = Validator() validator._set_schema_def() validator.adata = adata_visium.copy() + validator.adata.obs["assay_ontology_term_id"] = assay_ontology_term_id validator.adata.obs[tissue_position_name] = min - 1 # Confirm tissue_position is identified as invalid. 
validator._check_spatial_obs() - assert validator.errors - assert f"obs['{tissue_position_name}'] must be {error_message_token}" in validator.errors[0] + assert ( + re.match(f"^obs\['{tissue_position_name}'\] must be (between )?{min} (and|or) [0-9]+", validator.errors[0]) + is not None + ) @pytest.mark.parametrize( - "tissue_position_name, max, error_message_token", + "assay_ontology_term_id, tissue_position_name, tissue_position_max", [ - ("array_col", 127, "between 0 and 127"), - ("array_row", 77, "between 0 and 77"), - ("in_tissue", 1, "0 or 1"), + ("EFO:0010961", "array_col", 127), + ("EFO:0010961", "array_row", 77), + ("EFO:0022860", "array_col", 223), + ("EFO:0022860", "array_row", 127), + ("EFO:0022859", "array_col", 127), + ("EFO:0022859", "array_row", 77), + ("EFO:0022859", "in_tissue", 1), ], ) - def test__validate_tissue_position_int_max_error(self, tissue_position_name, max, error_message_token): + def test__validate_tissue_position_int_max_error( + self, assay_ontology_term_id, tissue_position_name, tissue_position_max + ): validator: Validator = Validator() validator._set_schema_def() validator.adata = adata_visium.copy() - validator.adata.obs[tissue_position_name] = max + 1 + validator.adata.obs["assay_ontology_term_id"] = assay_ontology_term_id + validator.adata.obs[tissue_position_name] = tissue_position_max + 1 # Confirm tissue_position is identified as invalid. 
validator._check_spatial_obs() - assert validator.errors - assert f"obs['{tissue_position_name}'] must be {error_message_token}" in validator.errors[0] + assert ( + re.match( + f"^obs\['{tissue_position_name}'\] must be (between )?[0-9]+ (and|or) {tissue_position_max}", + validator.errors[0], + ) + is not None + ) @pytest.mark.parametrize( "cell_type_ontology_term_id, in_tissue, assay_ontology_term_id", From 9f84b18fab6d5b58f008171469037ddca6cc6e9f Mon Sep 17 00:00:00 2001 From: Evan Molinelli Date: Fri, 6 Dec 2024 10:48:24 -0500 Subject: [PATCH 21/28] feat: support for visium descendants in obs['assay_ontology_term_id'] (#1148) Co-authored-by: Evan Molinelli --- .../cellxgene_schema/validate.py | 13 +++-- .../tests/test_schema_compliance.py | 54 ++++++++++++++++--- 2 files changed, 55 insertions(+), 12 deletions(-) diff --git a/cellxgene_schema_cli/cellxgene_schema/validate.py b/cellxgene_schema_cli/cellxgene_schema/validate.py index 5510536e..781b72d1 100644 --- a/cellxgene_schema_cli/cellxgene_schema/validate.py +++ b/cellxgene_schema_cli/cellxgene_schema/validate.py @@ -67,7 +67,7 @@ def __init__(self, ignore_labels=False): # keys will be one of gencode.SupportedOrganisms self.gene_checkers = dict() - def reset(self): + def reset(self, hi_res_size: Optional[int] = None, true_mat_size: Optional[int] = None): self.errors = [] self.warnings = [] self.is_valid = False @@ -76,6 +76,8 @@ def reset(self): self.is_spatial = None self.is_visium = None self.is_visium_and_is_single_true = None + self._hires_max_dimension_size = hi_res_size + self._visium_and_is_single_true_matrix_size = true_mat_size # Matrix (e.g., X, raw.X, ...) 
number non-zero cache self.number_non_zero = dict() @@ -99,6 +101,7 @@ def visium_and_is_single_true_matrix_size(self) -> Optional[int]: if bool( self.adata.obs["assay_ontology_term_id"] .apply(lambda t: is_ontological_descendant_of(ONTOLOGY_PARSER, t, ASSAY_VISIUM_11M, True)) + .astype(bool) .any() ): self._visium_error_suffix = f"{ERROR_SUFFIX_VISIUM_11M} and {ERROR_SUFFIX_IS_SINGLE}" @@ -118,6 +121,7 @@ def hires_max_dimension_size(self) -> Optional[int]: if bool( self.adata.obs["assay_ontology_term_id"] .apply(lambda t: is_ontological_descendant_of(ONTOLOGY_PARSER, t, ASSAY_VISIUM_11M, True)) + .astype(bool) .any() ): self._visium_error_suffix = ERROR_SUFFIX_VISIUM_11M @@ -172,7 +176,7 @@ def _is_supported_spatial_assay(self) -> bool: try: _spatial = ( self._is_visium_including_descendants() - or self.adata.obs.assay_ontology_term_id.isin([ASSAY_SLIDE_SEQV2]).any() + or self.adata.obs.assay_ontology_term_id.isin([ASSAY_SLIDE_SEQV2]).astype(bool).any() ) self.is_spatial = bool(_spatial) except AttributeError: @@ -1981,6 +1985,7 @@ def _is_visium_including_descendants(self) -> bool: self.adata.obs[_assay_key] .astype("string") .apply(lambda assay: is_ontological_descendant_of(ONTOLOGY_PARSER, assay, ASSAY_VISIUM, True)) + .astype(bool) .any() ) @@ -2099,8 +2104,6 @@ def validate_adata(self, h5ad_path: Union[str, bytes, os.PathLike] = None) -> bo :rtype bool """ logger.info("Starting validation...") - # Re-start errors in case a new h5ad is being validated - self.reset() if h5ad_path: logger.debug("Reading the h5ad file...") @@ -2108,6 +2111,8 @@ def validate_adata(self, h5ad_path: Union[str, bytes, os.PathLike] = None) -> bo self.h5ad_path = h5ad_path self._validate_encoding_version() logger.debug("Successfully read the h5ad file") + # Re-start errors in case a new h5ad is being validated + self.reset() # Fetches schema def for latest major schema version self._set_schema_def() diff --git a/cellxgene_schema_cli/tests/test_schema_compliance.py 
b/cellxgene_schema_cli/tests/test_schema_compliance.py index 616da096..aa69890e 100644 --- a/cellxgene_schema_cli/tests/test_schema_compliance.py +++ b/cellxgene_schema_cli/tests/test_schema_compliance.py @@ -17,6 +17,7 @@ from cellxgene_schema.validate import ( ASSAY_VISIUM_11M, ERROR_SUFFIX_IS_SINGLE, + ERROR_SUFFIX_SPATIAL, ERROR_SUFFIX_VISIUM, ERROR_SUFFIX_VISIUM_11M, ERROR_SUFFIX_VISIUM_AND_IS_SINGLE_TRUE, @@ -85,8 +86,8 @@ def validator_with_spatial_and_is_single_false(validator) -> Validator: @pytest.fixture def validator_with_visium_assay(validator) -> Validator: validator.adata = examples.adata_visium.copy() - validator._visium_and_is_single_true_matrix_size = 2 - validator._hires_max_dimension_size = None + validator.reset(None, None) + return validator @@ -207,6 +208,7 @@ def test_raw_values__invalid_spatial(self, validator_with_visium_assay, invalid_ validator = validator_with_visium_assay validator.adata.raw.X[0, 1] = invalid_value + validator.reset(None, 2) validator.validate_adata() assert validator.errors == [ "ERROR: All non-zero values in raw matrix must be positive integers of type numpy.float32.", @@ -247,7 +249,8 @@ def test_raw_values__contains_zero_row_in_tissue_1(self, validator_with_visium_a Raw Matrix contains a row with all zeros and in_tissue is 1, but no values are in_tissue 0. 
""" - validator = validator_with_visium_assay + validator: Validator = validator_with_visium_assay + validator.reset(None, 2) validator.adata.obs["in_tissue"] = 1 validator.adata.X[0] = numpy.zeros(validator.adata.var.shape[0], dtype=numpy.float32) validator.adata.raw.X[0] = numpy.zeros(validator.adata.var.shape[0], dtype=numpy.float32) @@ -265,6 +268,7 @@ def test_raw_values__contains_zero_row_in_tissue_1_mixed_in_tissue_values(self, validator: Validator = validator_with_visium_assay validator.adata.X[1] = numpy.zeros(validator.adata.var.shape[0], dtype=numpy.float32) validator.adata.raw.X[1] = numpy.zeros(validator.adata.var.shape[0], dtype=numpy.float32) + validator.reset(None, 2) validator.validate_adata() assert validator.errors == [ "ERROR: Each observation with obs['in_tissue'] == 1 must have at least one " @@ -286,6 +290,7 @@ def test_raw_values__contains_all_zero_rows_in_tissue_0(self, validator_with_vis ) validator.adata.raw = validator.adata.copy() validator.adata.raw.var.drop("feature_is_filtered", axis=1, inplace=True) + validator.reset(None, 2) validator.validate_adata() assert validator.errors == [ "ERROR: If obs['in_tissue'] contains at least one value 0, then there must be at least " @@ -304,6 +309,7 @@ def test_raw_values__contains_some_zero_rows_in_tissue_0(self, validator_with_vi validator.adata.obs["cell_type_ontology_term_id"] = "unknown" validator.adata.X[0] = numpy.zeros(validator.adata.var.shape[0], dtype=numpy.float32) validator.adata.raw.X[0] = numpy.zeros(validator.adata.var.shape[0], dtype=numpy.float32) + validator.reset(None, 2) validator.validate_adata() assert validator.errors == [] @@ -328,8 +334,6 @@ def test_raw_values__invalid_visium_and_is_single_true_row_length( validator.adata.obs["assay_ontology_term_id"] = assay_ontology_term_id # hires image size must be present in order to validate the raw. 
- validator._visium_and_is_single_true_matrix_size = None - validator._hires_max_dimension_size = image_size validator.adata.uns["spatial"][visium_library_id]["images"]["hires"] = numpy.zeros( (1, image_size, 3), dtype=numpy.uint8 ) @@ -640,15 +644,40 @@ def test_assay_ontology_term_id__as_categorical(self, validator_with_visium_assa validator: Validator = validator_with_visium_assay # check encoding as string - validator._check_spatial_obs() + validator.reset(None, 2) + validator._check_spatial() + validator._validate_raw() assert validator.errors == [] - validator.reset() # force encoding as 'categorical' + validator.reset(None, 2) validator.adata.obs["assay_ontology_term_id"] = validator.adata.obs["assay_ontology_term_id"].astype("category") - validator._check_spatial_obs() + validator._check_spatial() + validator._validate_raw() assert validator.errors == [] + @pytest.mark.parametrize( + "assay_ontology_term_id, all_same", + [("EFO:0010961", True), ("EFO:0030062", True), ("EFO:0022860", True), ("EFO:0008995", False)], + ) + def test_assay_ontology_term_id__all_same(self, validator_with_visium_assay, assay_ontology_term_id, all_same): + """ + Spatial assays (descendants of Visium Spatia Gene Expression, or Slide-SeqV2) require all values in the column to be identical. + """ + validator: Validator = validator_with_visium_assay + + # mix values (with otherwise allowed values) + validator.adata.obs["assay_ontology_term_id"] = assay_ontology_term_id + validator.adata.obs["assay_ontology_term_id"].iloc[0] = "EFO:0010183" + + # check that unique values are allowed + validator._check_spatial_obs() + EXPECTED_ERROR = f"When {ERROR_SUFFIX_SPATIAL}, all observations must contain the same value." 
+ if all_same: + assert EXPECTED_ERROR in validator.errors + else: + assert validator.errors not in validator.errors + def test_cell_type_ontology_term_id_invalid_term(self, validator_with_adata): validator = validator_with_adata validator.adata.obs.loc[validator.adata.obs.index[0], "cell_type_ontology_term_id"] = "EFO:0000001" @@ -1698,6 +1727,7 @@ def test_genetic_ancestry_same_donor_id(self, validator_with_adata): # Second row should have identical donor id + genetic ancestry values, so this should pass validation validator.adata.obs.iloc[1] = validator.adata.obs.iloc[0].values + validator.validate_adata() assert validator.errors == [] @@ -1708,11 +1738,13 @@ def test_genetic_ancestry_same_donor_id(self, validator_with_adata): validator.adata.obs["genetic_ancestry_Indigenous_American"] = [0.0, 0.0] validator.adata.obs["genetic_ancestry_Oceanian"] = [0.0, 0.0] validator.adata.obs["genetic_ancestry_South_Asian"] = [0.0, 0.0] + validator.reset(None, 2) validator.validate_adata() assert len(validator.errors) > 0 # Change the donor id back to two different donor id's. Now, this should pass validation validator.adata.obs["donor_id"] = original_donor_id_column + validator.reset(None, 2) validator.validate_adata() assert validator.errors == [] @@ -1795,6 +1827,7 @@ def test_feature_is_filtered(self, validator_with_adata): X[i, 0] = 0 X[0, 0] = 1 + validator.reset(None, 2) validator.validate_adata() assert validator.errors == [ "ERROR: Some features are 'True' in 'feature_is_filtered' of dataframe 'var', " @@ -1804,6 +1837,7 @@ def test_feature_is_filtered(self, validator_with_adata): # Test that feature_is_filtered is a bool and not a string var["feature_is_filtered"] = "string" + validator.reset(None, 2) validator.validate_adata() assert validator.errors == [ "ERROR: Column 'feature_is_filtered' in dataframe 'var' must be boolean, not 'object'." 
@@ -2383,6 +2417,7 @@ def test_obsm_values_nan(self, validator_with_visium_assay, key): # Check embedding has any NaN obsm[key][0:100, 1] = numpy.nan + validator.reset(None, 2) validator.validate_adata() if key != "spatial": @@ -2393,6 +2428,7 @@ def test_obsm_values_nan(self, validator_with_visium_assay, key): # Check embedding has all NaNs all_nan = numpy.full(obsm[key].shape, numpy.nan) obsm[key] = all_nan + validator.reset(None, 2) validator.validate_adata() if key != "spatial": assert validator.errors == [f"ERROR: adata.obsm['{key}'] contains all NaN values."] @@ -2419,6 +2455,7 @@ def test_obsm_values_no_X_embedding__visium_dataset(self, validator_with_visium_ validator = validator_with_visium_assay validator.adata.uns["default_embedding"] = "spatial" del validator.adata.obsm["X_umap"] + validator.reset(None, 2) validator.validate_adata() assert validator.errors == [] assert validator.is_spatial is True @@ -2522,6 +2559,7 @@ def test_obsm_key_name_whitespace(self, validator_with_adata): del obsm["X_ umap"] obsm["u m a p"] = obsm["X_umap"] + validator.reset(None, 2) validator.validate_adata() assert validator.errors == [ "ERROR: Embedding key in 'adata.obsm' u m a p does not match the regex pattern ^[a-zA-Z][a-zA-Z0-9_.-]*$." 
From 6de64f64ab00c995cc067b786eb418d35ecea725 Mon Sep 17 00:00:00 2001 From: Evan Molinelli Date: Fri, 6 Dec 2024 16:49:20 -0500 Subject: [PATCH 22/28] feat: X_{suffix} to include descendants of Visium (#1144) Co-authored-by: Evan Molinelli --- .../tests/test_schema_compliance.py | 28 +++++++++++++++---- 1 file changed, 22 insertions(+), 6 deletions(-) diff --git a/cellxgene_schema_cli/tests/test_schema_compliance.py b/cellxgene_schema_cli/tests/test_schema_compliance.py index aa69890e..85baab4b 100644 --- a/cellxgene_schema_cli/tests/test_schema_compliance.py +++ b/cellxgene_schema_cli/tests/test_schema_compliance.py @@ -2436,7 +2436,10 @@ def test_obsm_values_nan(self, validator_with_visium_assay, key): assert validator.errors == ["ERROR: adata.obs['spatial] contains at least one NaN value."] def test_obsm_values_no_X_embedding__non_spatial_dataset(self, validator_with_adata): - validator = validator_with_adata + """ + X_{suffix} embeddings MUST exist for non-spatial datasets + """ + validator: Validator = validator_with_adata validator.adata.obsm["harmony"] = validator.adata.obsm["X_umap"] validator.adata.uns["default_embedding"] = "harmony" del validator.adata.obsm["X_umap"] @@ -2451,14 +2454,27 @@ def test_obsm_values_no_X_embedding__non_spatial_dataset(self, validator_with_ad "WARNING: Validation of raw layer was not performed due to current errors, try again after fixing current errors.", ] - def test_obsm_values_no_X_embedding__visium_dataset(self, validator_with_visium_assay): - validator = validator_with_visium_assay + @pytest.mark.parametrize("assay_ontology_term_id", ["EFO:0010961", "EFO:0030062", "EFO:0022860"]) + def test_obsm_values_no_X_embedding__visium_dataset(self, validator_with_visium_assay, assay_ontology_term_id): + """ + X_{suffix} embeddings MAY exist for spatial datasets + """ + validator: Validator = validator_with_visium_assay validator.adata.uns["default_embedding"] = "spatial" - del validator.adata.obsm["X_umap"] - 
validator.reset(None, 2) - validator.validate_adata() + validator.adata.obs["assay_ontology_term_id"] = assay_ontology_term_id + + # may have X_{suffix} embedding + validator._validate_obsm() + assert validator.is_spatial is True assert validator.errors == [] + validator.reset() + + # may also have no X_{suffix} embedding + del validator.adata.obsm["X_umap"] + validator._validate_obsm() assert validator.is_spatial is True + assert validator.errors == [] + validator.reset() def test_obsm_values_no_X_embedding__slide_seq_v2_dataset(self, validator_with_slide_seq_v2_assay): validator = validator_with_slide_seq_v2_assay From a6c9086bc57d0d374701d5ac2a5941cf1517c2de Mon Sep 17 00:00:00 2001 From: Joyce Yan <5653616+joyceyan@users.noreply.github.com> Date: Tue, 10 Dec 2024 15:13:00 -0800 Subject: [PATCH 23/28] chore: update logging to log by donor id instead (#1150) --- cellxgene_schema_cli/cellxgene_schema/validate.py | 9 ++++++--- cellxgene_schema_cli/tests/test_schema_compliance.py | 2 +- 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/cellxgene_schema_cli/cellxgene_schema/validate.py b/cellxgene_schema_cli/cellxgene_schema/validate.py index 781b72d1..6b62f796 100644 --- a/cellxgene_schema_cli/cellxgene_schema/validate.py +++ b/cellxgene_schema_cli/cellxgene_schema/validate.py @@ -554,9 +554,10 @@ def is_valid_row(row): invalid_rows = ~self.adata.obs.apply(is_valid_row, axis=1) if invalid_rows.any(): - invalid_indices = self.adata.obs.index[invalid_rows].tolist() + donor_ids = self.adata.obs[donor_id_column].tolist() + unique_donor_ids = list(set(donor_ids)) self.errors.append( - f"obs rows with indices {invalid_indices} have invalid genetic_ancestry_* values. All " + f"obs rows with donor ids {unique_donor_ids} have invalid genetic_ancestry_* values. All " f"observations with the same donor_id must contain the same genetic_ancestry_* values. 
If " f"organism_ontolology_term_id is NOT 'NCBITaxon:9606' for Homo sapiens, then all genetic" f"ancestry values MUST be float('nan'). If organism_ontolology_term_id is 'NCBITaxon:9606' " @@ -959,7 +960,6 @@ def _validate_dataframe(self, df_name: str): f"Column '{column_name}' in dataframe '{df_name}' contains a category '{category}' with " f"zero observations. These categories will be removed when `--add-labels` flag is present." ) - self._validate_genetic_ancestry() categorical_types = {type(x) for x in column.dtype.categories.values} # Check for columns that have illegal categories, which are not supported by anndata 0.8.0 # TODO: check if this can be removed after upgading to anndata 0.10.0 @@ -2058,6 +2058,9 @@ def _deep_check(self): # Checks spatial self._check_spatial() + # Validate genetic ancestry + self._validate_genetic_ancestry() + # Checks each component for component_name, component_def in self.schema_def["components"].items(): logger.debug(f"Validating component: {component_name}") diff --git a/cellxgene_schema_cli/tests/test_schema_compliance.py b/cellxgene_schema_cli/tests/test_schema_compliance.py index 85baab4b..bf83c97b 100644 --- a/cellxgene_schema_cli/tests/test_schema_compliance.py +++ b/cellxgene_schema_cli/tests/test_schema_compliance.py @@ -1740,7 +1740,7 @@ def test_genetic_ancestry_same_donor_id(self, validator_with_adata): validator.adata.obs["genetic_ancestry_South_Asian"] = [0.0, 0.0] validator.reset(None, 2) validator.validate_adata() - assert len(validator.errors) > 0 + assert len(validator.errors) == 1 # Change the donor id back to two different donor id's. 
Now, this should pass validation validator.adata.obs["donor_id"] = original_donor_id_column From c2f497926bc8468bba6f196c0d90b0ae22789d5e Mon Sep 17 00:00:00 2001 From: Trent Smith <1429913+Bento007@users.noreply.github.com> Date: Wed, 11 Dec 2024 15:37:17 -0800 Subject: [PATCH 24/28] fix(devop): code coverage failing (#1149) --- .github/workflows/push_tests.yml | 13 +++++++++---- codecov.yaml | 27 +++++++++++++++++++++++++++ 2 files changed, 36 insertions(+), 4 deletions(-) create mode 100644 codecov.yaml diff --git a/.github/workflows/push_tests.yml b/.github/workflows/push_tests.yml index cd5577d4..86de15a3 100644 --- a/.github/workflows/push_tests.yml +++ b/.github/workflows/push_tests.yml @@ -57,8 +57,9 @@ jobs: uses: actions/upload-artifact@v4 with: name: coverage-cli - path: ./.coverage* + path: .coverage* retention-days: 3 + include-hidden-files: true unit-tests-migration-assistant: runs-on: ubuntu-latest @@ -88,8 +89,9 @@ jobs: uses: actions/upload-artifact@v4 with: name: coverage-migration-assisstant - path: ./.coverage* + path: .coverage* retention-days: 3 + include-hidden-files: true unit-test-ontology-dry-run: runs-on: ubuntu-latest @@ -119,8 +121,9 @@ jobs: uses: actions/upload-artifact@v4 with: name: coverage-ontology-dry-run - path: ./.coverage* + path: .coverage* retention-days: 3 + include-hidden-files: true unit-test-genes-dry-run: runs-on: ubuntu-latest @@ -150,8 +153,9 @@ jobs: uses: actions/upload-artifact@v4 with: name: coverage-genes-dry-run - path: ./.coverage* + path: .coverage* retention-days: 3 + include-hidden-files: true submit-codecoverage: needs: @@ -184,6 +188,7 @@ jobs: - name: Upload coverage to Codecov uses: codecov/codecov-action@v4 with: + token: ${{ secrets.CODECOV_TOKEN }} env_vars: OS,PYTHON files: ./coverage.xml flags: unittests diff --git a/codecov.yaml b/codecov.yaml new file mode 100644 index 00000000..9dbaf5d1 --- /dev/null +++ b/codecov.yaml @@ -0,0 +1,27 @@ +comment: + layout: "header, diff, components" + 
+component_management: + default_rules: + statuses: + - type: project + target: auto + branches: + - "!main" + individual_components: + - component_id: module_cellxgene_schema_cli + name: cellxgene_schema_cli + paths: + - cellxgene_schema_cli/** + - component_id: module_migration_assistant + name: migration_assistant + paths: + - scripts/migration_assistant/** + - component_id: module_schema_bump_dry_run_genes + name: schema_bump_dry_run_genes + paths: + - scripts/schema_bump_dry_run_genes/** + - component_id: module_schema_bump_dry_run_ontologies + name: schema_bump_dry_run_ontologies + paths: + - scripts/schema_bump_dry_run_ontologies/** From ee393cd4b77b7ff156b13c705f3d09f7f83c0133 Mon Sep 17 00:00:00 2001 From: Nayib Gloria <55710092+nayib-jose-gloria@users.noreply.github.com> Date: Mon, 16 Dec 2024 10:11:49 -0500 Subject: [PATCH 25/28] feat: refactor suspension type validation logic to be simpler and more performant (#1155) --- .../schema_definitions/schema_definition.yaml | 286 +++++------------- .../cellxgene_schema/validate.py | 86 ++---- .../tests/test_schema_compliance.py | 25 +- 3 files changed, 117 insertions(+), 280 deletions(-) diff --git a/cellxgene_schema_cli/cellxgene_schema/schema_definitions/schema_definition.yaml b/cellxgene_schema_cli/cellxgene_schema/schema_definitions/schema_definition.yaml index d14a0442..28a153d7 100644 --- a/cellxgene_schema_cli/cellxgene_schema/schema_definitions/schema_definition.yaml +++ b/cellxgene_schema_cli/cellxgene_schema/schema_definitions/schema_definition.yaml @@ -186,7 +186,12 @@ components: type: curie dependencies: - # If tissue_type is tissue OR organoid - rule: "tissue_type == 'tissue' | tissue_type == 'organoid'" + rule: + column: tissue_type + match_exact: + terms: + - tissue + - organoid error_message_suffix: >- When 'tissue_type' is 'tissue' or 'organoid', 'tissue_ontology_term_id' MUST be a descendant term id of 'UBERON:0001062' (anatomical entity). 
@@ -199,7 +204,11 @@ components: UBERON: - UBERON:0001062 - # If tissue_type is cell culture - rule: "tissue_type == 'cell culture'" + rule: + column: tissue_type + match_exact: + terms: + - cell culture error_message_suffix: >- When 'tissue_type' is 'cell culture', 'tissue_ontology_term_id' MUST be either a CL term (excluding 'CL:0000255' (eukaryotic cell), 'CL:0000257' (Eumycetozoan cell), @@ -222,7 +231,11 @@ components: type: curie dependencies: - # If organism is Human - rule: "organism_ontology_term_id == 'NCBITaxon:9606'" + rule: + column: organism_ontology_term_id + match_exact: + terms: + - NCBITaxon:9606 error_message_suffix: >- When 'organism_ontology_term_id' is 'NCBITaxon:9606' (Homo sapiens), self_reported_ethnicity_ontology_term_id MUST be formatted as one @@ -285,7 +298,11 @@ components: type: curie dependencies: - # If organism is Human - rule: "organism_ontology_term_id == 'NCBITaxon:9606'" + rule: + column: organism_ontology_term_id + match_exact: + terms: + - NCBITaxon:9606 error_message_suffix: >- When 'organism_ontology_term_id' is 'NCBITaxon:9606' (Homo sapiens), 'development_stage_ontology_term_id' MUST be the most accurate descendant of 'HsapDv:0000001' or unknown. @@ -300,7 +317,11 @@ components: exceptions: - unknown - # If organism is Mouse - rule: "organism_ontology_term_id == 'NCBITaxon:10090'" + rule: + column: organism_ontology_term_id + match_exact: + terms: + - NCBITaxon:10090 error_message_suffix: >- When 'organism_ontology_term_id' is 'NCBITaxon:10090' (Mus musculus), 'development_stage_ontology_term_id' MUST be the most accurate descendant of 'MmusDv:0000001' or unknown. @@ -353,227 +374,70 @@ components: selected the most appropriate value for the assay(s) between 'cell', 'nucleus', and 'na'. Please contact cellxgene@chanzuckerberg.com during submission so that the assay(s) can be added to the schema definition document. 
dependencies: - - # If assay_ontology_term_id is EFO:0030080 or its descendants, 'suspension_type' MUST be 'cell' or 'nucleus' - complex_rule: - match_ancestors: - column: assay_ontology_term_id + - # 'suspension_type' MUST be 'cell' or 'nucleus' + rule: + column: assay_ontology_term_id + match_ancestors_inclusive: ancestors: - EFO: - - EFO:0030080 - inclusive: True + - EFO:0030080 + - EFO:0010184 + match_exact: + terms: + - EFO:0010010 + - EFO:0008722 + - EFO:0010550 + - EFO:0008780 + - EFO:0700010 + - EFO:0700011 + - EFO:0009919 + - EFO:0030060 + - EFO:0022490 + - EFO:0030028 type: categorical - error_message_suffix: >- - when 'assay_ontology_term_id' is EFO:0030080 or its descendants enum: - "cell" - "nucleus" - - # If assay_ontology_term_id is EFO:0007045 or its descendants, 'suspension_type' MUST be 'nucleus' - complex_rule: - match_ancestors: - column: assay_ontology_term_id + - # 'suspension_type' MUST be 'nucleus' + rule: + column: assay_ontology_term_id + match_ancestors_inclusive: ancestors: - EFO: - - EFO:0007045 - inclusive: True - type: categorical - error_message_suffix: >- - when 'assay_ontology_term_id' is EFO:0007045 or its descendants - enum: - - "nucleus" - - # If assay_ontology_term_id is EFO:0010184 or its descendants, 'suspension_type' MUST be 'cell' or 'nucleus' - complex_rule: - match_ancestors: - column: assay_ontology_term_id - ancestors: - EFO: - - EFO:0010184 - inclusive: True + - EFO:0007045 + - EFO:0002761 + match_exact: + terms: + - EFO:0008720 + - EFO:0030026 type: categorical - error_message_suffix: >- - when 'assay_ontology_term_id' is EFO:0010184 or its descendants enum: - - "cell" - "nucleus" - - # If assay_ontology_term_id is EFO:0008994 or its descendants, 'suspension_type' MUST be 'na' - complex_rule: - match_ancestors: - column: assay_ontology_term_id + - #'suspension_type' MUST be 'cell' + rule: + column: assay_ontology_term_id + match_ancestors_inclusive: ancestors: - EFO: - - EFO:0008994 - inclusive: True - type: 
categorical - error_message_suffix: >- - when 'assay_ontology_term_id' is EFO:0008994 or its descendants - enum: - - "na" - - # If assay_ontology_term_id is EFO:0008919 or its descendants, 'suspension_type' MUST be 'cell' - complex_rule: - match_ancestors: - column: assay_ontology_term_id - ancestors: - EFO: - - EFO:0008919 - inclusive: True + - EFO:0008919 + match_exact: + terms: + - EFO:0030002 + - EFO:0008853 + - EFO:0008796 + - EFO:0700003 + - EFO:0700004 + - EFO:0008953 type: categorical - error_message_suffix: >- - when 'assay_ontology_term_id' is EFO:0008919 or its descendants enum: - "cell" - - # If assay_ontology_term_id is EFO:0002761 or its descendants, 'suspension_type' MUST be 'nucleus' - complex_rule: - match_ancestors: - column: assay_ontology_term_id + - # 'suspension_type' MUST be 'na' + rule: + column: assay_ontology_term_id + match_ancestors_inclusive: ancestors: - EFO: - - EFO:0002761 - inclusive: True - type: categorical - error_message_suffix: >- - when 'assay_ontology_term_id' is EFO:0002761 or its descendants - enum: - - "nucleus" - - # If assay_ontology_term_id is EFO:0010010, 'suspension_type' MUST be 'cell' or 'nucleus' - rule: "assay_ontology_term_id == 'EFO:0010010'" - type: categorical - error_message_suffix: >- - when 'assay_ontology_term_id' is EFO:0010010 - enum: - - "cell" - - "nucleus" - - # If assay_ontology_term_id is EFO:0008720, 'suspension_type' MUST be 'nucleus' - rule: "assay_ontology_term_id == 'EFO:0008720'" - type: categorical - error_message_suffix: >- - when 'assay_ontology_term_id' is EFO:0008720 - enum: - - "nucleus" - - # If assay_ontology_term_id is EFO:0008722, 'suspension_type' MUST be 'cell' or 'nucleus' - rule: "assay_ontology_term_id == 'EFO:0008722'" - type: categorical - error_message_suffix: >- - when 'assay_ontology_term_id' is EFO:0008722 - enum: - - "cell" - - "nucleus" - - # If assay_ontology_term_id is EFO:0030002, 'suspension_type' MUST be 'cell' - rule: "assay_ontology_term_id == 'EFO:0030002'" - 
type: categorical - error_message_suffix: >- - when 'assay_ontology_term_id' is EFO:0030002 - enum: - - "cell" - - # If assay_ontology_term_id is EFO:0008853, 'suspension_type' MUST be 'cell' - rule: "assay_ontology_term_id == 'EFO:0008853'" - type: categorical - error_message_suffix: >- - when 'assay_ontology_term_id' is EFO:0008853 - enum: - - "cell" - - # If assay_ontology_term_id is EFO:0030026, 'suspension_type' MUST be 'nucleus' - rule: "assay_ontology_term_id == 'EFO:0030026'" - type: categorical - error_message_suffix: >- - when 'assay_ontology_term_id' is EFO:0030026 - enum: - - "nucleus" - - # If assay_ontology_term_id is EFO:0010550, 'suspension_type' MUST be 'cell' or 'nucleus' - rule: "assay_ontology_term_id == 'EFO:0010550'" - type: categorical - error_message_suffix: >- - when 'assay_ontology_term_id' is EFO:0010550 - enum: - - "cell" - - "nucleus" - - # If assay_ontology_term_id is EFO:0008796, 'suspension_type' MUST be 'cell' - rule: "assay_ontology_term_id == 'EFO:0008796'" - type: categorical - error_message_suffix: >- - when 'assay_ontology_term_id' is EFO:0008796 - enum: - - "cell" - - # If assay_ontology_term_id is EFO:0700003, 'suspension_type' MUST be 'cell' - rule: "assay_ontology_term_id == 'EFO:0700003'" - type: categorical - error_message_suffix: >- - when 'assay_ontology_term_id' is EFO:0700003 - enum: - - "cell" - - # If assay_ontology_term_id is EFO:0700004, 'suspension_type' MUST be 'cell' - rule: "assay_ontology_term_id == 'EFO:0700004'" - type: categorical - error_message_suffix: >- - when 'assay_ontology_term_id' is EFO:0700004 - enum: - - "cell" - - # If assay_ontology_term_id is EFO:0008780, 'suspension_type' MUST be 'cell' or 'nucleus' - rule: "assay_ontology_term_id == 'EFO:0008780'" - type: categorical - error_message_suffix: >- - when 'assay_ontology_term_id' is EFO:0008780 - enum: - - "cell" - - "nucleus" - - # If assay_ontology_term_id is EFO:0008953, 'suspension_type' MUST be 'cell' - rule: "assay_ontology_term_id == 
'EFO:0008953'" - type: categorical - error_message_suffix: >- - when 'assay_ontology_term_id' is EFO:0008953 - enum: - - "cell" - - # If assay_ontology_term_id is EFO:0700010, 'suspension_type' MUST be 'cell' or 'nucleus' - rule: "assay_ontology_term_id == 'EFO:0700010'" - type: categorical - error_message_suffix: >- - when 'assay_ontology_term_id' is EFO:0700010 - enum: - - "cell" - - "nucleus" - - # If assay_ontology_term_id is EFO:0700011, 'suspension_type' MUST be 'cell' or 'nucleus' - rule: "assay_ontology_term_id == 'EFO:0700011'" - type: categorical - error_message_suffix: >- - when 'assay_ontology_term_id' is EFO:0700011 - enum: - - "cell" - - "nucleus" - - # If assay_ontology_term_id is EFO:0009919, 'suspension_type' MUST be 'cell' or 'nucleus' - rule: "assay_ontology_term_id == 'EFO:0009919'" - type: categorical - error_message_suffix: >- - when 'assay_ontology_term_id' is EFO:0009919 - enum: - - "cell" - - "nucleus" - - # If assay_ontology_term_id is EFO:0030060, 'suspension_type' MUST be 'cell' or 'nucleus' - rule: "assay_ontology_term_id == 'EFO:0030060'" - type: categorical - error_message_suffix: >- - when 'assay_ontology_term_id' is EFO:0030060 - enum: - - "cell" - - "nucleus" - - # If assay_ontology_term_id is EFO:0022490, 'suspension_type' MUST be 'cell' or 'nucleus' - rule: "assay_ontology_term_id == 'EFO:0022490'" - type: categorical - error_message_suffix: >- - when 'assay_ontology_term_id' is EFO:0022490 - enum: - - "cell" - - "nucleus" - - # If assay_ontology_term_id is EFO:0030028, 'suspension_type' MUST be 'cell' or 'nucleus' - rule: "assay_ontology_term_id == 'EFO:0030028'" - type: categorical - error_message_suffix: >- - when 'assay_ontology_term_id' is EFO:0030028 - enum: - - "cell" - - "nucleus" - - # If assay_ontology_term_id is EFO:0008992, 'suspension_type' MUST be 'na' - rule: "assay_ontology_term_id == 'EFO:0008992'" + - EFO:0008994 + match_exact: + terms: + - EFO:0008992 type: categorical - error_message_suffix: >- - when 
'assay_ontology_term_id' is EFO:0008992 enum: - "na" tissue_type: diff --git a/cellxgene_schema_cli/cellxgene_schema/validate.py b/cellxgene_schema_cli/cellxgene_schema/validate.py index 6b62f796..292cc8ec 100644 --- a/cellxgene_schema_cli/cellxgene_schema/validate.py +++ b/cellxgene_schema_cli/cellxgene_schema/validate.py @@ -13,7 +13,6 @@ import scipy from anndata._core.sparse_dataset import SparseDataset from cellxgene_ontology_guide.ontology_parser import OntologyParser -from pandas.errors import UndefinedVariableError from scipy import sparse from . import gencode, schema @@ -630,7 +629,9 @@ def _validate_column_feature_is_filtered(self, column: pd.Series, column_name: s f"these features must be 0." ) - def _validate_column(self, column: pd.Series, column_name: str, df_name: str, column_def: dict): + def _validate_column( + self, column: pd.Series, column_name: str, df_name: str, column_def: dict, default_error_message_suffix=None + ): """ Given a schema definition and the column of a dataframe, verify that the column satisfies the schema. If there are any errors, it adds them to self.errors @@ -640,6 +641,7 @@ def _validate_column(self, column: pd.Series, column_name: str, df_name: str, co :param str df_name: Name of the dataframe :param dict column_def: schema definition for this specific column, e.g. 
schema_def["obs"]["columns"]["cell_type_ontology_term_id"] + :param str default_error_message_suffix: default error message suffix to be added to errors found here :rtype None """ @@ -708,10 +710,11 @@ def _validate_column(self, column: pd.Series, column_name: str, df_name: str, co self._validate_curie_str(term_str, column_name, column_def["curie_constraints"]) # Add error suffix to errors found here - if "error_message_suffix" in column_def: + error_message_suffix = column_def.get("error_message_suffix", default_error_message_suffix) + if error_message_suffix: error_total_count = len(self.errors) for i in range(error_original_count, error_total_count): - self.errors[i] = self.errors[i] + " " + column_def["error_message_suffix"] + self.errors[i] = self.errors[i] + " " + error_message_suffix def _validate_column_dependencies( self, df: pd.DataFrame, df_name: str, column_name: str, dependencies: List[dict] @@ -731,73 +734,38 @@ def _validate_column_dependencies( """ all_rules = [] - for dependency_def in dependencies: - if "complex_rule" in dependency_def: - if "match_ancestors" in dependency_def["complex_rule"]: - query_fn, args = self._generate_match_ancestors_query_fn( - dependency_def["complex_rule"]["match_ancestors"] - ) - term_id, ontologies, ancestors, ancestor_inclusive = args - query_exp = f"@query_fn({term_id}, {ontologies}, {ancestors}, {ancestor_inclusive})" - elif "rule" in dependency_def: - query_exp = dependency_def["rule"] - else: - continue - + terms_to_match = set() + column_to_match = dependency_def["rule"]["column"] + if "match_ancestors_inclusive" in dependency_def["rule"]: + ancestors = dependency_def["rule"]["match_ancestors_inclusive"]["ancestors"] + for ancestor in ancestors: + terms_to_match.update(ONTOLOGY_PARSER.get_term_descendants(ancestor, include_self=True)) + if "match_exact" in dependency_def["rule"]: + terms_to_match.update(dependency_def["rule"]["match_exact"]["terms"]) try: - column = getattr(df.query(query_exp, engine="python"), 
column_name) - except UndefinedVariableError: + match_query = df[column_to_match].isin(terms_to_match) + match_df = df[match_query] + column = getattr(match_df, column_name) + error_message_suffix = dependency_def.get("error_message_suffix", None) + if not error_message_suffix: + matched_values = list(getattr(match_df, column_to_match).unique()) + error_message_suffix = f"when '{column_to_match}' is in {matched_values}" + except KeyError: self.errors.append( f"Checking values with dependencies failed for adata.{df_name}['{column_name}'], " f"this is likely due to missing dependent column in adata.{df_name}." ) return pd.Series(dtype=np.float64) - all_rules.append(query_exp) - - self._validate_column(column, column_name, df_name, dependency_def) + all_rules.append(match_query) + self._validate_column(column, column_name, df_name, dependency_def, error_message_suffix) - # Set column with the data that's left - all_rules = " | ".join(all_rules) - column = getattr(df.query("not (" + all_rules + " )", engine="python"), column_name) + # Return column of data that was not matched by any of the rules + column = getattr(df[~np.logical_or.reduce(all_rules)], column_name) return column - def _generate_match_ancestors_query_fn(self, rule_def: Dict): - """ - Generates vectorized function and args to query a pandas dataframe. Function will determine whether values from - a specified column is a descendant term to a group of specified ancestors, returning a Bool. 
- :param rule_def: defines arguments to pass into vectorized ancestor match validation function - :return: Tuple(function, Tuple(str, List[str], List[str])) - """ - validate_curie_ancestors_vectorized = np.vectorize(self._validate_curie_ancestors) - ancestor_map = rule_def["ancestors"] - inclusive = rule_def["inclusive"] - - # hack: pandas dataframe query doesn't support Dict inputs - ontology_keys = [] - ancestor_list = [] - for key, val in ancestor_map.items(): - ontology_keys.append(key) - ancestor_list.append(val) - - def is_ancestor_match( - term_id: str, - ontologies: List[str], - ancestors: List[str], - ancestor_inclusive: bool, - ) -> bool: - allowed_ancestors = dict(zip(ontologies, ancestors)) - return validate_curie_ancestors_vectorized(term_id, allowed_ancestors, inclusive=ancestor_inclusive) - - return is_ancestor_match, ( - rule_def["column"], - ontology_keys, - ancestor_list, - inclusive, - ) - def _validate_list(self, list_name: str, current_list: List[str], element_type: str): """ Validates the elements of a list based on the type definition. 
Adds errors to self.errors if any diff --git a/cellxgene_schema_cli/tests/test_schema_compliance.py b/cellxgene_schema_cli/tests/test_schema_compliance.py index bf83c97b..f78ad6da 100644 --- a/cellxgene_schema_cli/tests/test_schema_compliance.py +++ b/cellxgene_schema_cli/tests/test_schema_compliance.py @@ -1484,13 +1484,15 @@ def test_suspension_type(self, validator, assay, suspension_types): if "na" in suspension_types: invalid_suspension_type = "nucleus" if "nucleus" not in suspension_types else "cell" obs = validator.adata.obs - obs.loc[obs.index[1], "suspension_type"] = invalid_suspension_type - obs.loc[obs.index[1], "assay_ontology_term_id"] = assay + obs["suspension_type"] = invalid_suspension_type + obs["assay_ontology_term_id"] = assay + obs["suspension_type"] = obs["suspension_type"].astype("category") + obs["assay_ontology_term_id"] = obs["assay_ontology_term_id"].astype("category") validator.validate_adata() assert validator.errors == [ f"ERROR: Column 'suspension_type' in dataframe 'obs' contains invalid values " f"'['{invalid_suspension_type}']'. 
Values must be one of {suspension_types} when " - f"'assay_ontology_term_id' is {assay}" + f"'assay_ontology_term_id' is in ['{assay}']" ] @pytest.mark.parametrize( @@ -1517,13 +1519,15 @@ def test_suspension_type_ancestors_inclusive(self, validator_with_adata, assay, if "na" in suspension_types: invalid_suspension_type = "nucleus" if "nucleus" not in suspension_types else "cell" obs["suspension_type"] = obs["suspension_type"].cat.remove_unused_categories() - obs.loc[obs.index[1], "assay_ontology_term_id"] = assay - obs.loc[obs.index[1], "suspension_type"] = invalid_suspension_type + obs["suspension_type"] = invalid_suspension_type + obs["assay_ontology_term_id"] = assay + obs["suspension_type"] = obs["suspension_type"].astype("category") + obs["assay_ontology_term_id"] = obs["assay_ontology_term_id"].astype("category") validator.validate_adata() assert validator.errors == [ f"ERROR: Column 'suspension_type' in dataframe 'obs' contains invalid values " f"'['{invalid_suspension_type}']'. Values must be one of {suspension_types} when " - f"'assay_ontology_term_id' is {assay} or its descendants" + f"'assay_ontology_term_id' is in ['{assay}']" ] def test_suspension_type_with_descendant_term_id_failure(self, validator_with_adata): @@ -1533,14 +1537,15 @@ def test_suspension_type_with_descendant_term_id_failure(self, validator_with_ad """ validator = validator_with_adata obs = validator.adata.obs - obs.loc[obs.index[0], "assay_ontology_term_id"] = "EFO:0022615" # descendant of EFO:0008994 - obs.loc[obs.index[0], "suspension_type"] = "nucleus" - + obs["suspension_type"] = "nucleus" + obs["assay_ontology_term_id"] = "EFO:0022615" # descendant of EFO:0008994 + obs["suspension_type"] = obs["suspension_type"].astype("category") + obs["assay_ontology_term_id"] = obs["assay_ontology_term_id"].astype("category") validator.validate_adata() assert validator.errors == [ "ERROR: Column 'suspension_type' in dataframe 'obs' contains invalid values " "'['nucleus']'. 
Values must be one of ['na'] when " - "'assay_ontology_term_id' is EFO:0008994 or its descendants" + "'assay_ontology_term_id' is in ['EFO:0022615']" ] def test_suspension_type_with_descendant_term_id_success(self, validator_with_adata): From 2fc4898c6267797740a794cf222e42fdd9b7508d Mon Sep 17 00:00:00 2001 From: Brian Raymor Date: Tue, 17 Dec 2024 17:40:33 -0800 Subject: [PATCH 26/28] Add template for adding species --- .github/ISSUE_TEMPLATE/add-species.md | 228 ++++++++++++++++++++++++++ .github/ISSUE_TEMPLATE/tech-issue.md | 8 +- 2 files changed, 233 insertions(+), 3 deletions(-) create mode 100644 .github/ISSUE_TEMPLATE/add-species.md diff --git a/.github/ISSUE_TEMPLATE/add-species.md b/.github/ISSUE_TEMPLATE/add-species.md new file mode 100644 index 00000000..31504d14 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/add-species.md @@ -0,0 +1,228 @@ +--- +name: Add species +about: Editor's template for adding new species +title: Draft +labels: drafting, multispecies discovery, schema +assignees: brianraymor + +--- + +## Pending Issues + +1. Waiting on sscrdv to be submitted to OLS for use in references +1. [FAANG](http://www.faang.org/) is the Functional Annotation of ANimal Genomes project. _We are working to understand the genotype to phenotype link in domesticated animals._ Per their [Ontology Improver](https://data.faang.org/ontology?sortTerm=key&sortDirection=asc), *Dv terms are not referenced. Both UBERON and CL are in use. Their [schema](https://github.com/FAANG/dcc-metadata/blob/9e7c1b5304fc57a724d197384e83243562bebbf4/json_schema/type/samples/faang_samples_specimen.metadata_rules.json#L154): + +``` +"name": "developmental stage", +"description": "Ontology for Developmental stage, UBERON is preferred to EFO.", +``` + + +## Design + +This draft design reflects additions to corresponding sections in [schema 5.2.0](https://github.com/chanzuckerberg/single-cell-curation/blob/main/schema/5.2.0/schema.md). 
Reviewers are expected to be familiar with the CELLxGENE schema. + +**Editorial Notes** that are inlined in the design below will not be surfaced in the schema. + +--- + +### Required Ontologies + + +| Ontology | OBO Prefix | Release | Download | +|:--|:--|:--|:--| +| [Unavailable](https://github.com/OBOFoundry/OBOFoundry.github.io/tree/master/ontology) | SscrDv | [Releases](https://github.com/obophenotype/developmental-stage-ontologies/releases) | TBD | +||||| + + +#### Editorial Notes + +This ontology is under active development. CELLxGENE pins ontology releases in each version of the schema. A specific release of the ontology above must be selected in the future. + + +--- + +### Required Gene Annotations + +| Organism | Source | Required version | Download | +|:--|:--|:--|:--| +| "NCBITaxon:9823"
    for Sus scrofa domesticus | [ENSEMBL (Sus scrofa domesticus)] | Sscrofa11.1 (GCA_000003025.6) | [Sus_scrofa.Sscrofa11.1.113.gtf] | + + +[ENSEMBL (Sus scrofa domesticus)]: https://useast.ensembl.org/Sus_scrofa/Info/Index +[Sus_scrofa.Sscrofa11.1.113.gtf]: https://ftp.ensembl.org/pub/release-113/gtf/sus_scrofa/Sus_scrofa.Sscrofa11.1.113.gtf.gz + +#### Editorial Notes + + +--- + +## `obs` (Cell Metadata) + +### cell_type_ontology_term_id + +No schema changes are required. + +#### Editorial Notes + +--- + +### development_stage_ontology_term_id + + + + + + + + + + + + + + +
    Keydevelopment_stage_ontology_term_id
    AnnotatorCurator MUST annotate.
    Value + categorical with str categories. If unavailable, this MUST be "unknown".

    + If organism_ontology_term_id is "NCBITaxon:9823" for Sus scrofa domesticus, this MUST be the most accurate descendant of SscrDv:0000000 for life cycle stage. +
    +
    + +#### Editorial Notes + +This may be outdated, but [potential recommendations](https://github.com/obophenotype/developmental-stage-ontologies/blob/master/external/bgee/report.md#sus-scrofa): + +``` +UBERON:0000104 life cycle + UBERON:0000068 embryo stage + UBERON:0000106 zygote stage + UBERON:0000107 cleavage stage + UBERON:0007232 2 cell stage + UBERON:0007233 4 cell stage + UBERON:0007236 8 cell stage + UBERON:0000108 blastula stage + UBERON:0000109 gastrula stage + UBERON:0000110 neurula stage + UBERON:0000111 organogenesis stage + SscrDv:0000081 ridge limb stage (pig) + SscrDv:0000082 bud limb stage (pig) + SscrDv:0000083 paddle limb stage (pig) + UBERON:0007220 late embryonic stage + UBERON:0000092 post-embryonic stage + UBERON:0000066 fully formed stage + UBERON:0000112 sexually immature stage + UBERON:0018685 nursing stage + UBERON:0007221 neonate stage + SscrDv:0000072 0-day-old stage (pig) + SscrDv:0000073 1-day-old stage (pig) + SscrDv:0000074 2-day-old stage (pig) + SscrDv:0000075 3-day-old stage (pig) + SscrDv:0000076 4-day-old stage (pig) + SscrDv:0000077 5-day-old stage (pig) + SscrDv:0000078 6-day-old stage (pig) + UBERON:0034920 infant stage + SscrDv:0000010 1-week-old stage (pig) + SscrDv:0000011 2-week-old stage (pig) + SscrDv:0000012 3-week-old stage (pig) + SscrDv:0000018 21-day-old stage (pig) + SscrDv:0000019 22-day-old stage (pig) + SscrDv:0000020 23-day-old stage (pig) + SscrDv:0000021 24-day-old stage (pig) + SscrDv:0000022 25-day-old stage (pig) + SscrDv:0000023 26-day-old stage (pig) + SscrDv:0000024 27-day-old stage (pig) + SscrDv:0000013 4-week-old stage (pig) + SscrDv:0000025 28-day-old stage (pig) + SscrDv:0000026 29-day-old stage (pig) + SscrDv:0000027 30-day-old stage (pig) + SscrDv:0000028 31-day-old stage (pig) + SscrDv:0000029 32-day-old stage (pig) + SscrDv:0000030 33-day-old stage (pig) + SscrDv:0000031 34-day-old stage (pig) + SscrDv:0000014 5-week-old stage (pig) + SscrDv:0000032 35-day-old stage (pig) + SscrDv:0000033 
36-day-old stage (pig) + SscrDv:0000034 37-day-old stage (pig) + SscrDv:0000035 38-day-old stage (pig) + SscrDv:0000036 39-day-old stage (pig) + SscrDv:0000037 40-day-old stage (pig) + SscrDv:0000038 41-day-old stage (pig) + SscrDv:0000015 6-week-old stage (pig) + SscrDv:0000016 7-week-old stage (pig) + UBERON:0034919 juvenile stage + SscrDv:0000039 2-month-old stage (pig) + SscrDv:0000017 8-week-old stage (pig) + SscrDv:0000040 9-week-old stage (pig) + SscrDv:0000041 10-week-old stage (pig) + SscrDv:0000042 11-week-old stage (pig) + SscrDv:0000043 3-month-old stage (pig) + SscrDv:0000044 12-week-old stage (pig) + SscrDv:0000045 13-week-old stage (pig) + SscrDv:0000046 14-week-old stage (pig) + SscrDv:0000047 15-week-old stage (pig) + SscrDv:0000048 4-month-old stage (pig) + SscrDv:0000049 16-week-old stage (pig) + SscrDv:0000050 17-week-old stage (pig) + SscrDv:0000051 18-week-old stage (pig) + SscrDv:0000052 19-week-old stage (pig) + SscrDv:0000053 20-week-old stage (pig) + SscrDv:0000054 5-month-old stage (pig) + SscrDv:0000055 21-week-old stage (pig) + SscrDv:0000056 22-week-old stage (pig) + SscrDv:0000057 23-week-old stage (pig) + SscrDv:0000058 24-week-old stage (pig) + SscrDv:0000059 6-month-old stage (pig) + SscrDv:0000060 7-month-old stage (pig) + SscrDv:0000061 8-month-old stage (pig) + SscrDv:0000062 9-month-old stage (pig) + SscrDv:0000063 10-month-old stage (pig) + UBERON:0000113 post-juvenile + UBERON:0018241 prime adult stage + SscrDv:0000064 11-month-old stage (pig) + SscrDv:0000065 1-year-old stage (pig) + SscrDv:0000066 2-year-old stage (pig) + SscrDv:0000067 3-year-old stage (pig) + SscrDv:0000068 4-year-old stage (pig) + SscrDv:0000069 5-year-old stage (pig) + SscrDv:0000070 6-year-old stage (pig) + SscrDv:0000071 7-year-old stage (pig) + UBERON:0007222 late adult stage +``` + +--- + +### disease_ontology_term_id + +No schema changes are required. 
+ +#### Editorial Notes + +--- + +### organism_ontology_term_id + +organism_ontology_term_id is "NCBITaxon:9823" for Sus scrofa domesticus + +--- + +### sex_ontology_term_id + +No schema changes are required. + +#### Editorial Notes + +--- + +### tissue_ontology_term_id + +No schema changes are required. + + +#### Editorial Notes + +--- + +## Reference + + +[BGEE](https://www.bgee.org/species/9823) diff --git a/.github/ISSUE_TEMPLATE/tech-issue.md b/.github/ISSUE_TEMPLATE/tech-issue.md index 4965be8c..7b75dbb8 100644 --- a/.github/ISSUE_TEMPLATE/tech-issue.md +++ b/.github/ISSUE_TEMPLATE/tech-issue.md @@ -1,9 +1,11 @@ --- name: Tech Issue -about: Engineering-specific technical work that is not product-specific. Engineering team "owns" these issues. -title: "" +about: Engineering-specific technical work that is not product-specific. Engineering + team "owns" these issues. +title: '' labels: tech -assignees: "" +assignees: '' + --- ## Motivation From b4b1a5f54534eed0d40135492be8a4cd3f28540c Mon Sep 17 00:00:00 2001 From: Brian Raymor Date: Thu, 19 Dec 2024 15:23:39 -0800 Subject: [PATCH 27/28] updated X matrix requirement (#1157) --- schema/drafts/5.3.0.md | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/schema/drafts/5.3.0.md b/schema/drafts/5.3.0.md index 7090d268..70de7b79 100644 --- a/schema/drafts/5.3.0.md +++ b/schema/drafts/5.3.0.md @@ -163,9 +163,8 @@ The types below are python3 types. Note that a python3 `str` is a sequence of Un ## `X` (Matrix Layers) -The data stored in the `X` data matrix is the data that is viewable in CELLxGENE Explorer. CELLxGENE does not impose any additional constraints on the `X` data matrix. +The data stored in the `AnnData.X` data matrix is the data that is viewable in CELLxGENE Explorer.
For `AnnData.X`, `AnnData.raw.X`, and all layers, if a data matrix contains 50% or more values that are zeros, it MUST be encoded as a [`scipy.sparse.csr_matrix`](https://docs.scipy.org/doc/scipy/reference/generated/scipy.sparse.csr_matrix.html) with zero values encoded as implicit zeros. -In any layer, if a matrix has 50% or more values that are zeros, it is STRONGLY RECOMMENDED that the matrix be encoded as a [`scipy.sparse.csr_matrix`](https://docs.scipy.org/doc/scipy/reference/generated/scipy.sparse.csr_matrix.html) with zero values encoded as implicit zeros. CELLxGENE's matrix layer requirements are tailored to optimize data reuse. Because each assay has different characteristics, the requirements differ by assay type. In general, CELLxGENE requires submission of "raw" data suitable for computational reuse when a standard raw matrix format exists for an assay. It is STRONGLY RECOMMENDED to also include a "normalized" matrix with processed values ready for data analysis and suitable for visualization in CELLxGENE Explorer. So that CELLxGENE's data can be provided in download formats suitable for both R and Python, the schema imposes the following requirements: @@ -2097,6 +2096,8 @@ When a dataset is uploaded, CELLxGENE Discover MUST automatically add the `schem * Updated the requirements for spatial[library_id]['scalefactors'] to include descendants of _Visium Spatial Gene Expression_. * Updated the requirements for spatial[library_id]['scalefactors']['spot_diameter_fullres'] to include descendants of _Visium Spatial Gene Expression_. * Updated the requirements for spatial[library_id]['scalefactors']['tissue_hires_scalef'] to include descendants of _Visium Spatial Gene Expression_. +* X (Matrix Layers) + * Updated the STRONGLY RECOMMENDED requirement to a MUST. A matrix with 50% or more values that are zeros MUST be encoded as `scipy.sparse.csr_matrix`. 
### schema v5.2.0 From e8c97c074bf5c49a9cbb4a08e3715c603cee4429 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Tue, 7 Jan 2025 14:16:00 -0800 Subject: [PATCH 28/28] chore(deps): update numpy requirement from <2 to <3 in /cellxgene_schema_cli (#1163) Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- cellxgene_schema_cli/requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cellxgene_schema_cli/requirements.txt b/cellxgene_schema_cli/requirements.txt index d8b2bcdd..18b58ada 100644 --- a/cellxgene_schema_cli/requirements.txt +++ b/cellxgene_schema_cli/requirements.txt @@ -2,7 +2,7 @@ anndata>=0.8,<0.11 cellxgene-ontology-guide==1.3.0 # update before a schema migration click<9 Cython<4 -numpy<2 +numpy<3 pandas>2,<3 PyYAML<7 scipy<2