From dbde2f0c30c2a407dc4ed632040fcfdfdded31cd Mon Sep 17 00:00:00 2001
From: nayib-jose-gloria <ngloria@chanzuckerberg.com>
Date: Mon, 7 Oct 2024 15:53:19 -0400
Subject: [PATCH 1/3] refactor suspension type validation

---
 .../schema_definitions/schema_definition.yaml | 319 +++++-------------
 .../cellxgene_schema/validate.py              |  54 +--
 .../tests/test_schema_compliance.py           |  35 +-
 3 files changed, 104 insertions(+), 304 deletions(-)

diff --git a/cellxgene_schema_cli/cellxgene_schema/schema_definitions/schema_definition.yaml b/cellxgene_schema_cli/cellxgene_schema/schema_definitions/schema_definition.yaml
index d14a0442e..3af820390 100644
--- a/cellxgene_schema_cli/cellxgene_schema/schema_definitions/schema_definition.yaml
+++ b/cellxgene_schema_cli/cellxgene_schema/schema_definitions/schema_definition.yaml
@@ -136,6 +136,94 @@ components:
         add_labels:
           - type: curie
             to_column: assay
+        dependencies:
+          - # If suspension_type is cell
+            rule: "suspension_type == 'cell'"
+            error_message_suffix: >-
+              'suspension_type' is 'cell', but 'assay_ontology_term_id' does not match one of the corresponding assays
+              for 'cell' in the schema definition.
+            type: curie
+            curie_constraints:
+              ontologies:
+                - EFO
+              allowed:
+                terms:
+                  EFO:
+                    - EFO:0030080
+                    - EFO:0700004
+                    - EFO:0700003
+                    - EFO:0010010
+                    - EFO:0008722
+                    - EFO:0700011
+                    - EFO:0008780
+                    - EFO:0008796
+                    - EFO:0030060
+                    - EFO:0030002
+                    - EFO:0008853
+                    - EFO:0022490
+                    - EFO:0010550
+                    - EFO:0030028
+                    - EFO:0008919
+                    - EFO:0010184
+                    - EFO:0009919
+                    - EFO:0008953
+                    - EFO:0700010
+                ancestors:
+                  EFO:
+                    - EFO:0030080
+                    - EFO:0008919
+                    - EFO:0010184
+          - # If suspension_type is nucleus
+            rule: "suspension_type == 'nucleus'"
+            error_message_suffix: >-
+              'suspension_type' is 'nucleus', but 'assay_ontology_term_id' does not match one of the corresponding
+              assays for 'nucleus' in the schema definition.
+            type: curie
+            curie_constraints:
+              ontologies:
+                - EFO
+              allowed:
+                terms:
+                  EFO:
+                    - EFO:0030080
+                    - EFO:0007045
+                    - EFO:0010010
+                    - EFO:0008720
+                    - EFO:0008722
+                    - EFO:0700011
+                    - EFO:0008780
+                    - EFO:0030060
+                    - EFO:0002761
+                    - EFO:0022490
+                    - EFO:0030026
+                    - EFO:0010550
+                    - EFO:0030028
+                    - EFO:0010184
+                    - EFO:0009919
+                    - EFO:0700010
+                ancestors:
+                  EFO:
+                    - EFO:0030080
+                    - EFO:0007045
+                    - EFO:0002761
+                    - EFO:0010184
+          - # If suspension_type is na
+            rule: "suspension_type == 'na'"
+            error_message_suffix: >-
+              'suspension_type' is 'na', but 'assay_ontology_term_id' does not match one of the corresponding
+              assays for 'na' in the schema definition.
+            type: curie
+            curie_constraints:
+              ontologies:
+                - EFO
+              allowed:
+                terms:
+                  EFO:
+                    - EFO:0008992
+                    - EFO:0008994
+                ancestors:
+                  EFO:
+                    - EFO:0008994
       disease_ontology_term_id:
         error_message_suffix: "Only 'PATO:0000461' (normal), 'MONDO:0021178' (injury) or descendant terms thereof, or descendant terms of 'MONDO:0000001' (disease) are allowed"
         type: curie
@@ -345,237 +433,6 @@ components:
           - "cell"
           - "nucleus"
           - "na"
-        error_message_suffix: >-
-          when 'assay_ontology_term_id' does not match one of the assays in the schema definition.
-        # if no dependencies are matched
-        warning_message: >-
-          Data contains assay(s) that are not represented in the 'suspension_type' schema definition table. Ensure you have
-          selected the most appropriate value for the assay(s) between 'cell', 'nucleus', and 'na'. Please contact cellxgene@chanzuckerberg.com
-          during submission so that the assay(s) can be added to the schema definition document.
-        dependencies:
-          - # If assay_ontology_term_id is EFO:0030080 or its descendants, 'suspension_type' MUST be 'cell' or 'nucleus'
-            complex_rule:
-              match_ancestors:
-                column: assay_ontology_term_id
-                ancestors:
-                  EFO:
-                    - EFO:0030080
-                inclusive: True
-            type: categorical
-            error_message_suffix: >-
-              when 'assay_ontology_term_id' is EFO:0030080 or its descendants
-            enum:
-              - "cell"
-              - "nucleus"
-          - # If assay_ontology_term_id is EFO:0007045 or its descendants, 'suspension_type' MUST be 'nucleus'
-            complex_rule:
-              match_ancestors:
-                column: assay_ontology_term_id
-                ancestors:
-                  EFO:
-                    - EFO:0007045
-                inclusive: True
-            type: categorical
-            error_message_suffix: >-
-              when 'assay_ontology_term_id' is EFO:0007045 or its descendants
-            enum:
-              - "nucleus"
-          - # If assay_ontology_term_id is EFO:0010184 or its descendants, 'suspension_type' MUST be 'cell' or 'nucleus'
-            complex_rule:
-              match_ancestors:
-                column: assay_ontology_term_id
-                ancestors:
-                  EFO:
-                    - EFO:0010184
-                inclusive: True
-            type: categorical
-            error_message_suffix: >-
-              when 'assay_ontology_term_id' is EFO:0010184 or its descendants
-            enum:
-              - "cell"
-              - "nucleus"
-          - # If assay_ontology_term_id is EFO:0008994 or its descendants, 'suspension_type' MUST be 'na'
-            complex_rule:
-              match_ancestors:
-                column: assay_ontology_term_id
-                ancestors:
-                  EFO:
-                    - EFO:0008994
-                inclusive: True
-            type: categorical
-            error_message_suffix: >-
-              when 'assay_ontology_term_id' is EFO:0008994 or its descendants
-            enum:
-              - "na"
-          - # If assay_ontology_term_id is EFO:0008919 or its descendants, 'suspension_type' MUST be 'cell'
-            complex_rule:
-              match_ancestors:
-                column: assay_ontology_term_id
-                ancestors:
-                  EFO:
-                    - EFO:0008919
-                inclusive: True
-            type: categorical
-            error_message_suffix: >-
-              when 'assay_ontology_term_id' is EFO:0008919 or its descendants
-            enum:
-              - "cell"
-          - # If assay_ontology_term_id is EFO:0002761 or its descendants, 'suspension_type' MUST be 'nucleus'
-            complex_rule:
-              match_ancestors:
-                column: assay_ontology_term_id
-                ancestors:
-                  EFO:
-                    - EFO:0002761
-                inclusive: True
-            type: categorical
-            error_message_suffix: >-
-              when 'assay_ontology_term_id' is EFO:0002761 or its descendants
-            enum:
-              - "nucleus"
-          - # If assay_ontology_term_id is EFO:0010010, 'suspension_type' MUST be 'cell' or 'nucleus'
-            rule: "assay_ontology_term_id == 'EFO:0010010'"
-            type: categorical
-            error_message_suffix: >-
-              when 'assay_ontology_term_id' is EFO:0010010
-            enum:
-              - "cell"
-              - "nucleus"
-          - # If assay_ontology_term_id is EFO:0008720, 'suspension_type' MUST be 'nucleus'
-            rule: "assay_ontology_term_id == 'EFO:0008720'"
-            type: categorical
-            error_message_suffix: >-
-              when 'assay_ontology_term_id' is EFO:0008720
-            enum:
-              - "nucleus"
-          - # If assay_ontology_term_id is EFO:0008722, 'suspension_type' MUST be 'cell' or 'nucleus'
-            rule: "assay_ontology_term_id == 'EFO:0008722'"
-            type: categorical
-            error_message_suffix: >-
-              when 'assay_ontology_term_id' is EFO:0008722
-            enum:
-              - "cell"
-              - "nucleus"
-          - # If assay_ontology_term_id is EFO:0030002, 'suspension_type' MUST be 'cell'
-            rule: "assay_ontology_term_id == 'EFO:0030002'"
-            type: categorical
-            error_message_suffix: >-
-              when 'assay_ontology_term_id' is EFO:0030002
-            enum:
-              - "cell"
-          - # If assay_ontology_term_id is EFO:0008853, 'suspension_type' MUST be 'cell'
-            rule: "assay_ontology_term_id == 'EFO:0008853'"
-            type: categorical
-            error_message_suffix: >-
-              when 'assay_ontology_term_id' is EFO:0008853
-            enum:
-              - "cell"
-          - # If assay_ontology_term_id is EFO:0030026, 'suspension_type' MUST be 'nucleus'
-            rule: "assay_ontology_term_id == 'EFO:0030026'"
-            type: categorical
-            error_message_suffix: >-
-              when 'assay_ontology_term_id' is EFO:0030026
-            enum:
-              - "nucleus"
-          - # If assay_ontology_term_id is EFO:0010550, 'suspension_type' MUST be 'cell' or 'nucleus'
-            rule: "assay_ontology_term_id == 'EFO:0010550'"
-            type: categorical
-            error_message_suffix: >-
-              when 'assay_ontology_term_id' is EFO:0010550
-            enum:
-              - "cell"
-              - "nucleus"
-          - # If assay_ontology_term_id is EFO:0008796, 'suspension_type' MUST be 'cell'
-            rule: "assay_ontology_term_id == 'EFO:0008796'"
-            type: categorical
-            error_message_suffix: >-
-              when 'assay_ontology_term_id' is EFO:0008796
-            enum:
-              - "cell"
-          - # If assay_ontology_term_id is EFO:0700003, 'suspension_type' MUST be 'cell'
-            rule: "assay_ontology_term_id == 'EFO:0700003'"
-            type: categorical
-            error_message_suffix: >-
-              when 'assay_ontology_term_id' is EFO:0700003
-            enum:
-              - "cell"
-          - # If assay_ontology_term_id is EFO:0700004, 'suspension_type' MUST be 'cell'
-            rule: "assay_ontology_term_id == 'EFO:0700004'"
-            type: categorical
-            error_message_suffix: >-
-              when 'assay_ontology_term_id' is EFO:0700004
-            enum:
-              - "cell"
-          - # If assay_ontology_term_id is EFO:0008780, 'suspension_type' MUST be 'cell' or 'nucleus'
-            rule: "assay_ontology_term_id == 'EFO:0008780'"
-            type: categorical
-            error_message_suffix: >-
-              when 'assay_ontology_term_id' is EFO:0008780
-            enum:
-              - "cell"
-              - "nucleus"
-          - # If assay_ontology_term_id is EFO:0008953, 'suspension_type' MUST be 'cell'
-            rule: "assay_ontology_term_id == 'EFO:0008953'"
-            type: categorical
-            error_message_suffix: >-
-              when 'assay_ontology_term_id' is EFO:0008953
-            enum:
-              - "cell"
-          - # If assay_ontology_term_id is EFO:0700010, 'suspension_type' MUST be 'cell' or 'nucleus'
-            rule: "assay_ontology_term_id == 'EFO:0700010'"
-            type: categorical
-            error_message_suffix: >-
-              when 'assay_ontology_term_id' is EFO:0700010
-            enum:
-              - "cell"
-              - "nucleus"
-          - # If assay_ontology_term_id is EFO:0700011, 'suspension_type' MUST be 'cell' or 'nucleus'
-            rule: "assay_ontology_term_id == 'EFO:0700011'"
-            type: categorical
-            error_message_suffix: >-
-              when 'assay_ontology_term_id' is EFO:0700011
-            enum:
-              - "cell"
-              - "nucleus"
-          - # If assay_ontology_term_id is EFO:0009919, 'suspension_type' MUST be 'cell' or 'nucleus'
-            rule: "assay_ontology_term_id == 'EFO:0009919'"
-            type: categorical
-            error_message_suffix: >-
-              when 'assay_ontology_term_id' is EFO:0009919
-            enum:
-              - "cell"
-              - "nucleus"
-          - # If assay_ontology_term_id is EFO:0030060, 'suspension_type' MUST be 'cell' or 'nucleus'
-            rule: "assay_ontology_term_id == 'EFO:0030060'"
-            type: categorical
-            error_message_suffix: >-
-              when 'assay_ontology_term_id' is EFO:0030060
-            enum:
-              - "cell"
-              - "nucleus"
-          - # If assay_ontology_term_id is EFO:0022490, 'suspension_type' MUST be 'cell' or 'nucleus'
-            rule: "assay_ontology_term_id == 'EFO:0022490'"
-            type: categorical
-            error_message_suffix: >-
-              when 'assay_ontology_term_id' is EFO:0022490
-            enum:
-              - "cell"
-              - "nucleus"
-          - # If assay_ontology_term_id is EFO:0030028, 'suspension_type' MUST be 'cell' or 'nucleus'
-            rule: "assay_ontology_term_id == 'EFO:0030028'"
-            type: categorical
-            error_message_suffix: >-
-              when 'assay_ontology_term_id' is EFO:0030028
-            enum:
-              - "cell"
-              - "nucleus"
-          - # If assay_ontology_term_id is EFO:0008992, 'suspension_type' MUST be 'na'
-            rule: "assay_ontology_term_id == 'EFO:0008992'"
-            type: categorical
-            error_message_suffix: >-
-              when 'assay_ontology_term_id' is EFO:0008992
-            enum:
-              - "na"
       tissue_type:
         type: categorical
         enum:
diff --git a/cellxgene_schema_cli/cellxgene_schema/validate.py b/cellxgene_schema_cli/cellxgene_schema/validate.py
index 01a2d1401..9ac706d0c 100644
--- a/cellxgene_schema_cli/cellxgene_schema/validate.py
+++ b/cellxgene_schema_cli/cellxgene_schema/validate.py
@@ -641,7 +641,7 @@ def _validate_column(self, column: pd.Series, column_name: str, df_name: str, co
                 self.errors[i] = self.errors[i] + " " + column_def["error_message_suffix"]
 
     def _validate_column_dependencies(
-        self, df: pd.DataFrame, df_name: str, column_name: str, dependencies: List[dict]
+        self, df: pd.DataFrame, df_name: str, column_name: str, column_def: dict
     ) -> pd.Series:
         """
         Validates subset of columns based on dependencies, for instance development_stage_ontology_term_id has
@@ -654,20 +654,15 @@ def _validate_column_dependencies(
         :param pd.DataFrame df: pandas dataframe containing the column to be validated
         :param str df_name: the name of dataframe in the adata object, e.g. "obs"
         :param str column_name: the name of the column to be validated
-        :param list dependencies: a list of dependency definitions, which is a list of column definitions with a "rule"
+        :param dict column_def:dict of column definitions, including dependency definitions, which is a list of
+        column definitions with a "rule"
         """
 
         all_rules = []
+        dependencies = column_def["dependencies"]
 
         for dependency_def in dependencies:
-            if "complex_rule" in dependency_def:
-                if "match_ancestors" in dependency_def["complex_rule"]:
-                    query_fn, args = self._generate_match_ancestors_query_fn(
-                        dependency_def["complex_rule"]["match_ancestors"]
-                    )
-                    term_id, ontologies, ancestors, ancestor_inclusive = args
-                    query_exp = f"@query_fn({term_id}, {ontologies}, {ancestors}, {ancestor_inclusive})"
-            elif "rule" in dependency_def:
+            if "rule" in dependency_def:
                 query_exp = dependency_def["rule"]
             else:
                 continue
@@ -683,6 +678,7 @@ def _validate_column_dependencies(
 
             all_rules.append(query_exp)
 
+            self._validate_column(column, column_name, df_name, column_def)
             self._validate_column(column, column_name, df_name, dependency_def)
 
         # Set column with the data that's left
@@ -691,40 +687,6 @@ def _validate_column_dependencies(
 
         return column
 
-    def _generate_match_ancestors_query_fn(self, rule_def: Dict):
-        """
-        Generates vectorized function and args to query a pandas dataframe. Function will determine whether values from
-        a specified column is a descendant term to a group of specified ancestors, returning a Bool.
-        :param rule_def: defines arguments to pass into vectorized ancestor match validation function
-        :return: Tuple(function, Tuple(str, List[str], List[str]))
-        """
-        validate_curie_ancestors_vectorized = np.vectorize(self._validate_curie_ancestors)
-        ancestor_map = rule_def["ancestors"]
-        inclusive = rule_def["inclusive"]
-
-        # hack: pandas dataframe query doesn't support Dict inputs
-        ontology_keys = []
-        ancestor_list = []
-        for key, val in ancestor_map.items():
-            ontology_keys.append(key)
-            ancestor_list.append(val)
-
-        def is_ancestor_match(
-            term_id: str,
-            ontologies: List[str],
-            ancestors: List[str],
-            ancestor_inclusive: bool,
-        ) -> bool:
-            allowed_ancestors = dict(zip(ontologies, ancestors))
-            return validate_curie_ancestors_vectorized(term_id, allowed_ancestors, inclusive=ancestor_inclusive)
-
-        return is_ancestor_match, (
-            rule_def["column"],
-            ontology_keys,
-            ancestor_list,
-            inclusive,
-        )
-
     def _validate_list(self, list_name: str, current_list: List[str], element_type: str):
         """
         Validates the elements of a list based on the type definition. Adds errors to self.errors if any
@@ -919,12 +881,10 @@ def _validate_dataframe(self, df_name: str):
 
                 # First check if there are dependencies with other columns and work with a subset of the data if so
                 if "dependencies" in column_def:
-                    column = self._validate_column_dependencies(df, df_name, column_name, column_def["dependencies"])
+                    column = self._validate_column_dependencies(df, df_name, column_name, column_def)
 
                 # If after validating dependencies there's still values in the column, validate them.
                 if len(column) > 0:
-                    if "warning_message" in column_def:
-                        self.warnings.append(column_def["warning_message"])
                     self._validate_column(column, column_name, df_name, column_def)
 
     def _validate_uns_dict(self, uns_dict: dict) -> None:
diff --git a/cellxgene_schema_cli/tests/test_schema_compliance.py b/cellxgene_schema_cli/tests/test_schema_compliance.py
index 7ee65a6df..11eca9541 100644
--- a/cellxgene_schema_cli/tests/test_schema_compliance.py
+++ b/cellxgene_schema_cli/tests/test_schema_compliance.py
@@ -1371,9 +1371,9 @@ def test_suspension_type(self, validator, assay, suspension_types):
         obs.loc[obs.index[1], "assay_ontology_term_id"] = assay
         validator.validate_adata()
         assert validator.errors == [
-            f"ERROR: Column 'suspension_type' in dataframe 'obs' contains invalid values "
-            f"'['{invalid_suspension_type}']'. Values must be one of {suspension_types} when "
-            f"'assay_ontology_term_id' is {assay}"
+            f"ERROR: '{assay}' in 'assay_ontology_term_id' is not an allowed term id. 'suspension_type' is "
+            f"'{invalid_suspension_type}', but 'assay_ontology_term_id' does not match one of the corresponding assays "
+            f"for '{invalid_suspension_type}' in the schema definition."
         ]
 
     @pytest.mark.parametrize(
@@ -1404,9 +1404,9 @@ def test_suspension_type_ancestors_inclusive(self, validator_with_adata, assay,
         obs.loc[obs.index[1], "suspension_type"] = invalid_suspension_type
         validator.validate_adata()
         assert validator.errors == [
-            f"ERROR: Column 'suspension_type' in dataframe 'obs' contains invalid values "
-            f"'['{invalid_suspension_type}']'. Values must be one of {suspension_types} when "
-            f"'assay_ontology_term_id' is {assay} or its descendants"
+            f"ERROR: '{assay}' in 'assay_ontology_term_id' is not an allowed term id. 'suspension_type' is "
+            f"'{invalid_suspension_type}', but 'assay_ontology_term_id' does not match one of the corresponding assays "
+            f"for '{invalid_suspension_type}' in the schema definition."
         ]
 
     def test_suspension_type_with_descendant_term_id_failure(self, validator_with_adata):
@@ -1421,9 +1421,9 @@ def test_suspension_type_with_descendant_term_id_failure(self, validator_with_ad
 
         validator.validate_adata()
         assert validator.errors == [
-            "ERROR: Column 'suspension_type' in dataframe 'obs' contains invalid values "
-            "'['nucleus']'. Values must be one of ['na'] when "
-            "'assay_ontology_term_id' is EFO:0008994 or its descendants"
+            "ERROR: 'EFO:0022615' in 'assay_ontology_term_id' is not an allowed term id. 'suspension_type' is "
+            "'nucleus', but 'assay_ontology_term_id' does not match one of the corresponding assays "
+            "for 'nucleus' in the schema definition."
         ]
 
     def test_suspension_type_with_descendant_term_id_success(self, validator_with_adata):
@@ -1439,23 +1439,6 @@ def test_suspension_type_with_descendant_term_id_success(self, validator_with_ad
         validator.validate_adata()
         assert validator.errors == []
 
-    def test_suspension_type_unrecognized_assay(self, validator_with_adata):
-        """
-        suspension_id categorical with str categories. This field MUST be "cell", "nucleus", or "na". The allowed
-        values depend on the assay_ontology_term_id. MUST warn if the corresponding assay is not recognized.
-        """
-        validator = validator_with_adata
-        obs = validator.adata.obs
-        obs.loc[obs.index[1], "assay_ontology_term_id"] = "EFO:0700005"
-        validator.validate_adata()
-        assert validator.errors == []
-        assert validator.warnings == [
-            "WARNING: Data contains assay(s) that are not represented in the 'suspension_type' schema "
-            "definition table. Ensure you have selected the most appropriate value for the assay(s) between "
-            "'cell', 'nucleus', and 'na'. Please contact cellxgene@chanzuckerberg.com "
-            "during submission so that the assay(s) can be added to the schema definition document."
-        ]
-
     def test_categories_with_zero_values_warn(self, validator_with_adata):
         validator = validator_with_adata
         obs = validator.adata.obs

From 3a21ca1c59fdad89aeb427e2c7658f6b17104d35 Mon Sep 17 00:00:00 2001
From: nayib-jose-gloria <ngloria@chanzuckerberg.com>
Date: Mon, 7 Oct 2024 17:02:40 -0400
Subject: [PATCH 2/3] refactor suspension type validation + tests

---
 .../schema_definitions/schema_definition.yaml | 56 ++++++++++---------
 .../cellxgene_schema/validate.py              | 41 +++++---------
 .../tests/test_schema_compliance.py           | 17 +++---
 3 files changed, 52 insertions(+), 62 deletions(-)

diff --git a/cellxgene_schema_cli/cellxgene_schema/schema_definitions/schema_definition.yaml b/cellxgene_schema_cli/cellxgene_schema/schema_definitions/schema_definition.yaml
index 3af820390..02ad3c800 100644
--- a/cellxgene_schema_cli/cellxgene_schema/schema_definitions/schema_definition.yaml
+++ b/cellxgene_schema_cli/cellxgene_schema/schema_definitions/schema_definition.yaml
@@ -357,15 +357,17 @@ components:
               multi_term:
                 delimiter: ","
                 sorted: True
-        # If organism is not Human
-        error_message_suffix: >-
-          When 'organism_ontology_term_id' is NOT 'NCBITaxon:9606' (Homo sapiens),
-          self_reported_ethnicity_ontology_term_id MUST be 'na'.
-        curie_constraints:
-          ontologies:
-            - NA
-          exceptions:
-            - na
+          - # If organism is not human
+            rule: "organism_ontology_term_id != 'NCBITaxon:9606'"
+            error_message_suffix: >-
+              When 'organism_ontology_term_id' is NOT 'NCBITaxon:9606' (Homo sapiens),
+              self_reported_ethnicity_ontology_term_id MUST be 'na'.
+            type: curie
+            curie_constraints:
+              ontologies:
+                - NA
+              exceptions:
+                - na
         add_labels:
           - type: curie
             to_column: self_reported_ethnicity
@@ -402,23 +404,25 @@ components:
                     - MmusDv:0000001
               exceptions:
                 - unknown
-        # If organism is not humnan nor mouse
-        error_message_suffix: >-
-          When 'organism_ontology_term_id' is not 'NCBITaxon:10090' nor 'NCBITaxon:9606',
-          'development_stage_ontology_term_id' MUST be a descendant term id of 'UBERON:0000105'
-          excluding 'UBERON:0000071', or unknown.
-        curie_constraints:
-          ontologies:
-            - UBERON
-          allowed:
-            ancestors:
-              UBERON:
-                - UBERON:0000105
-          exceptions:
-            - unknown
-          forbidden:
-            terms:
-              - UBERON:0000071
+          - # If organism is not human nor mouse
+            rule: "organism_ontology_term_id != 'NCBITaxon:9606' & organism_ontology_term_id != 'NCBITaxon:10090'"
+            error_message_suffix: >-
+              When 'organism_ontology_term_id' is not 'NCBITaxon:10090' nor 'NCBITaxon:9606',
+              'development_stage_ontology_term_id' MUST be a descendant term id of 'UBERON:0000105'
+              excluding 'UBERON:0000071', or unknown.
+            type: curie
+            curie_constraints:
+              ontologies:
+                - UBERON
+              allowed:
+                ancestors:
+                  UBERON:
+                    - UBERON:0000105
+              exceptions:
+                - unknown
+              forbidden:
+                terms:
+                  - UBERON:0000071
         add_labels:
           - type: curie
             to_column: development_stage
diff --git a/cellxgene_schema_cli/cellxgene_schema/validate.py b/cellxgene_schema_cli/cellxgene_schema/validate.py
index 9ac706d0c..c6a6f6932 100644
--- a/cellxgene_schema_cli/cellxgene_schema/validate.py
+++ b/cellxgene_schema_cli/cellxgene_schema/validate.py
@@ -560,7 +560,7 @@ def _validate_column_feature_is_filtered(self, column: pd.Series, column_name: s
     def _validate_column(self, column: pd.Series, column_name: str, df_name: str, column_def: dict):
         """
         Given a schema definition and the column of a dataframe, verify that the column satisfies the schema.
-        If there are any errors, it adds them to self.errors
+        If there are any errors, it adds them to self.errors. Returns whether column is valid.
 
         :param pandas.Series column: Column of a dataframe to validate
         :param str column_name: Name of the column in the dataframe
@@ -568,7 +568,7 @@ def _validate_column(self, column: pd.Series, column_name: str, df_name: str, co
         :param dict column_def: schema definition for this specific column,
         e.g. schema_def["obs"]["columns"]["cell_type_ontology_term_id"]
 
-        :rtype None
+        :rtype bool: True if no errors caught, False if errors caught sand thus not valid
         """
 
         # error_original_count will count the number of error messages prior to validating the column, this
@@ -628,20 +628,25 @@ def _validate_column(self, column: pd.Series, column_name: str, df_name: str, co
             # Check for NaN values
             if column.isnull().any():
                 self.errors.append(f"Column '{column_name}' in dataframe '{df_name}' must not contain NaN values.")
-                return
+                return False
 
             if "curie_constraints" in column_def:
                 for term_str in column.drop_duplicates():
                     self._validate_curie_str(term_str, column_name, column_def["curie_constraints"])
 
+        error_total_count = len(self.errors)
+
         # Add error suffix to errors found here
         if "error_message_suffix" in column_def:
-            error_total_count = len(self.errors)
             for i in range(error_original_count, error_total_count):
                 self.errors[i] = self.errors[i] + " " + column_def["error_message_suffix"]
 
+        if error_total_count > error_original_count:
+            return False
+        return True
+
     def _validate_column_dependencies(
-        self, df: pd.DataFrame, df_name: str, column_name: str, column_def: dict
+        self, df: pd.DataFrame, df_name: str, column_name: str, dependencies: List[str]
     ) -> pd.Series:
         """
         Validates subset of columns based on dependencies, for instance development_stage_ontology_term_id has
@@ -654,13 +659,8 @@ def _validate_column_dependencies(
         :param pd.DataFrame df: pandas dataframe containing the column to be validated
         :param str df_name: the name of dataframe in the adata object, e.g. "obs"
         :param str column_name: the name of the column to be validated
-        :param dict column_def:dict of column definitions, including dependency definitions, which is a list of
-        column definitions with a "rule"
+        :param list[str] dependencies: dependency definitions, which is a list of column definitions with a "rule"
         """
-
-        all_rules = []
-        dependencies = column_def["dependencies"]
-
         for dependency_def in dependencies:
             if "rule" in dependency_def:
                 query_exp = dependency_def["rule"]
@@ -675,18 +675,8 @@ def _validate_column_dependencies(
                     f"this is likely due to missing dependent column in adata.{df_name}."
                 )
                 return pd.Series(dtype=np.float64)
-
-            all_rules.append(query_exp)
-
-            self._validate_column(column, column_name, df_name, column_def)
             self._validate_column(column, column_name, df_name, dependency_def)
 
-        # Set column with the data that's left
-        all_rules = " | ".join(all_rules)
-        column = getattr(df.query("not (" + all_rules + " )", engine="python"), column_name)
-
-        return column
-
     def _validate_list(self, list_name: str, current_list: List[str], element_type: str):
         """
         Validates the elements of a list based on the type definition. Adds errors to self.errors if any
@@ -879,13 +869,10 @@ def _validate_dataframe(self, df_name: str):
                 column_def = self._get_column_def(df_name, column_name)
                 column = getattr(df, column_name)
 
-                # First check if there are dependencies with other columns and work with a subset of the data if so
-                if "dependencies" in column_def:
-                    column = self._validate_column_dependencies(df, df_name, column_name, column_def)
+                is_column_valid = self._validate_column(column, column_name, df_name, column_def)
 
-                # If after validating dependencies there's still values in the column, validate them.
-                if len(column) > 0:
-                    self._validate_column(column, column_name, df_name, column_def)
+                if is_column_valid and "dependencies" in column_def:
+                    self._validate_column_dependencies(df, df_name, column_name, column_def["dependencies"])
 
     def _validate_uns_dict(self, uns_dict: dict) -> None:
         df = getattr_anndata(self.adata, "obs")
diff --git a/cellxgene_schema_cli/tests/test_schema_compliance.py b/cellxgene_schema_cli/tests/test_schema_compliance.py
index 11eca9541..c3bb24faa 100644
--- a/cellxgene_schema_cli/tests/test_schema_compliance.py
+++ b/cellxgene_schema_cli/tests/test_schema_compliance.py
@@ -473,9 +473,6 @@ def test_column_presence_assay(self, validator_with_adata):
         validator.validate_adata()
         assert validator.errors == [
             "ERROR: Dataframe 'obs' is missing column " "'assay_ontology_term_id'.",
-            "ERROR: Checking values with dependencies failed for "
-            "adata.obs['suspension_type'], this is likely due "
-            "to missing dependent column in adata.obs.",
         ]
 
     @pytest.mark.parametrize(
@@ -962,7 +959,7 @@ def test_self_reported_ethnicity_ontology_term_id__non_human(self, validator_wit
         validator = validator_with_adata
         error_message_suffix = validator.schema_def["components"]["obs"]["columns"][
             "self_reported_ethnicity_ontology_term_id"
-        ]["error_message_suffix"]
+        ]["dependencies"][-1]["error_message_suffix"]
         # Mouse organism ID
         validator.adata.obs.loc[validator.adata.obs.index[0], "organism_ontology_term_id"] = "NCBITaxon:10090"
         # Required to set to avoid development_stage_ontology_term_id errors
@@ -1095,11 +1092,13 @@ def test_self_reported_ethnicity_ontology_term_id__multi_term_list(self, validat
             "self_reported_ethnicity_ontology_term_id",
         ] = ["HANCESTRO:0005,HANCESTRO:0014"]
         validator.validate_adata()
-        assert validator.errors[1] == self.get_format_error_message(
-            error_message_suffix,
-            "ERROR: '['HANCESTRO:0005,HANCESTRO:0014']' in 'self_reported_ethnicity_ontology_term_id' is not "
-            "a valid ontology term value, it must be a string.",
-        )
+        assert validator.errors == [
+            self.get_format_error_message(
+                error_message_suffix,
+                "ERROR: '['HANCESTRO:0005,HANCESTRO:0014']' in 'self_reported_ethnicity_ontology_term_id' is not "
+                "a valid ontology term value, it must be a string.",
+            )
+        ]
 
     def test_organism_ontology_term_id(self, validator_with_adata):
         """

From 561178be62e5a36304fa70f179d4f52dd80b9513 Mon Sep 17 00:00:00 2001
From: nayib-jose-gloria <ngloria@chanzuckerberg.com>
Date: Mon, 2 Dec 2024 12:56:37 -0500
Subject: [PATCH 3/3] fix test + typo

---
 cellxgene_schema_cli/cellxgene_schema/validate.py    |  2 +-
 cellxgene_schema_cli/tests/test_schema_compliance.py | 12 +++++-------
 2 files changed, 6 insertions(+), 8 deletions(-)

diff --git a/cellxgene_schema_cli/cellxgene_schema/validate.py b/cellxgene_schema_cli/cellxgene_schema/validate.py
index c6a6f6932..c7e7461ae 100644
--- a/cellxgene_schema_cli/cellxgene_schema/validate.py
+++ b/cellxgene_schema_cli/cellxgene_schema/validate.py
@@ -568,7 +568,7 @@ def _validate_column(self, column: pd.Series, column_name: str, df_name: str, co
         :param dict column_def: schema definition for this specific column,
         e.g. schema_def["obs"]["columns"]["cell_type_ontology_term_id"]
 
-        :rtype bool: True if no errors caught, False if errors caught sand thus not valid
+        :rtype bool: True if no errors caught, False if errors caught and thus not valid
         """
 
         # error_original_count will count the number of error messages prior to validating the column, this
diff --git a/cellxgene_schema_cli/tests/test_schema_compliance.py b/cellxgene_schema_cli/tests/test_schema_compliance.py
index c3bb24faa..596cbe32f 100644
--- a/cellxgene_schema_cli/tests/test_schema_compliance.py
+++ b/cellxgene_schema_cli/tests/test_schema_compliance.py
@@ -1092,13 +1092,11 @@ def test_self_reported_ethnicity_ontology_term_id__multi_term_list(self, validat
             "self_reported_ethnicity_ontology_term_id",
         ] = ["HANCESTRO:0005,HANCESTRO:0014"]
         validator.validate_adata()
-        assert validator.errors == [
-            self.get_format_error_message(
-                error_message_suffix,
-                "ERROR: '['HANCESTRO:0005,HANCESTRO:0014']' in 'self_reported_ethnicity_ontology_term_id' is not "
-                "a valid ontology term value, it must be a string.",
-            )
-        ]
+        assert validator.errors[1] == self.get_format_error_message(
+            error_message_suffix,
+            "ERROR: '['HANCESTRO:0005,HANCESTRO:0014']' in 'self_reported_ethnicity_ontology_term_id' is not "
+            "a valid ontology term value, it must be a string.",
+        )
 
     def test_organism_ontology_term_id(self, validator_with_adata):
         """