From dbde2f0c30c2a407dc4ed632040fcfdfdded31cd Mon Sep 17 00:00:00 2001 From: nayib-jose-gloria Date: Mon, 7 Oct 2024 15:53:19 -0400 Subject: [PATCH 1/3] refactor suspension type validation --- .../schema_definitions/schema_definition.yaml | 319 +++++------------- .../cellxgene_schema/validate.py | 54 +-- .../tests/test_schema_compliance.py | 35 +- 3 files changed, 104 insertions(+), 304 deletions(-) diff --git a/cellxgene_schema_cli/cellxgene_schema/schema_definitions/schema_definition.yaml b/cellxgene_schema_cli/cellxgene_schema/schema_definitions/schema_definition.yaml index d14a0442e..3af820390 100644 --- a/cellxgene_schema_cli/cellxgene_schema/schema_definitions/schema_definition.yaml +++ b/cellxgene_schema_cli/cellxgene_schema/schema_definitions/schema_definition.yaml @@ -136,6 +136,94 @@ components: add_labels: - type: curie to_column: assay + dependencies: + - # If suspension_type is cell + rule: "suspension_type == 'cell'" + error_message_suffix: >- + 'suspension_type' is 'cell', but 'assay_ontology_term_id' does not match one of the corresponding assays + for 'cell' in the schema definition. + type: curie + curie_constraints: + ontologies: + - EFO + allowed: + terms: + EFO: + - EFO:0030080 + - EFO:0700004 + - EFO:0700003 + - EFO:0010010 + - EFO:0008722 + - EFO:0700011 + - EFO:0008780 + - EFO:0008796 + - EFO:0030060 + - EFO:0030002 + - EFO:0008853 + - EFO:0022490 + - EFO:0010550 + - EFO:0030028 + - EFO:0008919 + - EFO:0010184 + - EFO:0009919 + - EFO:0008953 + - EFO:0700010 + ancestors: + EFO: + - EFO:0030080 + - EFO:0008919 + - EFO:0010184 + - # If suspension_type is nucleus + rule: "suspension_type == 'nucleus'" + error_message_suffix: >- + 'suspension_type' is 'nucleus', but 'assay_ontology_term_id' does not match one of the corresponding + assays for 'nucleus' in the schema definition. + type: curie + curie_constraints: + ontologies: + - EFO + allowed: + terms: + EFO: + - EFO:0030080 + - EFO:0007045 + - EFO:0010010 + - EFO:0008720 + - EFO:0008722 + - EFO:0700011 + - EFO:0008780 + - EFO:0030060 + - EFO:0002761 + - EFO:0022490 + - EFO:0030026 + - EFO:0010550 + - EFO:0030028 + - EFO:0010184 + - EFO:0009919 + - EFO:0700010 + ancestors: + EFO: + - EFO:0030080 + - EFO:0007045 + - EFO:0002761 + - EFO:0010184 + - # If suspension_type is na + rule: "suspension_type == 'na'" + error_message_suffix: >- + 'suspension_type' is 'na', but 'assay_ontology_term_id' does not match one of the corresponding + assays for 'na' in the schema definition. + type: curie + curie_constraints: + ontologies: + - EFO + allowed: + terms: + EFO: + - EFO:0008992 + - EFO:0008994 + ancestors: + EFO: + - EFO:0008994 disease_ontology_term_id: error_message_suffix: "Only 'PATO:0000461' (normal), 'MONDO:0021178' (injury) or descendant terms thereof, or descendant terms of 'MONDO:0000001' (disease) are allowed" type: curie @@ -345,237 +433,6 @@ components: - "cell" - "nucleus" - "na" - error_message_suffix: >- - when 'assay_ontology_term_id' does not match one of the assays in the schema definition. - # if no dependencies are matched - warning_message: >- - Data contains assay(s) that are not represented in the 'suspension_type' schema definition table. Ensure you have - selected the most appropriate value for the assay(s) between 'cell', 'nucleus', and 'na'. Please contact cellxgene@chanzuckerberg.com - during submission so that the assay(s) can be added to the schema definition document. - dependencies: - - # If assay_ontology_term_id is EFO:0030080 or its descendants, 'suspension_type' MUST be 'cell' or 'nucleus' - complex_rule: - match_ancestors: - column: assay_ontology_term_id - ancestors: - EFO: - - EFO:0030080 - inclusive: True - type: categorical - error_message_suffix: >- - when 'assay_ontology_term_id' is EFO:0030080 or its descendants - enum: - - "cell" - - "nucleus" - - # If assay_ontology_term_id is EFO:0007045 or its descendants, 'suspension_type' MUST be 'nucleus' - complex_rule: - match_ancestors: - column: assay_ontology_term_id - ancestors: - EFO: - - EFO:0007045 - inclusive: True - type: categorical - error_message_suffix: >- - when 'assay_ontology_term_id' is EFO:0007045 or its descendants - enum: - - "nucleus" - - # If assay_ontology_term_id is EFO:0010184 or its descendants, 'suspension_type' MUST be 'cell' or 'nucleus' - complex_rule: - match_ancestors: - column: assay_ontology_term_id - ancestors: - EFO: - - EFO:0010184 - inclusive: True - type: categorical - error_message_suffix: >- - when 'assay_ontology_term_id' is EFO:0010184 or its descendants - enum: - - "cell" - - "nucleus" - - # If assay_ontology_term_id is EFO:0008994 or its descendants, 'suspension_type' MUST be 'na' - complex_rule: - match_ancestors: - column: assay_ontology_term_id - ancestors: - EFO: - - EFO:0008994 - inclusive: True - type: categorical - error_message_suffix: >- - when 'assay_ontology_term_id' is EFO:0008994 or its descendants - enum: - - "na" - - # If assay_ontology_term_id is EFO:0008919 or its descendants, 'suspension_type' MUST be 'cell' - complex_rule: - match_ancestors: - column: assay_ontology_term_id - ancestors: - EFO: - - EFO:0008919 - inclusive: True - type: categorical - error_message_suffix: >- - when 'assay_ontology_term_id' is EFO:0008919 or its descendants - enum: - - "cell" - - # If assay_ontology_term_id is EFO:0002761 or its descendants, 'suspension_type' MUST be 'nucleus' - complex_rule: - match_ancestors: - column: assay_ontology_term_id - ancestors: - EFO: - - EFO:0002761 - inclusive: True - type: categorical - error_message_suffix: >- - when 'assay_ontology_term_id' is EFO:0002761 or its descendants - enum: - - "nucleus" - - # If assay_ontology_term_id is EFO:0010010, 'suspension_type' MUST be 'cell' or 'nucleus' - rule: "assay_ontology_term_id == 'EFO:0010010'" - type: categorical - error_message_suffix: >- - when 'assay_ontology_term_id' is EFO:0010010 - enum: - - "cell" - - "nucleus" - - # If assay_ontology_term_id is EFO:0008720, 'suspension_type' MUST be 'nucleus' - rule: "assay_ontology_term_id == 'EFO:0008720'" - type: categorical - error_message_suffix: >- - when 'assay_ontology_term_id' is EFO:0008720 - enum: - - "nucleus" - - # If assay_ontology_term_id is EFO:0008722, 'suspension_type' MUST be 'cell' or 'nucleus' - rule: "assay_ontology_term_id == 'EFO:0008722'" - type: categorical - error_message_suffix: >- - when 'assay_ontology_term_id' is EFO:0008722 - enum: - - "cell" - - "nucleus" - - # If assay_ontology_term_id is EFO:0030002, 'suspension_type' MUST be 'cell' - rule: "assay_ontology_term_id == 'EFO:0030002'" - type: categorical - error_message_suffix: >- - when 'assay_ontology_term_id' is EFO:0030002 - enum: - - "cell" - - # If assay_ontology_term_id is EFO:0008853, 'suspension_type' MUST be 'cell' - rule: "assay_ontology_term_id == 'EFO:0008853'" - type: categorical - error_message_suffix: >- - when 'assay_ontology_term_id' is EFO:0008853 - enum: - - "cell" - - # If assay_ontology_term_id is EFO:0030026, 'suspension_type' MUST be 'nucleus' - rule: "assay_ontology_term_id == 'EFO:0030026'" - type: categorical - error_message_suffix: >- - when 'assay_ontology_term_id' is EFO:0030026 - enum: - - "nucleus" - - # If assay_ontology_term_id is EFO:0010550, 'suspension_type' MUST be 'cell' or 'nucleus' - rule: "assay_ontology_term_id == 'EFO:0010550'" - type: categorical - error_message_suffix: >- - when 'assay_ontology_term_id' is EFO:0010550 - enum: - - "cell" - - "nucleus" - - # If assay_ontology_term_id is EFO:0008796, 'suspension_type' MUST be 'cell' - rule: "assay_ontology_term_id == 'EFO:0008796'" - type: categorical - error_message_suffix: >- - when 'assay_ontology_term_id' is EFO:0008796 - enum: - - "cell" - - # If assay_ontology_term_id is EFO:0700003, 'suspension_type' MUST be 'cell' - rule: "assay_ontology_term_id == 'EFO:0700003'" - type: categorical - error_message_suffix: >- - when 'assay_ontology_term_id' is EFO:0700003 - enum: - - "cell" - - # If assay_ontology_term_id is EFO:0700004, 'suspension_type' MUST be 'cell' - rule: "assay_ontology_term_id == 'EFO:0700004'" - type: categorical - error_message_suffix: >- - when 'assay_ontology_term_id' is EFO:0700004 - enum: - - "cell" - - # If assay_ontology_term_id is EFO:0008780, 'suspension_type' MUST be 'cell' or 'nucleus' - rule: "assay_ontology_term_id == 'EFO:0008780'" - type: categorical - error_message_suffix: >- - when 'assay_ontology_term_id' is EFO:0008780 - enum: - - "cell" - - "nucleus" - - # If assay_ontology_term_id is EFO:0008953, 'suspension_type' MUST be 'cell' - rule: "assay_ontology_term_id == 'EFO:0008953'" - type: categorical - error_message_suffix: >- - when 'assay_ontology_term_id' is EFO:0008953 - enum: - - "cell" - - # If assay_ontology_term_id is EFO:0700010, 'suspension_type' MUST be 'cell' or 'nucleus' - rule: "assay_ontology_term_id == 'EFO:0700010'" - type: categorical - error_message_suffix: >- - when 'assay_ontology_term_id' is EFO:0700010 - enum: - - "cell" - - "nucleus" - - # If assay_ontology_term_id is EFO:0700011, 'suspension_type' MUST be 'cell' or 'nucleus' - rule: "assay_ontology_term_id == 'EFO:0700011'" - type: categorical - error_message_suffix: >- - when 'assay_ontology_term_id' is EFO:0700011 - enum: - - "cell" - - "nucleus" - - # If assay_ontology_term_id is EFO:0009919, 'suspension_type' MUST be 'cell' or 'nucleus' - rule: "assay_ontology_term_id == 'EFO:0009919'" - type: categorical - error_message_suffix: >- - when 'assay_ontology_term_id' is EFO:0009919 - enum: - - "cell" - - "nucleus" - - # If assay_ontology_term_id is EFO:0030060, 'suspension_type' MUST be 'cell' or 'nucleus' - rule: "assay_ontology_term_id == 'EFO:0030060'" - type: categorical - error_message_suffix: >- - when 'assay_ontology_term_id' is EFO:0030060 - enum: - - "cell" - - "nucleus" - - # If assay_ontology_term_id is EFO:0022490, 'suspension_type' MUST be 'cell' or 'nucleus' - rule: "assay_ontology_term_id == 'EFO:0022490'" - type: categorical - error_message_suffix: >- - when 'assay_ontology_term_id' is EFO:0022490 - enum: - - "cell" - - "nucleus" - - # If assay_ontology_term_id is EFO:0030028, 'suspension_type' MUST be 'cell' or 'nucleus' - rule: "assay_ontology_term_id == 'EFO:0030028'" - type: categorical - error_message_suffix: >- - when 'assay_ontology_term_id' is EFO:0030028 - enum: - - "cell" - - "nucleus" - - # If assay_ontology_term_id is EFO:0008992, 'suspension_type' MUST be 'na' - rule: "assay_ontology_term_id == 'EFO:0008992'" - type: categorical - error_message_suffix: >- - when 'assay_ontology_term_id' is EFO:0008992 - enum: - - "na" tissue_type: type: categorical enum: diff --git a/cellxgene_schema_cli/cellxgene_schema/validate.py b/cellxgene_schema_cli/cellxgene_schema/validate.py index 01a2d1401..9ac706d0c 100644 --- a/cellxgene_schema_cli/cellxgene_schema/validate.py +++ b/cellxgene_schema_cli/cellxgene_schema/validate.py @@ -641,7 +641,7 @@ def _validate_column(self, column: pd.Series, column_name: str, df_name: str, co self.errors[i] = self.errors[i] + " " + column_def["error_message_suffix"] def _validate_column_dependencies( - self, df: pd.DataFrame, df_name: str, column_name: str, dependencies: List[dict] + self, df: pd.DataFrame, df_name: str, column_name: str, column_def: dict ) -> pd.Series: """ Validates subset of columns based on dependencies, for instance development_stage_ontology_term_id has @@ -654,20 +654,15 @@ def _validate_column_dependencies( :param pd.DataFrame df: pandas dataframe containing the column to be validated :param str df_name: the name of dataframe in the adata object, e.g. "obs" :param str column_name: the name of the column to be validated - :param list dependencies: a list of dependency definitions, which is a list of column definitions with a "rule" + :param dict column_def:dict of column definitions, including dependency definitions, which is a list of + column definitions with a "rule" """ all_rules = [] + dependencies = column_def["dependencies"] for dependency_def in dependencies: - if "complex_rule" in dependency_def: - if "match_ancestors" in dependency_def["complex_rule"]: - query_fn, args = self._generate_match_ancestors_query_fn( - dependency_def["complex_rule"]["match_ancestors"] - ) - term_id, ontologies, ancestors, ancestor_inclusive = args - query_exp = f"@query_fn({term_id}, {ontologies}, {ancestors}, {ancestor_inclusive})" - elif "rule" in dependency_def: + if "rule" in dependency_def: query_exp = dependency_def["rule"] else: continue @@ -683,6 +678,7 @@ def _validate_column_dependencies( all_rules.append(query_exp) + self._validate_column(column, column_name, df_name, column_def) self._validate_column(column, column_name, df_name, dependency_def) # Set column with the data that's left @@ -691,40 +687,6 @@ def _validate_column_dependencies( return column - def _generate_match_ancestors_query_fn(self, rule_def: Dict): - """ - Generates vectorized function and args to query a pandas dataframe. Function will determine whether values from - a specified column is a descendant term to a group of specified ancestors, returning a Bool. - :param rule_def: defines arguments to pass into vectorized ancestor match validation function - :return: Tuple(function, Tuple(str, List[str], List[str])) - """ - validate_curie_ancestors_vectorized = np.vectorize(self._validate_curie_ancestors) - ancestor_map = rule_def["ancestors"] - inclusive = rule_def["inclusive"] - - # hack: pandas dataframe query doesn't support Dict inputs - ontology_keys = [] - ancestor_list = [] - for key, val in ancestor_map.items(): - ontology_keys.append(key) - ancestor_list.append(val) - - def is_ancestor_match( - term_id: str, - ontologies: List[str], - ancestors: List[str], - ancestor_inclusive: bool, - ) -> bool: - allowed_ancestors = dict(zip(ontologies, ancestors)) - return validate_curie_ancestors_vectorized(term_id, allowed_ancestors, inclusive=ancestor_inclusive) - - return is_ancestor_match, ( - rule_def["column"], - ontology_keys, - ancestor_list, - inclusive, - ) - def _validate_list(self, list_name: str, current_list: List[str], element_type: str): """ Validates the elements of a list based on the type definition. Adds errors to self.errors if any @@ -919,12 +881,10 @@ def _validate_dataframe(self, df_name: str): # First check if there are dependencies with other columns and work with a subset of the data if so if "dependencies" in column_def: - column = self._validate_column_dependencies(df, df_name, column_name, column_def["dependencies"]) + column = self._validate_column_dependencies(df, df_name, column_name, column_def) # If after validating dependencies there's still values in the column, validate them. if len(column) > 0: - if "warning_message" in column_def: - self.warnings.append(column_def["warning_message"]) self._validate_column(column, column_name, df_name, column_def) def _validate_uns_dict(self, uns_dict: dict) -> None: diff --git a/cellxgene_schema_cli/tests/test_schema_compliance.py b/cellxgene_schema_cli/tests/test_schema_compliance.py index 7ee65a6df..11eca9541 100644 --- a/cellxgene_schema_cli/tests/test_schema_compliance.py +++ b/cellxgene_schema_cli/tests/test_schema_compliance.py @@ -1371,9 +1371,9 @@ def test_suspension_type(self, validator, assay, suspension_types): obs.loc[obs.index[1], "assay_ontology_term_id"] = assay validator.validate_adata() assert validator.errors == [ - f"ERROR: Column 'suspension_type' in dataframe 'obs' contains invalid values " - f"'['{invalid_suspension_type}']'. Values must be one of {suspension_types} when " - f"'assay_ontology_term_id' is {assay}" + f"ERROR: '{assay}' in 'assay_ontology_term_id' is not an allowed term id. 'suspension_type' is " + f"'{invalid_suspension_type}', but 'assay_ontology_term_id' does not match one of the corresponding assays " + f"for '{invalid_suspension_type}' in the schema definition." ] @pytest.mark.parametrize( @@ -1404,9 +1404,9 @@ def test_suspension_type_ancestors_inclusive(self, validator_with_adata, assay, obs.loc[obs.index[1], "suspension_type"] = invalid_suspension_type validator.validate_adata() assert validator.errors == [ - f"ERROR: Column 'suspension_type' in dataframe 'obs' contains invalid values " - f"'['{invalid_suspension_type}']'. Values must be one of {suspension_types} when " - f"'assay_ontology_term_id' is {assay} or its descendants" + f"ERROR: '{assay}' in 'assay_ontology_term_id' is not an allowed term id. 'suspension_type' is " + f"'{invalid_suspension_type}', but 'assay_ontology_term_id' does not match one of the corresponding assays " + f"for '{invalid_suspension_type}' in the schema definition." ] def test_suspension_type_with_descendant_term_id_failure(self, validator_with_adata): @@ -1421,9 +1421,9 @@ def test_suspension_type_with_descendant_term_id_failure(self, validator_with_ad validator.validate_adata() assert validator.errors == [ - "ERROR: Column 'suspension_type' in dataframe 'obs' contains invalid values " - "'['nucleus']'. Values must be one of ['na'] when " - "'assay_ontology_term_id' is EFO:0008994 or its descendants" + "ERROR: 'EFO:0022615' in 'assay_ontology_term_id' is not an allowed term id. 'suspension_type' is " + "'nucleus', but 'assay_ontology_term_id' does not match one of the corresponding assays " + "for 'nucleus' in the schema definition." ] def test_suspension_type_with_descendant_term_id_success(self, validator_with_adata): @@ -1439,23 +1439,6 @@ def test_suspension_type_with_descendant_term_id_success(self, validator_with_ad validator.validate_adata() assert validator.errors == [] - def test_suspension_type_unrecognized_assay(self, validator_with_adata): - """ - suspension_id categorical with str categories. This field MUST be "cell", "nucleus", or "na". The allowed - values depend on the assay_ontology_term_id. MUST warn if the corresponding assay is not recognized. - """ - validator = validator_with_adata - obs = validator.adata.obs - obs.loc[obs.index[1], "assay_ontology_term_id"] = "EFO:0700005" - validator.validate_adata() - assert validator.errors == [] - assert validator.warnings == [ - "WARNING: Data contains assay(s) that are not represented in the 'suspension_type' schema " - "definition table. Ensure you have selected the most appropriate value for the assay(s) between " - "'cell', 'nucleus', and 'na'. Please contact cellxgene@chanzuckerberg.com " - "during submission so that the assay(s) can be added to the schema definition document." - ] - def test_categories_with_zero_values_warn(self, validator_with_adata): validator = validator_with_adata obs = validator.adata.obs From 3a21ca1c59fdad89aeb427e2c7658f6b17104d35 Mon Sep 17 00:00:00 2001 From: nayib-jose-gloria Date: Mon, 7 Oct 2024 17:02:40 -0400 Subject: [PATCH 2/3] refactor suspension type validation + tests --- .../schema_definitions/schema_definition.yaml | 56 ++++++++++--------- .../cellxgene_schema/validate.py | 41 +++++--------- .../tests/test_schema_compliance.py | 17 +++--- 3 files changed, 52 insertions(+), 62 deletions(-) diff --git a/cellxgene_schema_cli/cellxgene_schema/schema_definitions/schema_definition.yaml b/cellxgene_schema_cli/cellxgene_schema/schema_definitions/schema_definition.yaml index 3af820390..02ad3c800 100644 --- a/cellxgene_schema_cli/cellxgene_schema/schema_definitions/schema_definition.yaml +++ b/cellxgene_schema_cli/cellxgene_schema/schema_definitions/schema_definition.yaml @@ -357,15 +357,17 @@ components: multi_term: delimiter: "," sorted: True - # If organism is not Human - error_message_suffix: >- - When 'organism_ontology_term_id' is NOT 'NCBITaxon:9606' (Homo sapiens), - self_reported_ethnicity_ontology_term_id MUST be 'na'. - curie_constraints: - ontologies: - - NA - exceptions: - - na + - # If organism is not human + rule: "organism_ontology_term_id != 'NCBITaxon:9606'" + error_message_suffix: >- + When 'organism_ontology_term_id' is NOT 'NCBITaxon:9606' (Homo sapiens), + self_reported_ethnicity_ontology_term_id MUST be 'na'. + type: curie + curie_constraints: + ontologies: + - NA + exceptions: + - na add_labels: - type: curie to_column: self_reported_ethnicity @@ -402,23 +404,25 @@ components: - MmusDv:0000001 exceptions: - unknown - # If organism is not humnan nor mouse - error_message_suffix: >- - When 'organism_ontology_term_id' is not 'NCBITaxon:10090' nor 'NCBITaxon:9606', - 'development_stage_ontology_term_id' MUST be a descendant term id of 'UBERON:0000105' - excluding 'UBERON:0000071', or unknown. - curie_constraints: - ontologies: - - UBERON - allowed: - ancestors: - UBERON: - - UBERON:0000105 - exceptions: - - unknown - forbidden: - terms: - - UBERON:0000071 + - # If organism is not human nor mouse + rule: "organism_ontology_term_id != 'NCBITaxon:9606' & organism_ontology_term_id != 'NCBITaxon:10090'" + error_message_suffix: >- + When 'organism_ontology_term_id' is not 'NCBITaxon:10090' nor 'NCBITaxon:9606', + 'development_stage_ontology_term_id' MUST be a descendant term id of 'UBERON:0000105' + excluding 'UBERON:0000071', or unknown. + type: curie + curie_constraints: + ontologies: + - UBERON + allowed: + ancestors: + UBERON: + - UBERON:0000105 + exceptions: + - unknown + forbidden: + terms: + - UBERON:0000071 add_labels: - type: curie to_column: development_stage diff --git a/cellxgene_schema_cli/cellxgene_schema/validate.py b/cellxgene_schema_cli/cellxgene_schema/validate.py index 9ac706d0c..c6a6f6932 100644 --- a/cellxgene_schema_cli/cellxgene_schema/validate.py +++ b/cellxgene_schema_cli/cellxgene_schema/validate.py @@ -560,7 +560,7 @@ def _validate_column_feature_is_filtered(self, column: pd.Series, column_name: s def _validate_column(self, column: pd.Series, column_name: str, df_name: str, column_def: dict): """ Given a schema definition and the column of a dataframe, verify that the column satisfies the schema. - If there are any errors, it adds them to self.errors + If there are any errors, it adds them to self.errors. Returns whether column is valid. :param pandas.Series column: Column of a dataframe to validate :param str column_name: Name of the column in the dataframe @@ -568,7 +568,7 @@ def _validate_column(self, column: pd.Series, column_name: str, df_name: str, co :param dict column_def: schema definition for this specific column, e.g. schema_def["obs"]["columns"]["cell_type_ontology_term_id"] - :rtype None + :rtype bool: True if no errors caught, False if errors caught sand thus not valid """ # error_original_count will count the number of error messages prior to validating the column, this @@ -628,20 +628,25 @@ def _validate_column(self, column: pd.Series, column_name: str, df_name: str, co # Check for NaN values if column.isnull().any(): self.errors.append(f"Column '{column_name}' in dataframe '{df_name}' must not contain NaN values.") - return + return False if "curie_constraints" in column_def: for term_str in column.drop_duplicates(): self._validate_curie_str(term_str, column_name, column_def["curie_constraints"]) + error_total_count = len(self.errors) + # Add error suffix to errors found here if "error_message_suffix" in column_def: - error_total_count = len(self.errors) for i in range(error_original_count, error_total_count): self.errors[i] = self.errors[i] + " " + column_def["error_message_suffix"] + if error_total_count > error_original_count: + return False + return True + def _validate_column_dependencies( - self, df: pd.DataFrame, df_name: str, column_name: str, column_def: dict + self, df: pd.DataFrame, df_name: str, column_name: str, dependencies: List[str] ) -> pd.Series: """ Validates subset of columns based on dependencies, for instance development_stage_ontology_term_id has @@ -654,13 +659,8 @@ def _validate_column_dependencies( :param pd.DataFrame df: pandas dataframe containing the column to be validated :param str df_name: the name of dataframe in the adata object, e.g. "obs" :param str column_name: the name of the column to be validated - :param dict column_def:dict of column definitions, including dependency definitions, which is a list of - column definitions with a "rule" + :param list[str] dependencies: dependency definitions, which is a list of column definitions with a "rule" """ - - all_rules = [] - dependencies = column_def["dependencies"] - for dependency_def in dependencies: if "rule" in dependency_def: query_exp = dependency_def["rule"] @@ -675,18 +675,8 @@ def _validate_column_dependencies( f"this is likely due to missing dependent column in adata.{df_name}." ) return pd.Series(dtype=np.float64) - - all_rules.append(query_exp) - - self._validate_column(column, column_name, df_name, column_def) self._validate_column(column, column_name, df_name, dependency_def) - # Set column with the data that's left - all_rules = " | ".join(all_rules) - column = getattr(df.query("not (" + all_rules + " )", engine="python"), column_name) - - return column - def _validate_list(self, list_name: str, current_list: List[str], element_type: str): """ Validates the elements of a list based on the type definition. Adds errors to self.errors if any @@ -879,13 +869,10 @@ def _validate_dataframe(self, df_name: str): column_def = self._get_column_def(df_name, column_name) column = getattr(df, column_name) - # First check if there are dependencies with other columns and work with a subset of the data if so - if "dependencies" in column_def: - column = self._validate_column_dependencies(df, df_name, column_name, column_def) + is_column_valid = self._validate_column(column, column_name, df_name, column_def) - # If after validating dependencies there's still values in the column, validate them. - if len(column) > 0: - self._validate_column(column, column_name, df_name, column_def) + if is_column_valid and "dependencies" in column_def: + self._validate_column_dependencies(df, df_name, column_name, column_def["dependencies"]) def _validate_uns_dict(self, uns_dict: dict) -> None: df = getattr_anndata(self.adata, "obs") diff --git a/cellxgene_schema_cli/tests/test_schema_compliance.py b/cellxgene_schema_cli/tests/test_schema_compliance.py index 11eca9541..c3bb24faa 100644 --- a/cellxgene_schema_cli/tests/test_schema_compliance.py +++ b/cellxgene_schema_cli/tests/test_schema_compliance.py @@ -473,9 +473,6 @@ def test_column_presence_assay(self, validator_with_adata): validator.validate_adata() assert validator.errors == [ "ERROR: Dataframe 'obs' is missing column " "'assay_ontology_term_id'.", - "ERROR: Checking values with dependencies failed for " - "adata.obs['suspension_type'], this is likely due " - "to missing dependent column in adata.obs.", ] @pytest.mark.parametrize( @@ -962,7 +959,7 @@ def test_self_reported_ethnicity_ontology_term_id__non_human(self, validator_wit validator = validator_with_adata error_message_suffix = validator.schema_def["components"]["obs"]["columns"][ "self_reported_ethnicity_ontology_term_id" - ]["error_message_suffix"] + ]["dependencies"][-1]["error_message_suffix"] # Mouse organism ID validator.adata.obs.loc[validator.adata.obs.index[0], "organism_ontology_term_id"] = "NCBITaxon:10090" # Required to set to avoid development_stage_ontology_term_id errors @@ -1095,11 +1092,13 @@ def test_self_reported_ethnicity_ontology_term_id__multi_term_list(self, validat "self_reported_ethnicity_ontology_term_id", ] = ["HANCESTRO:0005,HANCESTRO:0014"] validator.validate_adata() - assert validator.errors[1] == self.get_format_error_message( - error_message_suffix, - "ERROR: '['HANCESTRO:0005,HANCESTRO:0014']' in 'self_reported_ethnicity_ontology_term_id' is not " - "a valid ontology term value, it must be a string.", - ) + assert validator.errors == [ + self.get_format_error_message( + error_message_suffix, + "ERROR: '['HANCESTRO:0005,HANCESTRO:0014']' in 'self_reported_ethnicity_ontology_term_id' is not " + "a valid ontology term value, it must be a string.", + ) + ] def test_organism_ontology_term_id(self, validator_with_adata): """ From 561178be62e5a36304fa70f179d4f52dd80b9513 Mon Sep 17 00:00:00 2001 From: nayib-jose-gloria Date: Mon, 2 Dec 2024 12:56:37 -0500 Subject: [PATCH 3/3] fix test + typo --- cellxgene_schema_cli/cellxgene_schema/validate.py | 2 +- cellxgene_schema_cli/tests/test_schema_compliance.py | 12 +++++------- 2 files changed, 6 insertions(+), 8 deletions(-) diff --git a/cellxgene_schema_cli/cellxgene_schema/validate.py b/cellxgene_schema_cli/cellxgene_schema/validate.py index c6a6f6932..c7e7461ae 100644 --- a/cellxgene_schema_cli/cellxgene_schema/validate.py +++ b/cellxgene_schema_cli/cellxgene_schema/validate.py @@ -568,7 +568,7 @@ def _validate_column(self, column: pd.Series, column_name: str, df_name: str, co :param dict column_def: schema definition for this specific column, e.g. schema_def["obs"]["columns"]["cell_type_ontology_term_id"] - :rtype bool: True if no errors caught, False if errors caught sand thus not valid + :rtype bool: True if no errors caught, False if errors caught and thus not valid """ # error_original_count will count the number of error messages prior to validating the column, this diff --git a/cellxgene_schema_cli/tests/test_schema_compliance.py b/cellxgene_schema_cli/tests/test_schema_compliance.py index c3bb24faa..596cbe32f 100644 --- a/cellxgene_schema_cli/tests/test_schema_compliance.py +++ b/cellxgene_schema_cli/tests/test_schema_compliance.py @@ -1092,13 +1092,11 @@ def test_self_reported_ethnicity_ontology_term_id__multi_term_list(self, validat "self_reported_ethnicity_ontology_term_id", ] = ["HANCESTRO:0005,HANCESTRO:0014"] validator.validate_adata() - assert validator.errors == [ - self.get_format_error_message( - error_message_suffix, - "ERROR: '['HANCESTRO:0005,HANCESTRO:0014']' in 'self_reported_ethnicity_ontology_term_id' is not " - "a valid ontology term value, it must be a string.", - ) - ] + assert validator.errors[1] == self.get_format_error_message( + error_message_suffix, + "ERROR: '['HANCESTRO:0005,HANCESTRO:0014']' in 'self_reported_ethnicity_ontology_term_id' is not " + "a valid ontology term value, it must be a string.", + ) def test_organism_ontology_term_id(self, validator_with_adata): """