From f1055f23d6cf810c7b1106a0cb80ea5a01aee6bb Mon Sep 17 00:00:00 2001 From: hhunterzinck Date: Fri, 28 Jan 2022 03:11:02 +0000 Subject: [PATCH 1/6] validation script for bpc to cbio mapping file --- validation/config.yaml | 50 +++++++++ validation/validate_map.py | 202 +++++++++++++++++++++++++++++++++++++ 2 files changed, 252 insertions(+) create mode 100644 validation/config.yaml create mode 100644 validation/validate_map.py diff --git a/validation/config.yaml b/validation/config.yaml new file mode 100644 index 0000000..5a66041 --- /dev/null +++ b/validation/config.yaml @@ -0,0 +1,50 @@ +dataset: + 'Cancer-level dataset': + id: syn22296816 + file: ca_dx_derived.csv + 'Patient-level dataset': + id: syn22296817 + file: pt_derived.csv + 'Regimen-Cancer level dataset': + id: syn22296818 + file: ca_drugs_derived.csv + 'Imaging-level dataset': + id: syn22296819 + file: prissmm_image_derived.csv + 'Pathology-report level dataset': + id: syn22296820 + file: prissmm_path_derived.csv + 'Med Onc Note level dataset': + id: syn22296822 + file: prissmm_md_derived.csv + 'Cancer panel test level dataset': + id: syn22296823 + file: cpt_derived.csv + 'Cancer-level index dataset': + id: syn22314486 + file: ca_dx_derived_index.csv + 'Cancer-level non-index dataset': + id: syn22314497 + file: ca_dx_derived_non_index.csv + 'Hemeonc dataset': + id: syn23561688 + file: hemonc_mapping_cbio.csv + 'PRISSMM Tumor Marker level dataset': + id: syn23561700 + file: prissmm_tm_derived.csv + 'Cancer-Directed Radiation Therapy dataset': + id: syn25931923 + file: ca_radtx_derived.csv +check: + 1: + function: _check_code_name_empty + implemented: 1 + deprecated: 0 + description: Code is empty. + request: Please remove the row or fill in the code name. + 2: + function: _check_code_name_absent + implemented: 1 + deprecated: 0 + description: Code does not exist in associated dataset. + request: Please check the code name and associated dataset. \ No newline at end of file diff --git a/validation/validate_map.py b/validation/validate_map.py new file mode 100644 index 0000000..4f68438 --- /dev/null +++ b/validation/validate_map.py @@ -0,0 +1,202 @@ +''' +Description: Validate the BPC to cBioPortal mapping file. +Author: Haley Hunter-Zinck +Date: 2022-01-27 +''' + +import argparse +import logging +import pandas as pd +import yaml +import logging +import re + +import synapseclient +from synapseclient import Synapse +from synapseclient.core.exceptions import ( + SynapseAuthenticationError, + SynapseNoCredentialsError, +) + +def _check_code_name_empty(df: pd.DataFrame, syn: Synapse, config: dict) -> list: + """Check for any code that is empty. + Args: + df: dataframe representing map + syn: Synapse object + config: configuration parameters + Returns: + dataframe with metadata on any empty codes. + """ + empty = df.loc[pd.isna(df['code'])]['code'] + return(list(empty)) + +def _check_code_name_absent(df: pd.DataFrame, syn: Synapse, config: dict) -> list: + """Check for any code that is not code name that + does not appear in its associated data file. + Args: + df: dataframe representing map + syn: Synapse object + config: configuration parameters + Returns: + dataframe with metadata on any missing codes. + """ + absent = [] + for dataset in config['dataset']: + data = pd.read_csv(syn.get(config['dataset'][dataset]['id'])['path'], low_memory=False) + code_data = data.columns + + # get codes associated with the dataset and of types derived or curated + code_map = list(df.loc[((df['dataset'] == dataset) & + ((df['data_type'].str.lower() == 'derived') | (df['data_type'].str.lower() == 'curated')))]['code']) + + # do not check wildcard code names or NA code names + code_remove = [] + for code in code_map: + if bool(re.match(r'^.+[*]$', str(code).strip())): + code_remove.append(code) + elif pd.isna(code): + code_remove.append(code) + for code in code_remove: + code_map.remove(code) + + absent.extend(list(set(code_map) - set(code_data))) + return(absent) + + +def _format_result(codes: list, config: dict, check_no: int): + """Format output for interpretable log file. + Args: + df: dataframe representing map + config: configuration parameters + check_no: check number for which to format results + Returns: + dataframe with additional metadata on any errors. + """ + formatted = pd.DataFrame() + formatted['code'] = codes + formatted['check_no'] = str(check_no) + formatted['description'] = config['check'][check_no]['description'] + formatted['action'] = config['check'][check_no]['request'] + return(formatted) + +def validate_map(synapse_id: str, syn: Synapse, config: dict, version: int) -> pd.DataFrame: + """Run all implemented checks on mapping file. + Args: + synapse_id: Synapse ID of mapping file + syn: Synapse object + config: configuration parameters + version: Version number of Synapse ID + Returns: + dataframe with additional metadata on any errors. + """ + + errors = pd.DataFrame() + df = pd.DataFrame() + if (version == 'None'): + df = pd.read_csv(syn.get(synapse_id)['path']) + else: + df = pd.read_csv(syn.get(synapse_id, version=version)['path']) + + for check_no in config['check']: + + logging.info(f'Check {check_no}...') + + if (config['check'][check_no]['implemented'] and not config['check'][check_no]['deprecated']): + function_name = config['check'][check_no]['function'] + result = eval(function_name + "(df, syn, config)") + errors = errors.append(_format_result(result, config, check_no)) + logging.info(f' Found {errors.shape[0]} error(s).') + else: + logging.info(' Check deprecated or not implemented.') + + errors.insert(0, 'issue', range(1, errors.shape[0] + 1, 1)) + + return(errors) + +def build_parser(): + parser = argparse.ArgumentParser( + description="Checks validity of BPC to cBioPortal mapping file " + ) + parser.add_argument( + "synapse_id", + metavar="SYNAPSE_ID", + type=str, + help="Synapse ID of mapping file", + ) + parser.add_argument( + "--version", + "-v", + metavar="VERSION", + type=str, + default="None", + help="Synapse entity version number " "(default: current)", + ) + parser.add_argument( + "--outfile", + "-o", + metavar="OUTFILE", + type=str, + default="output.csv", + help="Name of output file " "(default: %(default)s)", + ) + parser.add_argument( + "--log", + "-l", + metavar="LEVEL", + type=str, + choices=["debug", "info", "warning", "error"], + default="error", + help="Set logging output level " "(default: %(default)s)", + ) + return parser + + +def read_config(file: str) -> dict: + config = None + with open(file, "r") as stream: + try: + config = yaml.safe_load(stream) + except yaml.YAMLError as exc: + print(exc) + return(config) + + +def synapse_login(synapse_config=synapseclient.client.CONFIG_FILE): + """Login to Synapse + Args: + synapse_config: Path to synapse configuration file. + Defaults to ~/.synapseConfig + Returns: + Synapse connection + """ + try: + syn = synapseclient.Synapse(skip_checks=True, configPath=synapse_config) + syn.login(silent=True) + except (SynapseNoCredentialsError, SynapseAuthenticationError): + raise ValueError( + "Login error: please make sure you have correctly " + "configured your client. Instructions here: " + "https://help.synapse.org/docs/Client-Configuration.1985446156.html. " + "You can also create a Synapse Personal Access Token and set it " + "as an environmental variable: " + "SYNAPSE_AUTH_TOKEN=''" + ) + return syn + + +def main(): + + args = build_parser().parse_args() + config = read_config('config.yaml') + syn = synapse_login() + + numeric_level = getattr(logging, args.log.upper(), None) + if not isinstance(numeric_level, int): + raise ValueError("Invalid log level: %s" % args.log) + logging.basicConfig(level=numeric_level) + + res = validate_map(args.synapse_id, syn, config, args.version) + res.to_csv(args.outfile, index=False) + +if __name__ == "__main__": + main() From 8ad4a3260b9c3601f04688a2f082bb238762b8bc Mon Sep 17 00:00:00 2001 From: Haley Hunter-Zinck <17149604+hhunterzinck@users.noreply.github.com> Date: Fri, 28 Jan 2022 08:36:21 -0800 Subject: [PATCH 2/6] Update validation/validate_map.py Co-authored-by: Thomas Yu --- validation/validate_map.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/validation/validate_map.py b/validation/validate_map.py index 4f68438..44b2dfc 100644 --- a/validation/validate_map.py +++ b/validation/validate_map.py @@ -6,17 +6,17 @@ import argparse import logging -import pandas as pd -import yaml -import logging import re +import pandas as pd import synapseclient from synapseclient import Synapse from synapseclient.core.exceptions import ( SynapseAuthenticationError, SynapseNoCredentialsError, ) +import yaml + def _check_code_name_empty(df: pd.DataFrame, syn: Synapse, config: dict) -> list: """Check for any code that is empty. From 093fb354b4f89f237458e4b341a6312f1bedfbe9 Mon Sep 17 00:00:00 2001 From: hhunterzinck Date: Fri, 28 Jan 2022 16:40:03 +0000 Subject: [PATCH 3/6] add double newline between functions and black --- validation/validate_map.py | 204 ++++++++++++++++++++----------------- 1 file changed, 112 insertions(+), 92 deletions(-) diff --git a/validation/validate_map.py b/validation/validate_map.py index 44b2dfc..349a7bc 100644 --- a/validation/validate_map.py +++ b/validation/validate_map.py @@ -1,8 +1,8 @@ -''' +""" Description: Validate the BPC to cBioPortal mapping file. Author: Haley Hunter-Zinck Date: 2022-01-27 -''' +""" import argparse import logging @@ -19,68 +19,83 @@ def _check_code_name_empty(df: pd.DataFrame, syn: Synapse, config: dict) -> list: - """Check for any code that is empty. + """Check for any code that is empty. Args: df: dataframe representing map syn: Synapse object config: configuration parameters Returns: dataframe with metadata on any empty codes. - """ - empty = df.loc[pd.isna(df['code'])]['code'] - return(list(empty)) - + """ + empty = df.loc[pd.isna(df["code"])]["code"] + return list(empty) + + def _check_code_name_absent(df: pd.DataFrame, syn: Synapse, config: dict) -> list: - """Check for any code that is not code name that - does not appear in its associated data file. + """Check for any code that is not code name that + does not appear in its associated data file. Args: df: dataframe representing map syn: Synapse object config: configuration parameters Returns: dataframe with metadata on any missing codes. - """ - absent = [] - for dataset in config['dataset']: - data = pd.read_csv(syn.get(config['dataset'][dataset]['id'])['path'], low_memory=False) - code_data = data.columns - - # get codes associated with the dataset and of types derived or curated - code_map = list(df.loc[((df['dataset'] == dataset) & - ((df['data_type'].str.lower() == 'derived') | (df['data_type'].str.lower() == 'curated')))]['code']) - - # do not check wildcard code names or NA code names - code_remove = [] - for code in code_map: - if bool(re.match(r'^.+[*]$', str(code).strip())): - code_remove.append(code) - elif pd.isna(code): - code_remove.append(code) - for code in code_remove: - code_map.remove(code) - - absent.extend(list(set(code_map) - set(code_data))) - return(absent) + """ + absent = [] + for dataset in config["dataset"]: + data = pd.read_csv( + syn.get(config["dataset"][dataset]["id"])["path"], low_memory=False + ) + code_data = data.columns + + # get codes associated with the dataset and of types derived or curated + code_map = list( + df.loc[ + ( + (df["dataset"] == dataset) + & ( + (df["data_type"].str.lower() == "derived") + | (df["data_type"].str.lower() == "curated") + ) + ) + ]["code"] + ) + + # do not check wildcard code names or NA code names + code_remove = [] + for code in code_map: + if bool(re.match(r"^.+[*]$", str(code).strip())): + code_remove.append(code) + elif pd.isna(code): + code_remove.append(code) + for code in code_remove: + code_map.remove(code) + + absent.extend(list(set(code_map) - set(code_data))) + return absent def _format_result(codes: list, config: dict, check_no: int): - """Format output for interpretable log file. + """Format output for interpretable log file. Args: df: dataframe representing map config: configuration parameters check_no: check number for which to format results Returns: dataframe with additional metadata on any errors. - """ - formatted = pd.DataFrame() - formatted['code'] = codes - formatted['check_no'] = str(check_no) - formatted['description'] = config['check'][check_no]['description'] - formatted['action'] = config['check'][check_no]['request'] - return(formatted) - -def validate_map(synapse_id: str, syn: Synapse, config: dict, version: int) -> pd.DataFrame: - """Run all implemented checks on mapping file. + """ + formatted = pd.DataFrame() + formatted["code"] = codes + formatted["check_no"] = str(check_no) + formatted["description"] = config["check"][check_no]["description"] + formatted["action"] = config["check"][check_no]["request"] + return formatted + + +def validate_map( + synapse_id: str, syn: Synapse, config: dict, version: int +) -> pd.DataFrame: + """Run all implemented checks on mapping file. Args: synapse_id: Synapse ID of mapping file syn: Synapse object @@ -88,58 +103,62 @@ def validate_map(synapse_id: str, syn: Synapse, config: dict, version: int) -> p version: Version number of Synapse ID Returns: dataframe with additional metadata on any errors. - """ - - errors = pd.DataFrame() - df = pd.DataFrame() - if (version == 'None'): - df = pd.read_csv(syn.get(synapse_id)['path']) - else: - df = pd.read_csv(syn.get(synapse_id, version=version)['path']) - - for check_no in config['check']: - - logging.info(f'Check {check_no}...') - - if (config['check'][check_no]['implemented'] and not config['check'][check_no]['deprecated']): - function_name = config['check'][check_no]['function'] - result = eval(function_name + "(df, syn, config)") - errors = errors.append(_format_result(result, config, check_no)) - logging.info(f' Found {errors.shape[0]} error(s).') + """ + + errors = pd.DataFrame() + df = pd.DataFrame() + if version == "None": + df = pd.read_csv(syn.get(synapse_id)["path"]) else: - logging.info(' Check deprecated or not implemented.') - - errors.insert(0, 'issue', range(1, errors.shape[0] + 1, 1)) - - return(errors) + df = pd.read_csv(syn.get(synapse_id, version=version)["path"]) + + for check_no in config["check"]: + + logging.info(f"Check {check_no}...") + + if ( + config["check"][check_no]["implemented"] + and not config["check"][check_no]["deprecated"] + ): + function_name = config["check"][check_no]["function"] + result = eval(function_name + "(df, syn, config)") + errors = errors.append(_format_result(result, config, check_no)) + logging.info(f" Found {errors.shape[0]} error(s).") + else: + logging.info(" Check deprecated or not implemented.") + + errors.insert(0, "issue", range(1, errors.shape[0] + 1, 1)) + + return errors + def build_parser(): - parser = argparse.ArgumentParser( - description="Checks validity of BPC to cBioPortal mapping file " - ) - parser.add_argument( - "synapse_id", - metavar="SYNAPSE_ID", - type=str, - help="Synapse ID of mapping file", - ) - parser.add_argument( + parser = argparse.ArgumentParser( + description="Checks validity of BPC to cBioPortal mapping file " + ) + parser.add_argument( + "synapse_id", + metavar="SYNAPSE_ID", + type=str, + help="Synapse ID of mapping file", + ) + parser.add_argument( "--version", "-v", metavar="VERSION", type=str, default="None", help="Synapse entity version number " "(default: current)", - ) - parser.add_argument( + ) + parser.add_argument( "--outfile", "-o", metavar="OUTFILE", type=str, default="output.csv", help="Name of output file " "(default: %(default)s)", - ) - parser.add_argument( + ) + parser.add_argument( "--log", "-l", metavar="LEVEL", @@ -148,17 +167,17 @@ def build_parser(): default="error", help="Set logging output level " "(default: %(default)s)", ) - return parser - - + return parser + + def read_config(file: str) -> dict: - config = None - with open(file, "r") as stream: - try: - config = yaml.safe_load(stream) - except yaml.YAMLError as exc: - print(exc) - return(config) + config = None + with open(file, "r") as stream: + try: + config = yaml.safe_load(stream) + except yaml.YAMLError as exc: + print(exc) + return config def synapse_login(synapse_config=synapseclient.client.CONFIG_FILE): @@ -183,20 +202,21 @@ def synapse_login(synapse_config=synapseclient.client.CONFIG_FILE): ) return syn - + def main(): args = build_parser().parse_args() - config = read_config('config.yaml') + config = read_config("config.yaml") syn = synapse_login() numeric_level = getattr(logging, args.log.upper(), None) if not isinstance(numeric_level, int): raise ValueError("Invalid log level: %s" % args.log) logging.basicConfig(level=numeric_level) - + res = validate_map(args.synapse_id, syn, config, args.version) res.to_csv(args.outfile, index=False) + if __name__ == "__main__": main() From 759a21db0f0464e43f9d0c8b3523feb7a6b9f596 Mon Sep 17 00:00:00 2001 From: hhunterzinck Date: Sat, 29 Jan 2022 01:23:08 +0000 Subject: [PATCH 4/6] add function map and remove eval call --- validation/validate_map.py | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/validation/validate_map.py b/validation/validate_map.py index 349a7bc..150a98f 100644 --- a/validation/validate_map.py +++ b/validation/validate_map.py @@ -75,7 +75,7 @@ def _check_code_name_absent(df: pd.DataFrame, syn: Synapse, config: dict) -> lis return absent -def _format_result(codes: list, config: dict, check_no: int): +def _format_result(codes: list, config: dict, check_no: int) -> pd.DataFrame: """Format output for interpretable log file. Args: df: dataframe representing map @@ -92,6 +92,14 @@ def _format_result(codes: list, config: dict, check_no: int): return formatted +def _create_function_map() -> dict: + fxns = { + "_check_code_name_absent": _check_code_name_absent, + "_check_code_name_empty": _check_code_name_empty + } + return fxns + + def validate_map( synapse_id: str, syn: Synapse, config: dict, version: int ) -> pd.DataFrame: @@ -107,6 +115,7 @@ def validate_map( errors = pd.DataFrame() df = pd.DataFrame() + fxns = _create_function_map() if version == "None": df = pd.read_csv(syn.get(synapse_id)["path"]) else: @@ -120,8 +129,8 @@ def validate_map( config["check"][check_no]["implemented"] and not config["check"][check_no]["deprecated"] ): - function_name = config["check"][check_no]["function"] - result = eval(function_name + "(df, syn, config)") + fxn_name = config["check"][check_no]["function"] + result = fxns[fxn_name](df, syn, config) errors = errors.append(_format_result(result, config, check_no)) logging.info(f" Found {errors.shape[0]} error(s).") else: @@ -161,7 +170,6 @@ def build_parser(): parser.add_argument( "--log", "-l", - metavar="LEVEL", type=str, choices=["debug", "info", "warning", "error"], default="error", From 1730e6bddb036e7b77652dcd9ff10bcee7729c26 Mon Sep 17 00:00:00 2001 From: hhunterzinck Date: Sat, 29 Jan 2022 01:55:28 +0000 Subject: [PATCH 5/6] add test to check function map and config file list --- validation/config.yaml | 4 ++-- validation/test_validate.py | 18 ++++++++++++++++++ validation/validate_map.py | 24 +++++++++++++----------- 3 files changed, 33 insertions(+), 13 deletions(-) create mode 100644 validation/test_validate.py diff --git a/validation/config.yaml b/validation/config.yaml index 5a66041..14969fb 100644 --- a/validation/config.yaml +++ b/validation/config.yaml @@ -37,13 +37,13 @@ dataset: file: ca_radtx_derived.csv check: 1: - function: _check_code_name_empty + function: check_code_name_empty implemented: 1 deprecated: 0 description: Code is empty. request: Please remove the row or fill in the code name. 2: - function: _check_code_name_absent + function: check_code_name_absent implemented: 1 deprecated: 0 description: Code does not exist in associated dataset. diff --git a/validation/test_validate.py b/validation/test_validate.py new file mode 100644 index 0000000..531fb95 --- /dev/null +++ b/validation/test_validate.py @@ -0,0 +1,18 @@ +"""Test validate map""" +import yaml + +from validate_map import * + + +def test__function_map(): + """Test that all functions referenced in the config file are listed in the function map.""" + config = read_config("config.yaml") + fxn_map = create_function_map() + + fxn_config = [] + for check in config["check"]: + fxn_config.append(config["check"][check]["function"]) + + config_not_map = set(fxn_config) - set(fxn_map.keys()) + + assert len(config_not_map) == 0 diff --git a/validation/validate_map.py b/validation/validate_map.py index 150a98f..f6068a2 100644 --- a/validation/validate_map.py +++ b/validation/validate_map.py @@ -18,7 +18,7 @@ import yaml -def _check_code_name_empty(df: pd.DataFrame, syn: Synapse, config: dict) -> list: +def check_code_name_empty(df: pd.DataFrame, syn: Synapse, config: dict) -> list: """Check for any code that is empty. Args: df: dataframe representing map @@ -31,7 +31,7 @@ def _check_code_name_empty(df: pd.DataFrame, syn: Synapse, config: dict) -> list return list(empty) -def _check_code_name_absent(df: pd.DataFrame, syn: Synapse, config: dict) -> list: +def check_code_name_absent(df: pd.DataFrame, syn: Synapse, config: dict) -> list: """Check for any code that is not code name that does not appear in its associated data file. Args: @@ -75,7 +75,7 @@ def _check_code_name_absent(df: pd.DataFrame, syn: Synapse, config: dict) -> lis return absent -def _format_result(codes: list, config: dict, check_no: int) -> pd.DataFrame: +def format_result(codes: list, config: dict, check_no: int) -> pd.DataFrame: """Format output for interpretable log file. Args: df: dataframe representing map @@ -92,12 +92,12 @@ def _format_result(codes: list, config: dict, check_no: int) -> pd.DataFrame: return formatted -def _create_function_map() -> dict: - fxns = { - "_check_code_name_absent": _check_code_name_absent, - "_check_code_name_empty": _check_code_name_empty - } - return fxns +def create_function_map() -> dict: + fxns = { + "check_code_name_absent": check_code_name_absent, + "check_code_name_empty": check_code_name_empty, + } + return fxns def validate_map( @@ -115,7 +115,7 @@ def validate_map( errors = pd.DataFrame() df = pd.DataFrame() - fxns = _create_function_map() + fxns = create_function_map() if version == "None": df = pd.read_csv(syn.get(synapse_id)["path"]) else: @@ -131,7 +131,7 @@ def validate_map( ): fxn_name = config["check"][check_no]["function"] result = fxns[fxn_name](df, syn, config) - errors = errors.append(_format_result(result, config, check_no)) + errors = errors.append(format_result(result, config, check_no)) logging.info(f" Found {errors.shape[0]} error(s).") else: logging.info(" Check deprecated or not implemented.") @@ -225,6 +225,8 @@ def main(): res = validate_map(args.synapse_id, syn, config, args.version) res.to_csv(args.outfile, index=False) + logging.info(f"Output written to '{args.outfile}'") + if __name__ == "__main__": main() From d6fc1340d39d8e35e819d8dd2963ed3548fa9f51 Mon Sep 17 00:00:00 2001 From: Haley Hunter-Zinck Date: Fri, 28 Jan 2022 18:37:31 -0800 Subject: [PATCH 6/6] change directory name --- {validation => scripts}/config.yaml | 0 {validation => scripts}/test_validate.py | 0 {validation => scripts}/validate_map.py | 0 3 files changed, 0 insertions(+), 0 deletions(-) rename {validation => scripts}/config.yaml (100%) rename {validation => scripts}/test_validate.py (100%) rename {validation => scripts}/validate_map.py (100%) diff --git a/validation/config.yaml b/scripts/config.yaml similarity index 100% rename from validation/config.yaml rename to scripts/config.yaml diff --git a/validation/test_validate.py b/scripts/test_validate.py similarity index 100% rename from validation/test_validate.py rename to scripts/test_validate.py diff --git a/validation/validate_map.py b/scripts/validate_map.py similarity index 100% rename from validation/validate_map.py rename to scripts/validate_map.py