From ccfcecc0128af428353a3dead51752278eeb6e61 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Antony=20Le=20B=C3=A9chec?= Date: Mon, 13 Jan 2025 22:05:39 +0100 Subject: [PATCH] Add samples struct column #351 --- howard/objects/variants.py | 104 +++++++- tests/test_variants_annotations_view.py | 325 +++++++++++++++++++++++- 2 files changed, 421 insertions(+), 8 deletions(-) diff --git a/howard/objects/variants.py b/howard/objects/variants.py index a701cc4..f710720 100644 --- a/howard/objects/variants.py +++ b/howard/objects/variants.py @@ -12512,6 +12512,7 @@ def create_annotations_view( fields_not_exists: bool = True, info_prefix_column: str = None, info_struct_column: str = None, + sample_struct_column: str = None, drop_view: bool = False, fields_to_rename: dict = None, fields_forced_as_varchar: bool = False, @@ -12574,6 +12575,12 @@ def create_annotations_view( column for further processing or analysis (e.g. "INFOS" or "annotations"). If not provided (None), the function will not genereate the column. Defaults to None :type info_struct_column: str + :param sample_struct_column: The `sample_struct_column` parameter in the `create_annotations_view` + function is used to specify the name of the column that will contain the extracted formats from + the samples columns in the view. This column will hold the structured data extracted from all + samples columns for further processing or analysis (e.g. "SAMPLES" or "genotypes"). If not provided (None), + the function will not generate the column. Defaults to None + :type sample_struct_column: str :param drop_view: The `drop_view` parameter in the `create_annotations_view` function is a boolean flag that determines whether to drop the existing view with the same name before creating a new view. 
If set to `True`, the function will drop the existing view before creating a new view with @@ -12631,6 +12638,15 @@ def create_annotations_view( if fields_to_rename is None: fields_to_rename = {} + # If Samples structured columns + if sample_struct_column: + + # Get format + formats = list(header.formats.keys()) + + # Get samples + samples = list(header.samples) + log.debug( f"Create '{view}' view (as '{view_type}') from table '{table}' with {len(fields)} fields" ) @@ -12649,6 +12665,12 @@ def create_annotations_view( fields_needed = ["#CHROM", "POS", "REF", "ALT"] # list(table_describe.get("column_name")) + # Check needed fields + for field in fields_needed: + if field not in list(table_describe.get("column_name")): + msg_err = f"Field '{field}' is needed, but not in file" + raise ValueError(msg_err) + # Create fields for annotation view extracted from INFO column in table variants (with regexp_replace like in rename_info_fields), with column type from VCF header fields_columns = [] fields_columns_annotations_struct = [] @@ -12669,7 +12691,9 @@ def create_annotations_view( fields_columns.append(f""" "{field}" AS '{prefix}{field_to_rename}' """) # Fields in header - elif field in header.infos: + elif field in header.infos and "INFO" in list( + table_describe.get("column_name") + ): # Field info field_infos = header.infos.get(field, None) @@ -12734,6 +12758,82 @@ def create_annotations_view( msg_err = f"Field '{field}' is not found (in table or header): '{field}' will be set to NULL" log.debug(msg=msg_err) + # Samples struct + + # Init + samples_format_struct_clause = "" + + # If samples and struct as option + if sample_struct_column and len(samples): + + # Struct by samples + samples_format_struct = [] + + # Format info + format_infos = header.formats + + # For each sample + for sample in samples: + + # Struct by format + sample_format_struct = [] + + # For each format + for format in formats: + + # Format cast and list + format_cast = "" + format_list = False + 
format_cast = code_type_map_to_sql.get( + format_infos.get(format).type, "VARCHAR" + ) + if format_infos.get(format).num != 1: + format_list = True + + # If format is a list + if format_list: + sample_format_struct.append( + f""" + "{format}":= + list_transform( + string_split( + NULLIF( + string_split("{sample}", ':')[list_position(string_split("FORMAT", ':'), '{format}')] + , '' + ) + , ',') + , x -> CASE WHEN x = '.' OR x = '' THEN NULL ELSE x END + )::{format_cast}[] + """ + ) + # If format is NOT a list + else: + sample_format_struct.append( + f""" + "{format}":= + COALESCE( + NULLIF( + regexp_replace( + string_split("{sample}", ':')[list_position(string_split("FORMAT", ':'), '{format}')] + , '^\\.$', '' + ) + , '' + ) + )::{format_cast} + """ + ) + + # Add struct of the sample + samples_format_struct.append( + f""" + "{sample}":= STRUCT_PACK({", ".join(sample_format_struct)}) + """ + ) + + samples_format_struct_clause = f""" + , STRUCT_PACK({", ".join(samples_format_struct)}) AS {sample_struct_column} + """ + # Combine fields into a STRUCT if info_struct_column and len(fields_columns_annotations_struct): annotations_column_annotations_struct = f""" @@ -12757,7 +12857,7 @@ def create_annotations_view( # Query select query_select = f""" SELECT - {', '.join([f'"{field}"' for field in fields_needed])} {annotations_column_annotations_columns} {annotations_column_annotations_struct} + {', '.join([f'"{field}"' for field in fields_needed])} {annotations_column_annotations_columns} {annotations_column_annotations_struct} {samples_format_struct_clause} FROM {table} {limit_clause} diff --git a/tests/test_variants_annotations_view.py b/tests/test_variants_annotations_view.py index a6f2120..77405aa 100644 --- a/tests/test_variants_annotations_view.py +++ b/tests/test_variants_annotations_view.py @@ -6,21 +6,256 @@ pytest tests/ Coverage: -coverage run -m pytest tests/test_variants_transcripts.py -x -vv --log-cli-level=DEBUG --capture=tee-sys +coverage run -m pytest 
tests/test_variants_annotations_view.py -x -vv --log-cli-level=DEBUG --capture=tee-sys coverage report --include=howard/* -m """ -import logging as log +# import logging as log from tempfile import TemporaryDirectory -import pytest -import vcf -import os +import pytest # type: ignore + from howard.functions.commons import remove_if_exists, get_file_format from howard.objects.variants import Variants from test_needed import tests_folder, tests_config, tests_data_folder +def test_create_annotations_view_chrom_pos_ref_alt(): + """ """ + + with TemporaryDirectory(dir=tests_folder) as tmp_dir: + + # Init files + input_vcf = tests_data_folder + "/example.chrom.pos.ref.alt.vcf" + output_vcf = f"{tmp_dir}/output.vcf.gz" + + # config dict + config = tests_config + # config["access"] = "RO" + + # Construct param dict + param = {} + + # Create object + variants = Variants( + conn=None, + input=input_vcf, + output=output_vcf, + config=config, + param=param, + load=True, + ) + + annotations_view_name = "annotations_view_test" + + # TEST 0 + ########## + + # Create annotations view + annotations_view_name_result = variants.create_annotations_view( + view=annotations_view_name, table="variants", fields=None + ) + + # Check annotations view name + assert annotations_view_name == annotations_view_name_result + + # Check annotations_view content + annotations_view_select = variants.get_query_to_df( + query=f""" + SELECT * + FROM {annotations_view_name} + LIMIT 100 + """ + ) + # Check shape + assert annotations_view_select.shape == (7, 4) + assert annotations_view_select.columns.to_list() == [ + "#CHROM", + "POS", + "REF", + "ALT", + ] + + # TEST 1 + ########## + # Generates columns from fields + # Not dropped! 
Same as before + + # Create annotations view + annotations_view_name_result = variants.create_annotations_view( + view=annotations_view_name, + table="variants", + fields=None, + info_prefix_column="", + ) + + # Check annotations view name + assert annotations_view_name == annotations_view_name_result + + # Check annotations_view content + annotations_view_select = variants.get_query_to_df( + query=f""" + SELECT * + FROM {annotations_view_name} + LIMIT 100 + """ + ) + # Check shape + assert annotations_view_select.shape == (7, 4) + assert annotations_view_select.columns.to_list() == [ + "#CHROM", + "POS", + "REF", + "ALT", + ] + + # TEST 2 + ########## + # Add specific fields + # Without drop + + # Create annotations view + fields = [ + "CLNSIG", + "SIFT", + ] + annotations_view_name_result = variants.create_annotations_view( + view=annotations_view_name, + table="variants", + fields=fields, + info_prefix_column="", + ) + + # Check annotations view name + assert annotations_view_name == annotations_view_name_result + + # Check annotations_view content + annotations_view_select = variants.get_query_to_df( + query=f""" + SELECT * + FROM {annotations_view_name} + LIMIT 100 + """ + ) + # Check shape + assert annotations_view_select.shape == (7, 4) + assert annotations_view_select.columns.to_list() == [ + "#CHROM", + "POS", + "REF", + "ALT", + ] + + # TEST 3 + ########## + # Add specific fields + # With drop + + # Create annotations view + fields = [ + "CLNSIG", + "SIFT", + ] + annotations_view_name_result = variants.create_annotations_view( + view=annotations_view_name, + table="variants", + fields=fields, + info_prefix_column="", + drop_view=True, + ) + + # Check annotations view name + assert annotations_view_name == annotations_view_name_result + + # Check annotations_view content + annotations_view_select = variants.get_query_to_df( + query=f""" + SELECT * + FROM {annotations_view_name} + LIMIT 100 + """ + ) + # Check shape + assert annotations_view_select.shape 
== (7, 6) + assert annotations_view_select.columns.to_list() == [ + "#CHROM", + "POS", + "REF", + "ALT", + "CLNSIG", + "SIFT", + ] + + # TEST 4 + ########## + # Add specific fields + # Add specific fields needed + # With drop + + # Create annotations view + fields = [ + "CLNSIG", + "SIFT", + ] + fields_needed = ["#CHROM", "POS", "ID", "REF", "ALT", "FILTER"] + + with pytest.raises(ValueError) as e: + annotations_view_name_result = variants.create_annotations_view( + view=annotations_view_name, + table="variants", + fields=fields, + info_prefix_column="", + fields_needed=fields_needed, + drop_view=True, + ) + assert str(e.value) == f"Field 'ID' is needed, but not in file" + + # TEST 5 + ########## + # Add INFO struct column + + # Create annotations view + fields = ["CLNSIG", "SIFT", "FIELD_THAT_NOT_EXISTS"] + fields_needed = ["#CHROM", "POS", "ID", "REF", "ALT", "FILTER"] + info_struct_column = "INFOS" + sample_struct_column = "SAMPLES" + annotations_view_name_result = variants.create_annotations_view( + view=annotations_view_name, + table="variants", + fields=fields, + info_prefix_column="INFOS_", + info_struct_column=info_struct_column, + sample_struct_column=sample_struct_column, + # fields_needed=fields_needed, + # fields_needed=None, + fields_needed_all=True, + fields_not_exists=False, + detect_type_list=True, + drop_view=True, + ) + + # Check annotations view name + assert annotations_view_name == annotations_view_name_result + + # Check annotations_view content + annotations_view_select = variants.get_query_to_df( + query=f""" + SELECT * + FROM {annotations_view_name} + LIMIT 100 + """ + ) + # log.debug(annotations_view_select) + # Check shape + assert annotations_view_select.shape == (7, 4) + assert annotations_view_select.columns.to_list() == [ + "#CHROM", + "POS", + "REF", + "ALT", + ] + + def test_create_annotations_view(): """ """ @@ -477,7 +712,6 @@ def test_create_annotations_view(): ) # Check shape assert annotations_view_select.shape == (7, 8) - 
log.debug(annotations_view_select.columns.to_list()) assert annotations_view_select.columns.to_list() == [ "#CHROM", "POS", @@ -581,3 +815,82 @@ def test_create_annotations_view(): "FILTER", "INFOS", ] + + # TEST 12 + ########## + # Add INFO struct column + + # Create annotations view + fields = ["CLNSIG", "SIFT", "FIELD_THAT_NOT_EXISTS"] + fields_needed = ["#CHROM", "POS", "ID", "REF", "ALT", "FILTER"] + info_struct_column = "INFOS" + sample_struct_column = "SAMPLES" + annotations_view_name_result = variants.create_annotations_view( + view=annotations_view_name, + table="variants", + fields=fields, + info_struct_column=info_struct_column, + sample_struct_column=sample_struct_column, + # fields_needed=fields_needed, + fields_needed=None, + fields_needed_all=True, + fields_not_exists=False, + detect_type_list=True, + drop_view=True, + ) + + # Check annotations view name + assert annotations_view_name == annotations_view_name_result + + # Check annotations_view content + annotations_view_select = variants.get_query_to_df( + query=f""" + SELECT * + FROM {annotations_view_name} + LIMIT 100 + """ + ) + # log.debug(annotations_view_select) + # Check shape + assert annotations_view_select.shape == (7, 15) + assert annotations_view_select.columns.to_list() == [ + "#CHROM", + "POS", + "ID", + "REF", + "ALT", + "QUAL", + "FILTER", + "INFO", + "FORMAT", + "sample1", + "sample2", + "sample3", + "sample4", + "INFOS", + "SAMPLES", + ] + + # Check struct + annotations_view_select = variants.get_query_to_df( + query=f""" + SELECT "#CHROM", POS, REF, ALT, FORMAT, SAMPLES.sample1, SAMPLES.sample1.AD[2]/(SAMPLES.sample1.AD[1]+SAMPLES.sample1.AD[2]) AS 'sample1_VAF' + FROM {annotations_view_name} + WHERE SAMPLES.sample1.GQ > 90 + AND SAMPLES.sample1.DP > 300 + AND sample1_VAF >= 0 + LIMIT 100 + """ + ) + # log.debug(annotations_view_select.to_string()) + # Check shape + assert annotations_view_select.shape == (6, 7) + assert annotations_view_select.columns.to_list() == [ + "#CHROM", + 
"POS", + "REF", + "ALT", + "FORMAT", + "sample1", + "sample1_VAF", + ]