Skip to content

Commit

Permalink
Merge pull request #361 from bioinfo-chru-strasbourg/improve_annotati…
Browse files Browse the repository at this point in the history
…ons_view

Add samples struct column #351
  • Loading branch information
antonylebechec authored Jan 13, 2025
2 parents ae2cd94 + ccfcecc commit 20bc553
Show file tree
Hide file tree
Showing 2 changed files with 421 additions and 8 deletions.
104 changes: 102 additions & 2 deletions howard/objects/variants.py
Original file line number Diff line number Diff line change
Expand Up @@ -12512,6 +12512,7 @@ def create_annotations_view(
fields_not_exists: bool = True,
info_prefix_column: str = None,
info_struct_column: str = None,
sample_struct_column: str = None,
drop_view: bool = False,
fields_to_rename: dict = None,
fields_forced_as_varchar: bool = False,
Expand Down Expand Up @@ -12574,6 +12575,12 @@ def create_annotations_view(
column for further processing or analysis (e.g. "INFOS" or "annotations"). If not provided (None),
the function will not generate the column. Defaults to None
:type info_struct_column: str
:param sample_struct_column: The `sample_struct_column` parameter in the `create_annotations_view`
function is used to specify the name of the column that will contain the extracted formats from
the sample columns in the view. This column will hold the structured data extracted from all
sample columns for further processing or analysis (e.g. "SAMPLES" or "genotypes"). If not provided (None),
the function will not generate the column. Defaults to None
:type sample_struct_column: str
:param drop_view: The `drop_view` parameter in the `create_annotations_view` function is a boolean
flag that determines whether to drop the existing view with the same name before creating a new
view. If set to `True`, the function will drop the existing view before creating a new view with
Expand Down Expand Up @@ -12631,6 +12638,15 @@ def create_annotations_view(
if fields_to_rename is None:
fields_to_rename = {}

# If Samples structured columns
if sample_struct_column:

# Get format
formats = list(header.formats.keys())

# Get samples
samples = list(header.samples)

log.debug(
f"Create '{view}' view (as '{view_type}') from table '{table}' with {len(fields)} fields"
)
Expand All @@ -12649,6 +12665,12 @@ def create_annotations_view(
fields_needed = ["#CHROM", "POS", "REF", "ALT"]
# list(table_describe.get("column_name"))

# Check needed fields
for field in fields_needed:
if field not in list(table_describe.get("column_name")):
msg_err = f"Field '{field}' is needed, but not in file"
raise ValueError(msg_err)

# Create fields for annotation view extracted from INFO column in table variants (with regexp_replace like in rename_info_fields), with column type from VCF header
fields_columns = []
fields_columns_annotations_struct = []
Expand All @@ -12669,7 +12691,9 @@ def create_annotations_view(
fields_columns.append(f""" "{field}" AS '{prefix}{field_to_rename}' """)

# Fields in header
elif field in header.infos:
elif field in header.infos and "INFO" in list(
table_describe.get("column_name")
):

# Field info
field_infos = header.infos.get(field, None)
Expand Down Expand Up @@ -12734,6 +12758,82 @@ def create_annotations_view(
msg_err = f"Field '{field}' is not found (in table or header): '{field}' will be set to NULL"
log.debug(msg=msg_err)

# Samples struct

# Init
samples_format_struct_clause = ""

# If samples and struct as option
if sample_struct_column and len(samples):

# Struct by samples
samples_format_struct = []

# Format info
format_infos = header.formats

# For each sample
for sample in samples:

# Struct by format
sample_format_struct = []

# For each format
for format in formats:

# Format cast and list
format_cast = ""
format_list = False
format_cast = code_type_map_to_sql.get(
format_infos.get(format).type, "VARCHAR"
)
if format_infos.get(format).num != 1:
format_list = True

# If format is a list
if format_list:
sample_format_struct.append(
f"""
"{format}":=
list_transform(
string_split(
NULLIF(
string_split("{sample}", ':')[list_position(string_split("FORMAT", ':'), '{format}')]
, ''
)
, ',')
, x -> CASE WHEN x = '.' OR x = '' THEN NULL ELSE x END
)::{format_cast}[]
"""
)
# If format is NOT a list
else:
sample_format_struct.append(
f"""
"{format}":=
COALESCE(
NULLIF(
regexp_replace(
string_split("{sample}", ':')[list_position(string_split("FORMAT", ':'), '{format}')]
, '^\\.$', ''
)
, ''
)
)::{format_cast}
"""
)

# Add struct of the sample
samples_format_struct.append(
f"""
"{sample}":= STRUCT_PACK({", ".join(sample_format_struct)})
"""
)

samples_format_struct_clause = f"""
, STRUCT_PACK({", ".join(samples_format_struct)}) AS {sample_struct_column}
"""

# Combine fields into a STRUCT
if info_struct_column and len(fields_columns_annotations_struct):
annotations_column_annotations_struct = f"""
Expand All @@ -12757,7 +12857,7 @@ def create_annotations_view(
# Query select
query_select = f"""
SELECT
{', '.join([f'"{field}"' for field in fields_needed])} {annotations_column_annotations_columns} {annotations_column_annotations_struct}
{', '.join([f'"{field}"' for field in fields_needed])} {annotations_column_annotations_columns} {annotations_column_annotations_struct} {samples_format_struct_clause}
FROM
{table}
{limit_clause}
Expand Down
Loading

0 comments on commit 20bc553

Please sign in to comment.