From 95d52dbacae64b5a1fc5a22f51bdd5b1ccc48fc3 Mon Sep 17 00:00:00 2001 From: antonylebechec Date: Fri, 8 Mar 2024 00:48:53 +0100 Subject: [PATCH] Add annotation strategy (update, append) #165, add docs #4 --- docs/help.html | 10 ++ docs/help.md | 16 +++ docs/help.param.html | 16 +-- docs/help.param.json | 42 +++++-- docs/help.param.md | 50 +++++--- howard/objects/variants.py | 25 ++-- howard/tools/annotation.py | 12 ++ howard/tools/tools.py | 24 +++- tests/data/example.nci60_1.vcf | 63 ++++++++++ ...{example.nci60.vcf => example.nci60_2.vcf} | 2 +- tests/test_variants_annotations.py | 112 ++++++++++++++++-- 11 files changed, 320 insertions(+), 52 deletions(-) create mode 100644 tests/data/example.nci60_1.vcf rename tests/data/{example.nci60.vcf => example.nci60_2.vcf} (96%) diff --git a/docs/help.html b/docs/help.html index d3dac82..3b51717 100644 --- a/docs/help.html +++ b/docs/help.html @@ -150,6 +150,16 @@

HOWARD Help

Default assembly Default: 'hg19' +
--annotations_update
+Update option for annotation (Only for Parquet annotation).
+If True, annotation fields will be removed and re-annotated.
+These options will be applied to all annotation databases. Default: False
+
+
--annotations_append
+Append option for annotation (Only for Parquet annotation).
+If True, annotation fields will be annotated only if no annotation already exists for the variant.
+These options will be applied to all annotation databases. Default: False
+
 

CALCULATION

Calculation processes variants information to generate new information, such as: identify variation type (VarType), harmonizes allele frequency (VAF) and calculate sttistics (VAF_stats), extracts Nomen (transcript, cNomen, pNomen...) from an HGVS field (e.g. snpEff, Annovar) with an optional list of personalized transcripts, generates VaRank format barcode, identify trio inheritance.

Usage examples:
   howard calculation --input=tests/data/example.full.vcf --output=/tmp/example.calculation.tsv --calculations='vartype'
   howard calculation --input=tests/data/example.ann.vcf.gz --output=/tmp/example.calculated.tsv --calculations='snpeff_hgvs,NOMEN' --hgvs_field=snpeff_hgvs --transcripts=tests/data/transcripts.tsv
   howard calculation --show_calculations

Main options

--input=<input>
diff --git a/docs/help.md b/docs/help.md
index 178d01f..660dde4 100644
--- a/docs/help.md
+++ b/docs/help.md
@@ -347,6 +347,22 @@ Default assembly
 Default: 'hg19'
 ```
 
+```
+--annotations_update
+
+Update option for annotation (Only for Parquet annotation).
+If True, annotation fields will be removed and re-annotated.
+These options will be applied to all annotation databases. Default: False
+```
+
+```
+--annotations_append
+
+Append option for annotation (Only for Parquet annotation).
+If True, annotation fields will be annotated only if no annotation already exists for the variant.
+These options will be applied to all annotation databases. Default: False
+```
+
 
 
 ## CALCULATION tool
diff --git a/docs/help.param.html b/docs/help.param.html
index 02e538e..de4b36b 100644
--- a/docs/help.param.html
+++ b/docs/help.param.html
@@ -1,6 +1,6 @@
 

HOWARD Parameters

HOWARD Parameters JSON file defined parameters to process annotations, prioritization, calculations, convertions and queries.

Table of contents

- - HOWARD Parameters
      - annotations
            - parquet
                  - annotations
                  - update
            - bcftools
                  - annotations
            - annovar
                  - annotations
                  - options
            - snpeff
                  - options
            - exomiser
                  - release
+ - HOWARD Parameters
      - annotations
            - parquet
                  - annotations
            - bcftools
                  - annotations
            - annovar
                  - annotations
                  - options
            - snpeff
                  - options
            - exomiser
                  - release
            - options
                  - update
                  - append

annotations

Annotation process using HOWARD algorithms or external tools.
For HOWARD Parquet algorithm, specify the list of database files available (formats such as Parquet, VCF, TSV, duckDB, JSON). This parameter enables users to select specific database fields and optionally rename them. Use 'INFO' keyword to select all fields within the database. If a full path is not provided, the system will automatically detect files within database folders (see Configuration doc) and assembly (see Parameter option).
For external tools, such as Annovar, snpEff and Exomiser, specify parameters such as annotation keywords (Annovar) and options (depending on the tool).
Examples: # Annotation with multiple tools in multiple formats with multiple options "annotation": { @@ -63,8 +63,7 @@ <H2 id='annotations'>annotations</H2>Annotation process using HOWARD algorithms "REVEL_rankscore": null } } -

annotations::parquet::update

Update option for Parquet annotation. If True, annotation fields will be updated if exists. If False, annotation fields will not change if it already exists. These options will be applied to all annotation databases.
Examples: # Apply update on all annotation fields for all databases. -"update": True

annotations::bcftools

Annotation process using BCFTools. Provide a list of database files and annotation fields.
Examples: # Annotation with multiple databases in multiple formats +

annotations::bcftools

Annotation process using BCFTools. Provide a list of database files and annotation fields.
Examples: # Annotation with multiple databases in multiple formats "parquet": { "bcftools": { "/path/to/database1.vcf.gz": { @@ -146,12 +145,15 @@ <H2 id='annotations'>annotations</H2>Annotation process using HOWARD algorithms " -hgvs -noShiftHgvs -spliceSiteSize 3 -lof -oicr "} } }

annotations::snpeff::options

String (as command line) of options available such as:
- filters on variants (regions filter, specific changes as intronic or downstream)
- annotation (e.g. HGVS, loss of function)
- database (e.g. only protein coding transcripts, splice sites size)
Examples: # Annotation with snpEff databases, with options to generate HGVS annotation, specify to not shift variants according to HGVS notation, define splice sites size to 3, add loss of function (LOF), Nonsense mediated decay and OICR tags. -"options": { - " -hgvs -noShiftHgvs -spliceSiteSize 3 -lof -oicr "} -}

annotations::exomiser

Annotation process using Exomiser tool and options (see [Exomiser website documentation](https://www.sanger.ac.uk/tool/exomiser/)).
Examples: # Annotation with Exomiser, using database relse '2109', transcripts source as UCSC and a list of HPO terms. +"options": " -hgvs -noShiftHgvs -spliceSiteSize 3 -lof -oicr "

annotations::exomiser

Annotation process using Exomiser tool and options (see [Exomiser website documentation](https://www.sanger.ac.uk/tool/exomiser/)).
Examples: # Annotation with Exomiser, using database relse '2109', transcripts source as UCSC and a list of HPO terms. "exomiser": { "release": "2109" "transcript_source": "refseq" "hpo": ['HP:0001156', 'HP:0001363', 'HP:0011304', 'HP:0010055'] }

annotations::exomiser::release

Release of Exomiser database. This option replace the release variable in 'application.properties' file (see 'exomiser_application_properties' option). The release will be downloaded if it is not available locally.
Examples: # Annotation with release '2109' of Exomiser database. -"release": "2109"
\ No newline at end of file +"release": "2109"

annotations::options

Options for annotations, such as annotation strategy (skip if exists, update, append)
Examples: # Annotation with Parquet databases, with update annotation strategy. +"options": { + "update": True +}

annotations::options::update

Update option for annotation (only for Parquet annotation). If True, annotation fields will be removed and re-annotated. These options will be applied to all annotation databases.
Examples: # Apply update on all annotation fields for all databases. +"update": True

annotations::options::append

Append option for annotation (only for Parquet annotation). If True, annotation fields will be annotated only if no annotation already exists for the variant. These options will be applied to all annotation databases.
Examples: # Apply append on all annotation fields for all databases. +"append": True
\ No newline at end of file diff --git a/docs/help.param.json b/docs/help.param.json index 738777d..aeea936 100644 --- a/docs/help.param.json +++ b/docs/help.param.json @@ -86,15 +86,6 @@ "}", "" ] - }, - "update": { - "__help": [ - "Update option for Parquet annotation. If True, annotation fields will be updated if exists. If False, annotation fields will not change if it already exists. These options will be applied to all annotation databases." - ], - "__examples_code": [ - "# Apply update on all annotation fields for all databases.", - "\"update\": True" - ] } }, "bcftools": { @@ -234,9 +225,7 @@ ], "__examples_code": [ "# Annotation with snpEff databases, with options to generate HGVS annotation, specify to not shift variants according to HGVS notation, define splice sites size to 3, add loss of function (LOF), Nonsense mediated decay and OICR tags.", - "\"options\": {", - " \" -hgvs -noShiftHgvs -spliceSiteSize 3 -lof -oicr \"}", - "}" + "\"options\": \" -hgvs -noShiftHgvs -spliceSiteSize 3 -lof -oicr \"" ] } }, @@ -261,6 +250,35 @@ "\"release\": \"2109\"" ] } + }, + "options": { + "__help": [ + "Options for annotations, such as annotation strategy (skip if exists, update, append)" + ], + "__examples_code": [ + "# Annotation with Parquet databases, with update annotation strategy.", + "\"options\": {", + " \"update\": True", + "}" + ], + "update": { + "__help": [ + "Update option for annotation (only for Parquet annotation). If True, annotation fields will be removed and re-annotated. These options will be applied to all annotation databases." + ], + "__examples_code": [ + "# Apply update on all annotation fields for all databases.", + "\"update\": True" + ] + }, + "append": { + "__help": [ + "Append option for annotation (only for Parquet annotation). If True, annotation fields will be annotated only if not annotation exists for the variant. These options will be applied to all annotation databases." 
+ ], + "__examples_code": [ + "# Apply append on all annotation fields for all databases.", + "\"append\": True" + ] + } } } } diff --git a/docs/help.param.md b/docs/help.param.md index b2f529b..a73b239 100644 --- a/docs/help.param.md +++ b/docs/help.param.md @@ -8,7 +8,6 @@ HOWARD Parameters JSON file defined parameters to process annotations, prioritiz - [annotations](#annotations) - [parquet](#annotationsparquet) - [annotations](#annotationsparquetannotations) - - [update](#annotationsparquetupdate) - [bcftools](#annotationsbcftools) - [annotations](#annotationsbcftoolsannotations) - [annovar](#annotationsannovar) @@ -18,6 +17,9 @@ HOWARD Parameters JSON file defined parameters to process annotations, prioritiz - [options](#annotationssnpeffoptions) - [exomiser](#annotationsexomiser) - [release](#annotationsexomiserrelease) + - [options](#annotationsoptions) + - [update](#annotationsoptionsupdate) + - [append](#annotationsoptionsappend) ## annotations @@ -112,16 +114,6 @@ Examples: ``` -#### annotations::parquet::update - -Update option for Parquet annotation. If True, annotation fields will be updated if exists. If False, annotation fields will not change if it already exists. These options will be applied to all annotation databases. - -Examples: -``` -# Apply update on all annotation fields for all databases. -"update": True -``` - ### annotations::bcftools Annotation process using BCFTools. Provide a list of database files and annotation fields. @@ -270,9 +262,7 @@ String (as command line) of options available such as: Examples: ``` # Annotation with snpEff databases, with options to generate HGVS annotation, specify to not shift variants according to HGVS notation, define splice sites size to 3, add loss of function (LOF), Nonsense mediated decay and OICR tags. 
-"options": { - " -hgvs -noShiftHgvs -spliceSiteSize 3 -lof -oicr "} -} +"options": " -hgvs -noShiftHgvs -spliceSiteSize 3 -lof -oicr " ``` ### annotations::exomiser @@ -299,3 +289,35 @@ Examples: "release": "2109" ``` +### annotations::options + +Options for annotations, such as annotation strategy (skip if exists, update, append) + +Examples: +``` +# Annotation with Parquet databases, with update annotation strategy. +"options": { + "update": True +} +``` + +#### annotations::options::update + +Update option for annotation (only for Parquet annotation). If True, annotation fields will be removed and re-annotated. These options will be applied to all annotation databases. + +Examples: +``` +# Apply update on all annotation fields for all databases. +"update": True +``` + +#### annotations::options::append + +Append option for annotation (only for Parquet annotation). If True, annotation fields will be annotated only if not annotation exists for the variant. These options will be applied to all annotation databases. + +Examples: +``` +# Apply append on all annotation fields for all databases. 
+"append": True +``` + diff --git a/howard/objects/variants.py b/howard/objects/variants.py index 6697336..00c7aea 100644 --- a/howard/objects/variants.py +++ b/howard/objects/variants.py @@ -4143,8 +4143,10 @@ def annotation_parquet(self, threads: int = None) -> None: assembly = self.get_param().get("assembly", self.get_config().get("assembly", DEFAULT_ASSEMBLY)) # Force Update Annotation - force_update_annotation = self.get_param().get("annotation", {}).get("parquet", {}).get("update", False) + force_update_annotation = self.get_param().get("annotation", {}).get("options", {}).get("update", False) log.debug(f"force_update_annotation={force_update_annotation}") + force_append_annotation = self.get_param().get("annotation", {}).get("options", {}).get("append", False) + log.debug(f"force_append_annotation={force_append_annotation}") # Data table_variants = self.get_table_variants() @@ -4301,7 +4303,9 @@ def annotation_parquet(self, threads: int = None) -> None: # To annotate #force_update_annotation = True - if annotation_field in parquet_hdr_vcf_header_infos and (force_update_annotation or (annotation_fields_new_name not in self.get_header().infos)): + #force_append_annotation = True + #if annotation_field in parquet_hdr_vcf_header_infos and (force_update_annotation or (annotation_fields_new_name not in self.get_header().infos)): + if annotation_field in parquet_hdr_vcf_header_infos and (force_update_annotation or force_append_annotation or (annotation_fields_new_name not in self.get_header().infos)): # Add field to annotation to process list annotation_fields_processed.append( @@ -4354,11 +4358,17 @@ def annotation_parquet(self, threads: int = None) -> None: self.code_type_map[parquet_hdr_vcf_header_infos_type] ) + # Append + if force_append_annotation: + query_case_when_append = f""" AND REGEXP_EXTRACT(concat(';', table_variants.INFO), ';{annotation_fields_new_name}=([^;]*)',1) IN ('','.') """ + else: + query_case_when_append = "" + # Annotation/Update query 
fields # Found in INFO column if annotation_field_column == "INFO" and "INFO" in parquet_hdr_vcf_header_columns: sql_query_annotation_update_info_sets.append(f""" - CASE WHEN REGEXP_EXTRACT(concat(';', table_parquet.INFO), ';{annotation_field}=([^;]*)',1) NOT IN ('','.') + CASE WHEN REGEXP_EXTRACT(concat(';', table_parquet.INFO), ';{annotation_field}=([^;]*)',1) NOT IN ('','.') {query_case_when_append} THEN concat('{annotation_field_sep}', '{annotation_fields_new_name}=', REGEXP_EXTRACT(concat(';', table_parquet.INFO), ';{annotation_field}=([^;]*)',1)) ELSE '' END @@ -4366,7 +4376,7 @@ def annotation_parquet(self, threads: int = None) -> None: # Found in a specific column else: sql_query_annotation_update_info_sets.append(f""" - CASE WHEN table_parquet."{annotation_field_column}" NOT IN ('','.') + CASE WHEN table_parquet."{annotation_field_column}" NOT IN ('','.') {query_case_when_append} THEN concat('{annotation_field_sep}', '{annotation_fields_new_name}=', replace(table_parquet."{annotation_field_column}", ';', ',')) ELSE '' END @@ -4389,7 +4399,8 @@ def annotation_parquet(self, threads: int = None) -> None: f"Annotation '{annotation_name}' - '{annotation_fields_new_name}' [{nb_annotation_field}] - already exists in header ({annotation_message})") # Check if ALL fields have to be annotated. 
Thus concat all INFO field - allow_annotation_full_info = True + #allow_annotation_full_info = True + allow_annotation_full_info = not force_append_annotation if parquet_type in ["regions"]: allow_annotation_full_info = False @@ -4557,8 +4568,8 @@ def annotation_parquet(self, threads: int = None) -> None: # Add update query to dict query_dict[f"{chrom}:{sql_query_interval_start}-{sql_query_interval_stop}"] = sql_query_annotation_chrom_interval_pos - # log.debug( - # "Create SQL query: " + str(sql_query_annotation_chrom_interval_pos)) + log.debug( + "Create SQL query: " + str(sql_query_annotation_chrom_interval_pos)) # Interval Start/Stop sql_query_interval_start = sql_query_interval_stop diff --git a/howard/tools/annotation.py b/howard/tools/annotation.py index 97e8995..ba5f6d0 100644 --- a/howard/tools/annotation.py +++ b/howard/tools/annotation.py @@ -47,6 +47,12 @@ def annotation(args:argparse) -> None: params = vcfdata_obj.get_param() + # Prapare annotation dict + if not params.get("annotation", None): + params["annotation"] = {} + if not params.get("annotation", {}).get("options", None): + params["annotation"]["options"] = {} + # Quick Annotation if args.annotations: annotation_file_list = [value for value in args.annotations.split(',')] @@ -56,6 +62,12 @@ def annotation(args:argparse) -> None: param_quick_annotations[annotation_file] = {"INFO": None} params["annotations"] = param_quick_annotations + if args.annotations_update: + params["annotation"]["options"]["update"] = True + + if args.annotations_append: + params["annotation"]["options"]["append"] = True + vcfdata_obj.set_param(params) # Load data from input file diff --git a/howard/tools/tools.py b/howard/tools/tools.py index ec613e4..d6d2da7 100644 --- a/howard/tools/tools.py +++ b/howard/tools/tools.py @@ -98,6 +98,8 @@ "widget": "FileSaver" } }, + + # Annotations "annotations": { "metavar": "annotations", "help": """Annotation with databases files, or with tools\n""" @@ -113,6 +115,24 @@ "widget": 
"MultiFileChooser" } }, + "annotations_update": { + "help": """Update option for annotation (Only for Parquet annotation).\n""" + """If True, annotation fields will be removed and re-annotated.\n""" + """These options will be applied to all annotation databases.""" + """default: False""", + "action": "store_true", + "default": False + }, + "annotations_append": { + "help": """Append option for annotation (Only for Parquet annotation).\n""" + """If True, annotation fields will be annotated only if not annotation exists for the variant.\n""" + """These options will be applied to all annotation databases.""" + """default: False""", + "action": "store_true", + "default": False + }, + + # Calculations "calculations": { "metavar": "operations", "help": """Calculations on genetic variants information and genotype information\n""" @@ -1162,7 +1182,9 @@ "input": True, "output": True, "annotations": True, - "assembly": False + "assembly": False, + "annotations_update": False, + "annotations_append": False } } }, diff --git a/tests/data/example.nci60_1.vcf b/tests/data/example.nci60_1.vcf new file mode 100644 index 0000000..9dd06b5 --- /dev/null +++ b/tests/data/example.nci60_1.vcf @@ -0,0 +1,63 @@ +##fileformat=VCFv4.1 +##FILTER= +##fileDate=20140624 +##source=./export.pl release 1.5 +##reference=IRC +##phasing=unknown +##INFO= +##INFO= +##INFO= +##INFO= +##FILTER= 200.0"> +##FILTER= +##FILTER= +##FILTER= +##FILTER= +##FILTER= +##FILTER= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##INFO= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##INFO= +##bcftools_viewVersion=1.15.1+htslib-1.15.1 +##bcftools_viewCommand=view tests/data/example.vcf.gz; Date=Fri Mar 10 21:25:44 2023 +#CHROM POS ID REF ALT 
QUAL FILTER INFO FORMAT sample1 sample2 sample3 sample4 +chr1 28736 . A C 100 PASS CLNSIG=pathogenic GT:AD:DP:GQ 0/1:525,204:729:99 0/1:12659,4994:17664:99 1/1:12658,4995:17663:99 1/1:401,175:576:99 +chr1 35144 . A C 100 PASS CLNSIG=non-pathogenic GT:AD:DP:GQ ./.:.:.:. 0/1:12659,4994:17664:99 0/1:12658,4995:17663:99 0/1:401,175:576:99 +chr1 69101 . A G 100 PASS DP=50 GT:AD:DP:GQ 0/1:525,204:729:99 ./.:.:.:. 0/1:12658,4995:17663:99 0/1:401,175:576:99 +chr1 768251 . A G 100 PASS . GT:AD:DP:GQ 0/1:525,204:729:99 ./.:.:.:. 0/1:12658,4995:17663:99 0/1:401,175:576:99 +chr1 768252 . A G 100 PASS . GT:AD:DP:GQ 0/1:525,204:729:99 ./.:.:.:. 0/1:12658,4995:17663:99 0/1:401,175:576:99 +chr1 768253 . A G 100 PASS nci60=0.321 GT:AD:DP:GQ 0/1:525,204:729:99 ./.:.:.:. 0/1:12658,4995:17663:99 0/1:401,175:576:99 +chr7 55249063 rs1050171 G A 5777 PASS DP=125 GT:AD:DP:GQ 0/1:525,204:729:99 0/1:12659,4994:17664:99 ./.:.:.:. 0/1:401,175:576:99 diff --git a/tests/data/example.nci60.vcf b/tests/data/example.nci60_2.vcf similarity index 96% rename from tests/data/example.nci60.vcf rename to tests/data/example.nci60_2.vcf index 6ce426f..1acff86 100644 --- a/tests/data/example.nci60.vcf +++ b/tests/data/example.nci60_2.vcf @@ -59,5 +59,5 @@ chr1 35144 . A C 100 PASS CLNSIG=non-pathogenic GT:AD:DP:GQ ./.:.:.:. 0/1:12659, chr1 69101 . A G 100 PASS DP=50 GT:AD:DP:GQ 0/1:525,204:729:99 ./.:.:.:. 0/1:12658,4995:17663:99 0/1:401,175:576:99 chr1 768251 . A G 100 PASS . GT:AD:DP:GQ 0/1:525,204:729:99 ./.:.:.:. 0/1:12658,4995:17663:99 0/1:401,175:576:99 chr1 768252 . A G 100 PASS . GT:AD:DP:GQ 0/1:525,204:729:99 ./.:.:.:. 0/1:12658,4995:17663:99 0/1:401,175:576:99 -chr1 768253 . A G 100 PASS . GT:AD:DP:GQ 0/1:525,204:729:99 ./.:.:.:. 0/1:12658,4995:17663:99 0/1:401,175:576:99 +chr1 768253 . A G 100 PASS nci60=0.321 GT:AD:DP:GQ 0/1:525,204:729:99 ./.:.:.:. 
0/1:12658,4995:17663:99 0/1:401,175:576:99 chr7 55249063 rs1050171 G A 5777 PASS DP=125;nci60=0.123 GT:AD:DP:GQ 0/1:525,204:729:99 0/1:12659,4994:17664:99 ./.:.:.:. 0/1:401,175:576:99 diff --git a/tests/test_variants_annotations.py b/tests/test_variants_annotations.py index e43f3ad..386fe72 100644 --- a/tests/test_variants_annotations.py +++ b/tests/test_variants_annotations.py @@ -27,16 +27,102 @@ + +def test_annotation_parquet_append(): + """ + The function `test_annotation_parquet_append` tests the annotation functionality for appending data + to a Parquet file in a VCF file. + """ + + with TemporaryDirectory(dir=tests_folder) as tmp_dir: + + # Init files + input_vcf = tests_data_folder + "/example.nci60_1.vcf" + annotation1 = os.path.join(tests_annotations_folder, "nci60.parquet") + output_vcf = f"{tmp_dir}/output.vcf.gz" + + # Construct param dict + param = { + 'annotation': { + 'parquet': { + 'annotations': { + annotation1: { + "nci60": "nci60" + } + }, + }, + 'options': { + 'append': False + } + } + } + param_update = { + 'annotation': { + 'parquet': { + 'annotations': { + annotation1: { + "nci60": "nci60" + } + }, + }, + 'options': { + 'append': True + } + + } + } + log.debug(f"param={param}") + log.debug(f"param_update={param_update}") + + # Create object + variants = Variants(conn=None, input=input_vcf, output=output_vcf, param=param, load=True) + + # Remove if output file exists + remove_if_exists([output_vcf]) + + # Annotation + variants.annotation() + + # Check annotation not changed + + result = variants.get_query_to_df("SELECT INFO FROM variants") + log.debug(result) + result1 = variants.get_query_to_df("SELECT 1 AS count FROM variants WHERE \"#CHROM\" = 'chr1' AND POS = 768253 AND REF = 'A' AND ALT = 'G' AND INFO LIKE '%nci60=0.321%'") + result2 = variants.get_query_to_df("SELECT 1 AS count FROM variants WHERE \"#CHROM\" = 'chr7' AND POS = 55249063 AND REF = 'G' AND ALT = 'A' AND INFO LIKE '%nci60=0.66%'") + #log.debug(result1) + assert len(result1) 
== 1 + assert len(result2) == 0 + + variants.set_param(param=param_update) + variants.annotation() + + # Check annotation changed (existing kept, one annotation added) + result = variants.get_query_to_df("SELECT INFO FROM variants") + log.debug(result) + result1 = variants.get_query_to_df("SELECT 1 AS count FROM variants WHERE \"#CHROM\" = 'chr1' AND POS = 768253 AND REF = 'A' AND ALT = 'G' AND INFO LIKE '%nci60=0.321%'") + result2 = variants.get_query_to_df("SELECT 1 AS count FROM variants WHERE \"#CHROM\" = 'chr7' AND POS = 55249063 AND REF = 'G' AND ALT = 'A' AND INFO LIKE '%nci60=0.66%'") + #log.debug(result) + assert len(result1) == 1 + assert len(result2) == 1 + + # Check if VCF is in correct format with pyVCF + variants.export_output() + try: + vcf.Reader(filename=output_vcf) + except: + assert False + + def test_annotation_parquet_update(): """ - This function tests if a field already present in a VCF file is not changed during annotation with a - Parquet file. + The function `test_annotation_parquet_update` tests the updating functionality of annotations in a + VCF file using Parquet format. 
""" with TemporaryDirectory(dir=tests_folder) as tmp_dir: # Init files - input_vcf = tests_data_folder + "/example.nci60.vcf" + input_vcf = tests_data_folder + "/example.nci60_2.vcf" annotation1 = os.path.join(tests_annotations_folder, "nci60.parquet") output_vcf = f"{tmp_dir}/output.vcf.gz" @@ -49,6 +135,8 @@ def test_annotation_parquet_update(): "nci60": "nci60" } }, + }, + 'options': { 'update': False } } @@ -61,6 +149,8 @@ def test_annotation_parquet_update(): "nci60": "nci60" } }, + }, + 'options': { 'update': True } } @@ -78,17 +168,19 @@ def test_annotation_parquet_update(): variants.annotation() # Check annotation not changed - result = variants.get_query_to_df("SELECT 1 AS count FROM variants WHERE \"#CHROM\" = 'chr7' AND POS = 55249063 AND REF = 'G' AND ALT = 'A' AND INFO LIKE '%nci60=0.123%'") - log.debug(result) - assert len(result) == 1 + result1 = variants.get_query_to_df("SELECT 1 AS count FROM variants WHERE \"#CHROM\" = 'chr1' AND POS = 768253 AND REF = 'A' AND ALT = 'G' AND INFO LIKE '%nci60=0.321%'") + result2 = variants.get_query_to_df("SELECT 1 AS count FROM variants WHERE \"#CHROM\" = 'chr7' AND POS = 55249063 AND REF = 'G' AND ALT = 'A' AND INFO LIKE '%nci60=0.123%'") + assert len(result1) == 1 + assert len(result2) == 1 variants.set_param(param=param_update) variants.annotation() - # Check annotation changed - result = variants.get_query_to_df("SELECT 1 AS count FROM variants WHERE \"#CHROM\" = 'chr7' AND POS = 55249063 AND REF = 'G' AND ALT = 'A' AND INFO LIKE '%nci60=0.66%'") - log.debug(result) - assert len(result) == 1 + # Check annotation changed (all removed, but one added) + result1 = variants.get_query_to_df("SELECT 1 AS count FROM variants WHERE \"#CHROM\" = 'chr1' AND POS = 768253 AND REF = 'A' AND ALT = 'G' AND INFO LIKE '%nci60=0.321%'") + result2 = variants.get_query_to_df("SELECT 1 AS count FROM variants WHERE \"#CHROM\" = 'chr7' AND POS = 55249063 AND REF = 'G' AND ALT = 'A' AND INFO LIKE '%nci60=0.66%'") + assert len(result1) 
== 0 + assert len(result2) == 1 # Check if VCF is in correct format with pyVCF variants.export_output()