Skip to content

Commit

Permalink
fix barcode calculation and other functions with sample to prevent sp…
Browse files Browse the repository at this point in the history
…ecial characters in name #321, docs #4
  • Loading branch information
antonylebechec committed Dec 9, 2024
1 parent 555435e commit 39886a1
Show file tree
Hide file tree
Showing 16 changed files with 27,407 additions and 27,276 deletions.
Binary file modified README.pdf
Binary file not shown.
Binary file modified RELEASE_NOTES.pdf
Binary file not shown.
Binary file modified docs/docs.pdf
Binary file not shown.
Binary file modified docs/help.configuration.calculation.pdf
Binary file not shown.
Binary file modified docs/help.configuration.pdf
Binary file not shown.
Binary file modified docs/help.configuration.prioritization.pdf
Binary file not shown.
Binary file modified docs/help.parameters.databases.pdf
Binary file not shown.
Binary file modified docs/help.parameters.pdf
Binary file not shown.
Binary file modified docs/help.pdf
Binary file not shown.
54,545 changes: 27,277 additions & 27,268 deletions docs/pdoc/howard/objects/variants.html

Large diffs are not rendered by default.

Binary file modified docs/tips.pdf
Binary file not shown.
Binary file modified docs/user_guide.pdf
Binary file not shown.
19 changes: 11 additions & 8 deletions howard/objects/variants.py
Original file line number Diff line number Diff line change
Expand Up @@ -2514,7 +2514,9 @@ def export_variant_vcf(
if not list_samples:
list_samples = self.get_header_sample_list()
if list_samples:
samples_fields = " , FORMAT , " + " , ".join(list_samples)
samples_fields = " , FORMAT , " + " , ".join(
[f""" "{sample}" """ for sample in list_samples]
)
else:
samples_fields = ""
log.debug(f"samples_fields: {samples_fields}")
Expand Down Expand Up @@ -9135,7 +9137,7 @@ def calculation_find_by_pipeline(self, tag: str = "findbypipeline") -> None:

# variant_id, FORMAT and samples
samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
self.get_header_sample_list()
[f""" "{sample}" """ for sample in self.get_header_sample_list()]
)

# Create dataframe
Expand Down Expand Up @@ -9235,7 +9237,7 @@ def calculation_genotype_concordance(self) -> None:

# variant_id, FORMAT and samples
samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
self.get_header_sample_list()
[f""" "{sample}" """ for sample in self.get_header_sample_list()]
)

# Create dataframe
Expand Down Expand Up @@ -9341,7 +9343,7 @@ def calculation_barcode(self, tag: str = "barcode") -> None:

# variant_id, FORMAT and samples
samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
self.get_header_sample_list()
[f""" "{sample}" """ for sample in self.get_header_sample_list()]
)

# Create dataframe
Expand Down Expand Up @@ -9510,7 +9512,7 @@ def calculation_barcode_family(self, tag: str = "BCF") -> None:

# variant_id, FORMAT and samples
samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
ped_samples
[f""" "{sample}" """ for sample in ped_samples]
)

# Create dataframe
Expand Down Expand Up @@ -9546,7 +9548,8 @@ def calculation_barcode_family(self, tag: str = "BCF") -> None:
for sample in self.get_header_sample_list() + ["FORMAT"]:
if sample in ped_samples:
value = f'dataframe_barcode."{barcode_infos}"'
value_samples = "'" + ",".join(ped_samples) + "'"
value_samples = "'" + ",".join([f""" "{sample}" """ for sample in ped_samples]) + "'"
ped_samples
elif sample == "FORMAT":
value = f"'{tag}'"
value_samples = f"'{tag}S'"
Expand Down Expand Up @@ -9710,7 +9713,7 @@ def calculation_trio(self) -> None:

# variant_id, FORMAT and samples
samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
self.get_header_sample_list()
[f""" "{sample}" """ for sample in self.get_header_sample_list()]
)

# Create dataframe
Expand Down Expand Up @@ -9917,7 +9920,7 @@ def calculation_genotype_stats(self, info: str = "VAF") -> None:

# variant_id, FORMAT and samples
samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
self.get_header_sample_list()
[f""" "{sample}" """ for sample in self.get_header_sample_list()]
)

# Create dataframe
Expand Down
Binary file modified plugins/README.pdf
Binary file not shown.
62 changes: 62 additions & 0 deletions tests/data/example.name_with_special_char.vcf
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
##fileformat=VCFv4.1
##FILTER=<ID=PASS,Description="All filters passed">
##fileDate=20140624
##source=./export.pl release 1.5
##reference=IRC
##phasing=unknown
##INFO=<ID=NS,Number=1,Type=Integer,Description="Number of Samples With Data">
##INFO=<ID=DP,Number=1,Type=Integer,Description="Total Depth">
##INFO=<ID=AA,Number=1,Type=String,Description="Ancestral Allele">
##FILTER=<ID=FSFilter,Description="FS > 200.0">
##FILTER=<ID=LowQual,Description="Low quality">
##FILTER=<ID=QDFilter,Description="QD < 2.0">
##FILTER=<ID=ReadPosFilter,Description="ReadPosRankSum < -20.0">
##FILTER=<ID=TruthSensitivityTranche99.00to99.90,Description="Truth sensitivity tranche level at VSQ Lod: -3.9813 <= x < 3.448">
##FILTER=<ID=TruthSensitivityTranche99.90to100.00+,Description="Truth sensitivity tranche level at VQS Lod < -29586.8217">
##FILTER=<ID=TruthSensitivityTranche99.90to100.00,Description="Truth sensitivity tranche level at VSQ Lod: -29586.8217 <= x < -3.9813">
##FORMAT=<ID=HQ,Number=2,Type=Integer,Description="Haplotype Quality">
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
##FORMAT=<ID=GQ,Number=1,Type=Float,Description="Genotype Quality">
##FORMAT=<ID=DP,Number=1,Type=Integer,Description="Read Depth">
##FORMAT=<ID=AD,Number=2,Type=Integer,Description="Allelic Depth">
##FORMAT=<ID=FILTER,Number=.,Type=String,Description="Filter quality">
##FORMAT=<ID=QUAL,Number=1,Type=Integer,Description="Filter quality value">
##INFO=<ID=CLNSIG,Number=1,Type=String,Description="CLNSIG">
##contig=<ID=chr1,length=249250621,assembly=hg19>
##contig=<ID=chr7,length=159138663,assembly=hg19>
##contig=<ID=1,length=249250621,assembly=hg19>
##contig=<ID=10,length=135534747,assembly=hg19>
##contig=<ID=11,length=135006516,assembly=hg19>
##contig=<ID=12,length=133851895,assembly=hg19>
##contig=<ID=13,length=115169878,assembly=hg19>
##contig=<ID=14,length=107349540,assembly=hg19>
##contig=<ID=15,length=102531392,assembly=hg19>
##contig=<ID=16,length=90354753,assembly=hg19>
##contig=<ID=17,length=81195210,assembly=hg19>
##contig=<ID=18,length=78077248,assembly=hg19>
##contig=<ID=19,length=59128983,assembly=hg19>
##contig=<ID=2,length=243199373,assembly=hg19>
##contig=<ID=20,length=63025520,assembly=hg19>
##contig=<ID=21,length=48129895,assembly=hg19>
##contig=<ID=22,length=51304566,assembly=hg19>
##contig=<ID=3,length=198022430,assembly=hg19>
##contig=<ID=4,length=191154276,assembly=hg19>
##contig=<ID=5,length=180915260,assembly=hg19>
##contig=<ID=6,length=171115067,assembly=hg19>
##contig=<ID=7,length=159138663,assembly=hg19>
##contig=<ID=8,length=146364022,assembly=hg19>
##contig=<ID=9,length=141213431,assembly=hg19>
##contig=<ID=M,length=16571,assembly=hg19>
##contig=<ID=X,length=155270560,assembly=hg19>
##contig=<ID=Y,length=59373566,assembly=hg19>
##INFO=<ID=SIFT,Number=.,Type=String,Description="Annotation 'SIFT'">
##bcftools_viewVersion=1.15.1+htslib-1.15.1
##bcftools_viewCommand=view tests/data/example.vcf.gz; Date=Fri Mar 10 21:25:44 2023
#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT sample-1 sample-2 sample-3 sample-4
chr1 28736 . A C 100 PASS CLNSIG=pathogenic GT:AD:DP:GQ 0/1:525,204:729:99 0/1:12659,4994:17664:99 1/1:12658,4995:17663:99 1/1:401,175:576:99
chr1 35144 . A C 100 PASS CLNSIG=non-pathogenic GT:AD:DP:GQ ./.:.:.:. 0/1:12659,4994:17664:99 0/1:12658,4995:17663:99 0/1:401,175:576:99
chr1 69101 . A G 100 PASS DP=50 GT:AD:DP:GQ 0/1:525,204:729:99 ./.:.:.:. 0/1:12658,4995:17663:99 0/1:401,175:576:99
chr1 768251 . A G 100 PASS . GT:AD:DP:GQ 0/1:525,204:729:99 ./.:.:.:. 0/1:12658,4995:17663:99 0/1:401,175:576:99
chr1 768252 . A G 100 PASS . GT:AD:DP:GQ 0/1:525,204:729:99 ./.:.:.:. 0/1:12658,4995:17663:99 0/1:401,175:576:99
chr1 768253 . A G 100 PASS . GT:AD:DP:GQ 0/1:525,204:729:99 ./.:.:.:. 0/1:12658,4995:17663:99 0/1:401,175:576:99
chr7 55249063 rs1050171 G A 5777 PASS DP=125 GT:AD:DP:GQ 0/1:525,204:729:99 0/1:12659,4994:17664:99 ./.:.:.:. 0/1:401,175:576:99
57 changes: 57 additions & 0 deletions tests/test_variants_calculations.py
Original file line number Diff line number Diff line change
Expand Up @@ -1028,6 +1028,63 @@ def test_calculation_barcode():
assert False


def test_calculation_barcode_sample_name_special_char():
    """
    Check the BARCODE calculation on a VCF whose sample names contain
    special characters.

    The input file "example.name_with_special_char.vcf" is loaded, the
    BARCODE calculation is run, and the number of variants carrying each
    expected barcode value in INFO is verified. The result is then exported
    and opened with pyVCF to make sure the output file is readable.
    """

    with TemporaryDirectory(dir=tests_folder) as tmp_dir:

        # Init files
        input_vcf = tests_data_folder + "/example.name_with_special_char.vcf"
        output_vcf = f"{tmp_dir}/output.vcf.gz"

        # Construct param dict
        param = {"calculation": {"calculations": {"BARCODE": None}}}

        # Create object
        variants = Variants(
            conn=None, input=input_vcf, output=output_vcf, param=param, load=True
        )

        # Calculation
        variants.calculation()

        # Every variant of the input file must have received a barcode
        result = variants.get_query_to_df(
            """ SELECT INFO FROM variants WHERE INFO LIKE '%barcode%' """
        )
        assert len(result) == 7

        # Expected number of variants for each barcode value
        expected_counts = {
            "1122": 1,
            "0111": 1,
            "1011": 4,
            "1101": 1,
        }
        for barcode, expected_count in expected_counts.items():
            result = variants.get_query_to_df(
                f""" SELECT * FROM variants WHERE INFO LIKE '%barcode={barcode}%' """
            )
            assert (
                len(result) == expected_count
            ), f"barcode={barcode}: expected {expected_count}, found {len(result)}"

        # Check if VCF is in correct format with pyVCF
        remove_if_exists([output_vcf])
        variants.export_output()
        try:
            vcf.Reader(filename=output_vcf)
        except Exception as exc:
            # Keep the original error visible instead of a bare "assert False",
            # and do not intercept KeyboardInterrupt/SystemExit.
            raise AssertionError(
                f"Exported VCF is not readable by pyVCF: {exc}"
            ) from exc


def test_calculation_barcode_genotype():
"""
The function `test_calculation_barcode_genotype` is a test function in Python that calculates
Expand Down

0 comments on commit 39886a1

Please sign in to comment.