From 8d41bd2898a496b169ad4e97f1940bea1aaac29a Mon Sep 17 00:00:00 2001 From: ASLeonard Date: Mon, 4 Dec 2023 12:33:22 +0100 Subject: [PATCH 1/6] add haploid support for v1.6 --- snakepit/deepvariant.smk | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/snakepit/deepvariant.smk b/snakepit/deepvariant.smk index ca85265..59f1dfe 100644 --- a/snakepit/deepvariant.smk +++ b/snakepit/deepvariant.smk @@ -127,7 +127,8 @@ rule deepvariant_postprocess: gvcf = multiext('{run}/deepvariant/{sample}.{region}.g.vcf.gz','','.tbi') params: variants = lambda wildcards,input: PurePath(input.variants).with_name('call_variants_output@1.tfrecord.gz'), - gvcf = lambda wildcards,input: PurePath(input.gvcf[0]).with_suffix('').with_suffix(f'.tfrecord@{config["shards"]}.gz') + gvcf = lambda wildcards,input: PurePath(input.gvcf[0]).with_suffix('').with_suffix(f'.tfrecord@{config["shards"]}.gz'), + handle_sex_chromosomes = '--haploid_contigs "X,Y" --par_regions_bed "PAR.bed"' if config.get('haploid_sex',False) else '' threads: config.get('resources',{}).get('postprocess',{}).get('threads',2) resources: mem_mb = config.get('resources',{}).get('postprocess',{}).get('mem_mb',20000), @@ -144,6 +145,7 @@ rule deepvariant_postprocess: --gvcf_outfile {output.gvcf[0]} \ --nonvariant_site_tfrecord_path {params.gvcf} \ --novcf_stats_report \ + {params.handle_sex_chromosomes} \ --cpus {threads} ''' From 9dd847c565301b289443ca43df8848f3e25d176c Mon Sep 17 00:00:00 2001 From: ASLeonard Date: Mon, 27 May 2024 17:06:58 +0200 Subject: [PATCH 2/6] partial improvement to DV flow --- snakepit/deepvariant.smk | 24 ++++++++++++++---------- 1 file changed, 14 insertions(+), 10 deletions(-) diff --git a/snakepit/deepvariant.smk b/snakepit/deepvariant.smk index 59f1dfe..faeeada 100644 --- a/snakepit/deepvariant.smk +++ b/snakepit/deepvariant.smk @@ -2,7 +2,8 @@ from pathlib import Path,PurePath wildcard_constraints: region = r'\w+', - run = r'\w+' + run = r'\w+', + preset = r'|'.join(config['GL_config']) config.setdefault('Run_name', 'DV_variant_calling') @@ -10,9 +11,12 @@ if 'binding_paths' in config: for path in config['binding_paths']: workflow._singularity_args += f' -B {path}' -regions = list(map(str,range(1,30))) + ['X','Y','MT','unplaced'] +regions = config.get('regionsz',list(map(str,range(1,30))) + ['X','Y','MT','unplaced']) -ruleorder: deepvariant_postprocess > bcftools_scatter +if config.get('scatter_merge',True): + ruleorder: bcftools_scatter > deepvariant_postprocess +else: + ruleorder: deepvariant_postprocess > bcftools_scatter rule all: input: @@ -128,7 +132,7 @@ rule deepvariant_postprocess: params: variants = lambda wildcards,input: PurePath(input.variants).with_name('call_variants_output@1.tfrecord.gz'), gvcf = lambda wildcards,input: PurePath(input.gvcf[0]).with_suffix('').with_suffix(f'.tfrecord@{config["shards"]}.gz'), - handle_sex_chromosomes = '--haploid_contigs "X,Y" --par_regions_bed "PAR.bed"' if config.get('haploid_sex',False) else '' + handle_sex_chromosomes = f'--haploid_contigs "X,Y" --par_regions_bed "{config["haploid_sex"]}"' if 'haploid_sex' in config else '' threads: config.get('resources',{}).get('postprocess',{}).get('threads',2) resources: mem_mb = config.get('resources',{}).get('postprocess',{}).get('mem_mb',20000), @@ -155,7 +159,9 @@ rule bcftools_scatter: regions = '/cluster/work/pausch/vcf_UCD/2023_07/regions.bed' output: expand('{run}/deepvariant/{sample}.{region}.g.vcf.gz',region=regions,allow_missing=True), - expand('{run}/deepvariant/{sample}.{region}.g.vcf.gz.csi',region=regions,allow_missing=True) + expand('{run}/deepvariant/{sample}.{region}.g.vcf.gz.tbi',region=regions,allow_missing=True) + wildcard_constraints: + region = "(?!all)" params: regions = regions, _dir = lambda wildcards, output: PurePath(output[0]).with_suffix('').with_suffix('').with_suffix('').with_suffix('') @@ -165,11 +171,11 @@ rule bcftools_scatter: walltime = '1h' shell: ''' - bcftools +scatter {input.gvcf[0]} -o {params._dir} -Oz --threads {threads} --write-index -S {input.regions} -x unplaced --no-version + bcftools +scatter {input.gvcf[0]} -o {params._dir} -Oz --threads {threads} -S {input.regions} -x unplaced --no-version for R in {params.regions} do - mv {params._dir}/$R.vcf.gz {params._dir}.$R.g.vcf.gz - mv {params._dir}/$R.vcf.gz.csi {params._dir}.$R.g.vcf.gz.csi + bcftools reheader -f /cluster/work/pausch/inputs/ref/BTA/UCD2.0/GCA_002263795.4_ARS-UCD2.0_genomic.fa.fai {params._dir}/$R.vcf.gz > {params._dir}.$R.g.vcf.gz + tabix -p vcf {params._dir}.$R.g.vcf.gz done ''' @@ -182,8 +188,6 @@ def get_GL_config(preset): rule GLnexus_merge: input: - #gvcfs = expand('{run}/deepvariant/{region}/{sample}.g.vcf.gz',sample=cohort_samples,allow_missing=True), - #tbi = expand('{run}/deepvariant/{region}/{sample}.g.vcf.gz.csi',sample=cohort_samples,allow_missing=True) gvcfs = expand('{run}/deepvariant/{sample}.{region}.g.vcf.gz',sample=cohort_samples,allow_missing=True), tbi = expand('{run}/deepvariant/{sample}.{region}.g.vcf.gz.tbi',sample=cohort_samples,allow_missing=True) output: From 729a08cdbdb1d476c36d69c259d67f93fcdb8e30 Mon Sep 17 00:00:00 2001 From: ASLeonard Date: Fri, 31 May 2024 10:11:58 +0200 Subject: [PATCH 3/6] generalise bcftools reheader --- snakepit/deepvariant.smk | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/snakepit/deepvariant.smk b/snakepit/deepvariant.smk index faeeada..5155069 100644 --- a/snakepit/deepvariant.smk +++ b/snakepit/deepvariant.smk @@ -156,7 +156,8 @@ rule deepvariant_postprocess: rule bcftools_scatter: input: gvcf = expand(rules.deepvariant_postprocess.output['gvcf'],region='all',allow_missing=True), - regions = '/cluster/work/pausch/vcf_UCD/2023_07/regions.bed' + regions = '/cluster/work/pausch/vcf_UCD/2023_07/regions.bed', + fai = config['reference'] + ".fai" output: expand('{run}/deepvariant/{sample}.{region}.g.vcf.gz',region=regions,allow_missing=True), expand('{run}/deepvariant/{sample}.{region}.g.vcf.gz.tbi',region=regions,allow_missing=True) @@ -174,7 +175,7 @@ rule bcftools_scatter: bcftools +scatter {input.gvcf[0]} -o {params._dir} -Oz --threads {threads} -S {input.regions} -x unplaced --no-version for R in {params.regions} do - bcftools reheader -f /cluster/work/pausch/inputs/ref/BTA/UCD2.0/GCA_002263795.4_ARS-UCD2.0_genomic.fa.fai {params._dir}/$R.vcf.gz > {params._dir}.$R.g.vcf.gz + bcftools reheader -f {input.fai} {params._dir}/$R.vcf.gz > {params._dir}.$R.g.vcf.gz tabix -p vcf {params._dir}.$R.g.vcf.gz done ''' From 365f654766228c54419495f6db4a53c45e18fe4b Mon Sep 17 00:00:00 2001 From: ASLeonard Date: Thu, 5 Dec 2024 10:32:01 +0100 Subject: [PATCH 4/6] updating for latest DV release --- snakepit/deepvariant.smk | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/snakepit/deepvariant.smk b/snakepit/deepvariant.smk index 5155069..1ca8691 100644 --- a/snakepit/deepvariant.smk +++ b/snakepit/deepvariant.smk @@ -31,9 +31,13 @@ def make_custom_example_arguments(model): case 'RNA' | 'WES': return '--channels \'\' --split_skip_reads' case 'PACBIO': - return '--add_hp_channel --alt_aligned_pileup "diff_channels" --max_reads_per_partition "600" --min_mapping_quality "1" --parse_sam_aux_fields --partition_size "25000" --phase_reads --pileup_image_width "199" --norealign_reads --sort_by_haplotypes --track_ref_reads --vsc_min_fraction_indels "0.12"' + return '--alt_aligned_pileup "diff_channels" --max_reads_per_partition "600" --min_mapping_quality "1" --parse_sam_aux_fields --partition_size "25000" --phase_reads --pileup_image_width "147" --norealign_reads --sort_by_haplotypes --track_ref_reads --vsc_min_fraction_indels "0.12" --trim_reads_for_pileup' case 'WGS': return '--channels "insert_size"' + case 'ONT_R104': + return '--alt_aligned_pileup "diff_channels" --max_reads_per_partition "600" --min_mapping_quality "5" --parse_sam_aux_fields --partition_size "25000" --phase_reads --pileup_image_width "99" --norealign_reads --sort_by_haplotypes --track_ref_reads --vsc_min_fraction_snps "0.08" --vsc_min_fraction_indels "0.12" --trim_reads_for_pileup' + case 'MASSEQ': + return '--alt_aligned_pileup "diff_channels" --max_reads_per_partition 0 --min_mapping_quality 1 --parse_sam_aux_fields --partition_size "25000" --phase_reads --pileup_image_width "199" --norealign_reads --sort_by_haplotypes --track_ref_reads --vsc_min_fraction_indels 0.12 --trim_reads_for_pileup --max_reads_for_dynamic_bases_per_region "1500"' case _: return '--channels \'\' --split_skip_reads' @@ -202,17 +206,13 @@ rule GLnexus_merge: walltime = config.get('resources',{}).get('merge',{}).get('walltime','4h'), scratch = '50G' priority: 25 - container: config.get('containers',{}).get('GLnexus','docker://ghcr.io/dnanexus-rnd/glnexus:v1.4.3') shell: ''' - /usr/local/bin/glnexus_cli \ + glnexus_cli \ --dir $TMPDIR/GLnexus.DB \ --config {params.preset} \ --threads {threads} \ --mem-gbytes {params.mem} \ {input.gvcfs} |\ - bcftools view - |\ - bgzip -@ 8 -c > {output[0]} - - tabix -p vcf {output[0]} + bcftools view -@ {threads} -W=tbi {output[0]} ''' From 57025835c6ad2b640aff86c25de43b5c515847f1 Mon Sep 17 00:00:00 2001 From: ASLeonard Date: Fri, 6 Dec 2024 13:19:09 +0100 Subject: [PATCH 5/6] clean make_example args --- snakepit/deepvariant.smk | 20 +++++++++----------- 1 file changed, 9 insertions(+), 11 deletions(-) diff --git a/snakepit/deepvariant.smk b/snakepit/deepvariant.smk index 1ca8691..e5db4ab 100644 --- a/snakepit/deepvariant.smk +++ b/snakepit/deepvariant.smk @@ -27,19 +27,18 @@ cohort_samples = config['samples'] if 'glob' not in config['samples'] else glob_ #NOTE: may need to be updated if deepvariant changes its internal parameters. def make_custom_example_arguments(model): + common_long_read_options = '--alt_aligned_pileup "diff_channels" --parse_sam_aux_fields --partition_size "25000" --phase_reads --norealign_reads --sort_by_haplotypes --track_ref_reads --trim_reads_for_pileup --vsc_min_fraction_indels "0.12" ' match model: - case 'RNA' | 'WES': - return '--channels \'\' --split_skip_reads' - case 'PACBIO': - return '--alt_aligned_pileup "diff_channels" --max_reads_per_partition "600" --min_mapping_quality "1" --parse_sam_aux_fields --partition_size "25000" --phase_reads --pileup_image_width "147" --norealign_reads --sort_by_haplotypes --track_ref_reads --vsc_min_fraction_indels "0.12" --trim_reads_for_pileup' case 'WGS': - return '--channels "insert_size"' - case 'ONT_R104': - return '--alt_aligned_pileup "diff_channels" --max_reads_per_partition "600" --min_mapping_quality "5" --parse_sam_aux_fields --partition_size "25000" --phase_reads --pileup_image_width "99" --norealign_reads --sort_by_haplotypes --track_ref_reads --vsc_min_fraction_snps "0.08" --vsc_min_fraction_indels "0.12" --trim_reads_for_pileup' + return '--channel_list BASE_CHANNELS,insert_size' + case 'PACBIO': + return common_long_read_options + '--max_reads_per_partition "600" --min_mapping_quality "1" --pileup_image_width "147"' case 'MASSEQ': - return '--alt_aligned_pileup "diff_channels" --max_reads_per_partition 0 --min_mapping_quality 1 --parse_sam_aux_fields --partition_size "25000" --phase_reads --pileup_image_width "199" --norealign_reads --sort_by_haplotypes --track_ref_reads --vsc_min_fraction_indels 0.12 --trim_reads_for_pileup --max_reads_for_dynamic_bases_per_region "1500"' - case _: - return '--channels \'\' --split_skip_reads' + return common_long_read_options + '--max_reads_per_partition 0 --min_mapping_quality 1 --pileup_image_width "199" --max_reads_for_dynamic_bases_per_region "1500"' + case 'ONT_R104': + return common_long_read_options + '--max_reads_per_partition "600" --min_mapping_quality "5" --pileup_image_width "99" --vsc_min_fraction_snps "0.08"' + case 'RNA' | 'WES' | _: + return '--channel_list BASE_CHANNELS --split_skip_reads' def get_regions(region): if region == 'all': @@ -55,7 +54,6 @@ def get_regions(region): BAM_EXT = config.get('bam_index','.bai') -#error can be caused by corrupted file, check gzip -t -v rule deepvariant_make_examples: input: reference = multiext(config['reference'],'','.fai'), From 7a39da7e6c6138a443e4b160dfccae5bce90a11b Mon Sep 17 00:00:00 2001 From: ASLeonard Date: Wed, 15 Jan 2025 00:04:10 +0100 Subject: [PATCH 6/6] more updates to DV --- snakepit/deepvariant.smk | 22 ++++++++++++++-------- 1 file changed, 14 insertions(+), 8 deletions(-) diff --git a/snakepit/deepvariant.smk b/snakepit/deepvariant.smk index e5db4ab..aea342e 100644 --- a/snakepit/deepvariant.smk +++ b/snakepit/deepvariant.smk @@ -11,6 +11,8 @@ if 'binding_paths' in config: for path in config['binding_paths']: workflow._singularity_args += f' -B {path}' +#TODO: generalise regions from input +#lean more on config and force users to be explicit, then less room for errors regions = config.get('regionsz',list(map(str,range(1,30))) + ['X','Y','MT','unplaced']) if config.get('scatter_merge',True): @@ -26,13 +28,14 @@ rule all: cohort_samples = config['samples'] if 'glob' not in config['samples'] else glob_wildcards(config["bam_path"] + config["samples"]["glob"]).sample #NOTE: may need to be updated if deepvariant changes its internal parameters. -def make_custom_example_arguments(model): +def make_custom_example_arguments(model,small=True): #TODO: allow switching on/off small model common_long_read_options = '--alt_aligned_pileup "diff_channels" --parse_sam_aux_fields --partition_size "25000" --phase_reads --norealign_reads --sort_by_haplotypes --track_ref_reads --trim_reads_for_pileup --vsc_min_fraction_indels "0.12" ' + small_model = '--small_model_snp_gq_threshold 25 --small_model_indel_gq_threshold 30 --small_model_vaf_context_window_size 51 ' #TODO: this does not work for ONTr10 match model: case 'WGS': - return '--channel_list BASE_CHANNELS,insert_size' + return small_model + '--channel_list BASE_CHANNELS,insert_size' case 'PACBIO': - return common_long_read_options + '--max_reads_per_partition "600" --min_mapping_quality "1" --pileup_image_width "147"' + return common_long_read_options + small_model '--max_reads_per_partition "600" --min_mapping_quality "1" --pileup_image_width "147"' case 'MASSEQ': return common_long_read_options + '--max_reads_per_partition 0 --min_mapping_quality 1 --pileup_image_width "199" --max_reads_for_dynamic_bases_per_region "1500"' case 'ONT_R104': @@ -90,12 +93,14 @@ rule deepvariant_make_examples: --task {wildcards.N} ''' -def get_checkpoint(model): +#TODO: need to update models? +def get_checkpoint(model,small=True): + path = f'/opt/{"small" if small else ""}models/' match model: - case 'PACBIO' | 'WGS': - return f'/opt/models/{model.lower()}' + case 'PACBIO' | 'MASSEQ' | 'WGS' | 'ONT_R104' : + return path + model.lower() case 'hybrid': - return '/opt/models/hybrid_pacbio_illumina' + return path + 'hybrid_pacbio_illumina' case _: return config['model'] @@ -158,7 +163,7 @@ rule deepvariant_postprocess: rule bcftools_scatter: input: gvcf = expand(rules.deepvariant_postprocess.output['gvcf'],region='all',allow_missing=True), - regions = '/cluster/work/pausch/vcf_UCD/2023_07/regions.bed', + regions = '/cluster/work/pausch/vcf_UCD/2023_07/regions.bed', # TODO: need to handle this for different references fai = config['reference'] + ".fai" output: expand('{run}/deepvariant/{sample}.{region}.g.vcf.gz',region=regions,allow_missing=True), @@ -182,6 +187,7 @@ rule bcftools_scatter: done ''' +# see https://github.com/dnanexus-rnd/GLnexus/pull/310 def get_GL_config(preset): match preset: case 'DeepVariantWGS' | 'DeepVariant_unfiltered' | 'DeepVariantWES_MED_DP':