Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Update to DV version 1.8 #2

Open
wants to merge 6 commits into
base: main
Choose a base branch
from
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
71 changes: 41 additions & 30 deletions snakepit/deepvariant.smk
Original file line number Diff line number Diff line change
@@ -2,17 +2,23 @@ from pathlib import Path,PurePath

wildcard_constraints:
region = r'\w+',
run = r'\w+'
run = r'\w+',
preset = r'|'.join(config['GL_config'])

config.setdefault('Run_name', 'DV_variant_calling')

if 'binding_paths' in config:
for path in config['binding_paths']:
workflow._singularity_args += f' -B {path}'

regions = list(map(str,range(1,30))) + ['X','Y','MT','unplaced']
#TODO: generalise regions from input
#lean more on config and force users to be explicit, then less room for errors
regions = config.get('regionsz',list(map(str,range(1,30))) + ['X','Y','MT','unplaced'])

ruleorder: deepvariant_postprocess > bcftools_scatter
if config.get('scatter_merge',True):
ruleorder: bcftools_scatter > deepvariant_postprocess
else:
ruleorder: deepvariant_postprocess > bcftools_scatter

rule all:
input:
@@ -22,16 +28,20 @@ rule all:
cohort_samples = config['samples'] if 'glob' not in config['samples'] else glob_wildcards(config["bam_path"] + config["samples"]["glob"]).sample

#NOTE: may need to be updated if deepvariant changes its internal parameters.
def make_custom_example_arguments(model):
def make_custom_example_arguments(model,small=True): #TODO: allow switching on/off small model
common_long_read_options = '--alt_aligned_pileup "diff_channels" --parse_sam_aux_fields --partition_size "25000" --phase_reads --norealign_reads --sort_by_haplotypes --track_ref_reads --trim_reads_for_pileup --vsc_min_fraction_indels "0.12" '
small_model = '--small_model_snp_gq_threshold 25 --small_model_indel_gq_threshold 30 --small_model_vaf_context_window_size 51 ' #TODO: this does not work for ONTr10
match model:
case 'RNA' | 'WES':
return '--channels \'\' --split_skip_reads'
case 'PACBIO':
return '--add_hp_channel --alt_aligned_pileup "diff_channels" --max_reads_per_partition "600" --min_mapping_quality "1" --parse_sam_aux_fields --partition_size "25000" --phase_reads --pileup_image_width "199" --norealign_reads --sort_by_haplotypes --track_ref_reads --vsc_min_fraction_indels "0.12"'
case 'WGS':
return '--channels "insert_size"'
case _:
return '--channels \'\' --split_skip_reads'
return small_model + '--channel_list BASE_CHANNELS,insert_size'
case 'PACBIO':
return common_long_read_options + small_model '--max_reads_per_partition "600" --min_mapping_quality "1" --pileup_image_width "147"'
case 'MASSEQ':
return common_long_read_options + '--max_reads_per_partition 0 --min_mapping_quality 1 --pileup_image_width "199" --max_reads_for_dynamic_bases_per_region "1500"'
case 'ONT_R104':
return common_long_read_options + '--max_reads_per_partition "600" --min_mapping_quality "5" --pileup_image_width "99" --vsc_min_fraction_snps "0.08"'
case 'RNA' | 'WES' | _:
return '--channel_list BASE_CHANNELS --split_skip_reads'

def get_regions(region):
if region == 'all':
@@ -47,7 +57,6 @@ def get_regions(region):

BAM_EXT = config.get('bam_index','.bai')

#error can be caused by corrupted file, check gzip -t -v
rule deepvariant_make_examples:
input:
reference = multiext(config['reference'],'','.fai'),
@@ -84,12 +93,14 @@ rule deepvariant_make_examples:
--task {wildcards.N}
'''

def get_checkpoint(model):
#TODO: need to update models?
def get_checkpoint(model,small=True):
path = f'/opt/{"small" if small else ""}models/'
match model:
case 'PACBIO' | 'WGS':
return f'/opt/models/{model.lower()}'
case 'PACBIO' | 'MASSEQ' | 'WGS' | 'ONT_R104' :
return path + model.lower()
case 'hybrid':
return '/opt/models/hybrid_pacbio_illumina'
return path + 'hybrid_pacbio_illumina'
case _:
return config['model']

@@ -127,7 +138,8 @@ rule deepvariant_postprocess:
gvcf = multiext('{run}/deepvariant/{sample}.{region}.g.vcf.gz','','.tbi')
params:
variants = lambda wildcards,input: PurePath(input.variants).with_name('call_variants_output@1.tfrecord.gz'),
gvcf = lambda wildcards,input: PurePath(input.gvcf[0]).with_suffix('').with_suffix(f'.tfrecord@{config["shards"]}.gz')
gvcf = lambda wildcards,input: PurePath(input.gvcf[0]).with_suffix('').with_suffix(f'.tfrecord@{config["shards"]}.gz'),
handle_sex_chromosomes = f'--haploid_contigs "X,Y" --par_regions_bed "{config["haploid_sex"]}"' if 'haploid_sex' in config else ''
threads: config.get('resources',{}).get('postprocess',{}).get('threads',2)
resources:
mem_mb = config.get('resources',{}).get('postprocess',{}).get('mem_mb',20000),
@@ -144,16 +156,20 @@ rule deepvariant_postprocess:
--gvcf_outfile {output.gvcf[0]} \
--nonvariant_site_tfrecord_path {params.gvcf} \
--novcf_stats_report \
{params.handle_sex_chromosomes} \
--cpus {threads}
'''

rule bcftools_scatter:
input:
gvcf = expand(rules.deepvariant_postprocess.output['gvcf'],region='all',allow_missing=True),
regions = '/cluster/work/pausch/vcf_UCD/2023_07/regions.bed'
regions = '/cluster/work/pausch/vcf_UCD/2023_07/regions.bed', # TODO: need to handle this for different references
fai = config['reference'] + ".fai"
output:
expand('{run}/deepvariant/{sample}.{region}.g.vcf.gz',region=regions,allow_missing=True),
expand('{run}/deepvariant/{sample}.{region}.g.vcf.gz.csi',region=regions,allow_missing=True)
expand('{run}/deepvariant/{sample}.{region}.g.vcf.gz.tbi',region=regions,allow_missing=True)
wildcard_constraints:
region = "(?!all)"
params:
regions = regions,
_dir = lambda wildcards, output: PurePath(output[0]).with_suffix('').with_suffix('').with_suffix('').with_suffix('')
@@ -163,14 +179,15 @@ rule bcftools_scatter:
walltime = '1h'
shell:
'''
bcftools +scatter {input.gvcf[0]} -o {params._dir} -Oz --threads {threads} --write-index -S {input.regions} -x unplaced --no-version
bcftools +scatter {input.gvcf[0]} -o {params._dir} -Oz --threads {threads} -S {input.regions} -x unplaced --no-version
for R in {params.regions}
do
mv {params._dir}/$R.vcf.gz {params._dir}.$R.g.vcf.gz
mv {params._dir}/$R.vcf.gz.csi {params._dir}.$R.g.vcf.gz.csi
bcftools reheader -f {input.fai} {params._dir}/$R.vcf.gz > {params._dir}.$R.g.vcf.gz
tabix -p vcf {params._dir}.$R.g.vcf.gz
done
'''

# see https://github.com/dnanexus-rnd/GLnexus/pull/310
def get_GL_config(preset):
match preset:
case 'DeepVariantWGS' | 'DeepVariant_unfiltered' | 'DeepVariantWES_MED_DP':
@@ -180,8 +197,6 @@ def get_GL_config(preset):

rule GLnexus_merge:
input:
#gvcfs = expand('{run}/deepvariant/{region}/{sample}.g.vcf.gz',sample=cohort_samples,allow_missing=True),
#tbi = expand('{run}/deepvariant/{region}/{sample}.g.vcf.gz.csi',sample=cohort_samples,allow_missing=True)
gvcfs = expand('{run}/deepvariant/{sample}.{region}.g.vcf.gz',sample=cohort_samples,allow_missing=True),
tbi = expand('{run}/deepvariant/{sample}.{region}.g.vcf.gz.tbi',sample=cohort_samples,allow_missing=True)
output:
@@ -195,17 +210,13 @@ rule GLnexus_merge:
walltime = config.get('resources',{}).get('merge',{}).get('walltime','4h'),
scratch = '50G'
priority: 25
container: config.get('containers',{}).get('GLnexus','docker://ghcr.io/dnanexus-rnd/glnexus:v1.4.3')
shell:
'''
/usr/local/bin/glnexus_cli \
glnexus_cli \
--dir $TMPDIR/GLnexus.DB \
--config {params.preset} \
--threads {threads} \
--mem-gbytes {params.mem} \
{input.gvcfs} |\
bcftools view - |\
bgzip -@ 8 -c > {output[0]}

tabix -p vcf {output[0]}
bcftools view -@ {threads} -W=tbi {output[0]}
'''