# Snakefile_fastq_multilane.snake

# Load the experiment-specific configuration. Exactly one path is active; the
# commented-out entries are alternative configs kept so they can be swapped in.
# NOTE(review): names such as OUT_DIR, RAWDATA_DIR, GTF and STAR_INDEX are not
# defined in this file and presumably come from the included config - confirm.
include:
    #'configs/config_HuR.human.Penalva_L_01182017.py'
    #'configs/config_HuR.mouse.py'
    'configs/config_Feb_02_2017_Radiation_GBM.py'
    #'configs/config_Penalva_L_01182017.human.py'
    #'configs/config_Musashi1_Penalva_L_12212016.human.py'

# Run the workflow inside OUT_DIR, so the relative paths used by the rules
# below are resolved against it.
workdir: OUT_DIR

from itertools import chain, combinations
from os.path import join
import glob
import re

"""
The idea behind this section is to get sample names and the corresponding lanes,
here I do it for filenames which look like this:
HepG2_Control_1_S7_L001_R1_001.fastq.gz
HepG2_Control_1_S7_L001_R2_001.fastq.gz
HepG2_Control_1_S7_L002_R1_001.fastq.gz
HepG2_Control_1_S7_L002_R2_001.fastq.gz
HepG2_Control_1_S7_L003_R1_001.fastq.gz
HepG2_Control_1_S7_L003_R2_001.fastq.gz
HepG2_Control_1_S7_L004_R1_001.fastq.gz
HepG2_Control_1_S7_L004_R2_001.fastq.gz

So the SAMPLE_NAME here is HepG2_Control_1_S7, which was sequenced (paired-end) on 4 lanes: L001-L004.

"""
# Discover the raw FASTQ files and derive the (sample_name, lane) pairs that
# drive every per-lane rule below.
# join() inserts the path separator so that '**' is a complete path component:
# glob only recurses for a bare '**' component, whereas 'dir**' behaves like
# 'dir*' and would also match sibling directories sharing the same prefix.
SAMPLES = glob.glob(join(RAWDATA_DIR, '**', '*.fastq.gz'), recursive=True)

# Lane tag as it appears in the file names, e.g. '..._S7_L001_R1_001.fastq.gz'.
_LANE_TAG = re.compile(r'_(L\d\d\d)_')
# RAWDATA_DIR with exactly one trailing separator, whether or not the
# configured value already ends with one.
_RAW_PREFIX = join(RAWDATA_DIR, '')

SAMPLE_LANE = set()
for sample in SAMPLES:
    # Strip only the leading raw-data directory (not every occurrence of it).
    if sample.startswith(_RAW_PREFIX):
        sample = sample[len(_RAW_PREFIX):]
    lane_match = _LANE_TAG.search(sample)
    if lane_match is None:
        raise ValueError(
            'Cannot find a lane tag (_L###_) in FASTQ file name: {}'.format(sample))
    # Sample name is everything before the lane tag; the lane comes from the
    # very same match, so both values are always consistent with each other.
    sample_name = sample[:lane_match.start()]
    lane_name = lane_match.group(1)
    SAMPLE_LANE.add((sample_name, lane_name))

if not SAMPLE_LANE:
    raise ValueError('No *.fastq.gz files found under {}'.format(RAWDATA_DIR))

# Sort on the full (sample, lane) tuple so the order is reproducible between
# runs; sorting on the sample name alone leaves the lane order to set iteration.
SAMPLE_LANE = sorted(SAMPLE_LANE)
# Parallel tuples: SAMPLE_NAMES[i] was sequenced on LANE_NAMES[i].
SAMPLE_NAMES, LANE_NAMES = zip(*SAMPLE_LANE)

# All unordered pairs of distinct samples, in a reproducible order.
PLOT_PREFIXES = list(combinations(sorted(set(SAMPLE_NAMES)), 2))
PLOT_PREFIXES_1 = [pair[0] for pair in PLOT_PREFIXES]
PLOT_PREFIXES_2 = [pair[1] for pair in PLOT_PREFIXES]

# Default target: lists every final artefact of the workflow.
# SAMPLE_NAMES and LANE_NAMES are parallel tuples (one entry per sample/lane
# pair), hence the zip-style expansion for the per-lane files.
rule all:
    input:
        STAR_INDEX,
        EBV_STAR_INDEX,
        # Per-lane QC, trimming and alignment outputs.
        expand('qc/{sample_name}_{lane}_R1_001_fastqc.html', zip, sample_name=SAMPLE_NAMES, lane=LANE_NAMES),
        expand('qc/{sample_name}_{lane}_R2_001_fastqc.html', zip, sample_name=SAMPLE_NAMES, lane=LANE_NAMES),
        expand('preprocessed/{sample_name}/{sample_name}_{lane}_R1_001_val_1.fq.gz', zip, sample_name=SAMPLE_NAMES, lane=LANE_NAMES),
        expand('preprocessed/{sample_name}/{sample_name}_{lane}_R2_001_val_2.fq.gz', zip, sample_name=SAMPLE_NAMES, lane=LANE_NAMES),
        expand('mapped/bams/{sample_name}/{sample_name}_{lane}.bam', zip, sample_name=SAMPLE_NAMES, lane=LANE_NAMES),
        expand('mapped_ebv/bams/{sample_name}/{sample_name}_{lane}.bam', zip, sample_name=SAMPLE_NAMES, lane=LANE_NAMES),
        # Per-sample merged alignments and counts.
        expand('mapped/merged_bams/{sample_name}.bam', sample_name=SAMPLE_NAMES),
        expand('mapped_ebv/merged_bams/{sample_name}.bam', sample_name=SAMPLE_NAMES),
        expand('mapped/HTSeq-counts/byCDS/{sample}.CDS.counts.tsv', sample=SAMPLE_NAMES),
        expand('mapped/HTSeq-counts/byExon/{sample}.exon.counts.tsv', sample=SAMPLE_NAMES),
        #expand('mapped_ebv/HTSeq-counts/byCDS/{sample}.CDS.counts.tsv', sample=SAMPLE_NAMES),
        expand('mapped_ebv/HTSeq-counts/byExon/{sample}.exon.counts.tsv', sample=SAMPLE_NAMES),
        # The DEXSeq counts used to be listed twice; once is enough.
        expand('mapped/dexseq/{sample}.dexseq.counts.tsv', sample=SAMPLE_NAMES),
        # Cohort-level count matrices.
        'mapped/featureCounts/byCDS/fcounts.CDS.tsv',
        'mapped/featureCounts/byExon/fcounts.exon.tsv',
        'mapped_ebv/featureCounts/byCDS/fcounts.CDS.tsv',
        'mapped_ebv/featureCounts/byExon/fcounts.exon.tsv',
        #expand('mapped/intergenic_reads/{sample}.intergenic.bam', sample=SAMPLE_NAMES),
        #expand('inferred_experiment/{sample}.txt', sample=SAMPLE_NAMES),
        'multiqc_report/multiqc_report.html',

# Build the STAR genome index in EBV_STAR_INDEX from EBV_FASTA and EBV_GTF.
# STAR is given {threads} rather than a hard-coded 16, so the thread count
# Snakemake actually grants the job (e.g. when run with fewer cores) is honoured.
rule create_hhv4_index:
    input:
        fasta = EBV_FASTA,
        gtf = EBV_GTF
    output: EBV_STAR_INDEX
    threads: 16
    shell:
        r'''mkdir -p {output} && STAR --runThreadN {threads} \
            --runMode genomeGenerate \
            --genomeDir {output} \
            --genomeFastaFiles {input.fasta} \
            --sjdbGTFfile {input.gtf}'''

# Run FastQC on the raw R1/R2 FASTQ pair of one sample/lane combination.
# Both the HTML report and the zip archive of each mate are declared as outputs.
rule perform_qc:
    input:
        R1 = RAWDATA_DIR + "/{sample_name}_{lane}_R1_001.fastq.gz",
        R2 = RAWDATA_DIR + "/{sample_name}_{lane}_R2_001.fastq.gz",
    output:
        "qc/{sample_name}_{lane}_R1_001_fastqc.html",
        "qc/{sample_name}_{lane}_R1_001_fastqc.zip",
        "qc/{sample_name}_{lane}_R2_001_fastqc.html",
        "qc/{sample_name}_{lane}_R2_001_fastqc.zip",
    params:
        # Directory handed to fastqc via -o.
        out_dir = "qc",
    shell:
        r"""
            fastqc -o {params.out_dir} -f fastq {input.R1} {input.R2}
        """

# Trim the raw R1/R2 pair of one sample/lane with trim_galore in paired mode.
# Trimmed reads are written to preprocessed/<sample_name>/.
rule perfom_trimming:
    input:
        R1 = RAWDATA_DIR + "/{sample_name}_{lane}_R1_001.fastq.gz",
        R2 = RAWDATA_DIR + "/{sample_name}_{lane}_R2_001.fastq.gz",
    output:
        "preprocessed/{sample_name}/{sample_name}_{lane}_R1_001_val_1.fq.gz",
        "preprocessed/{sample_name}/{sample_name}_{lane}_R2_001_val_2.fq.gz",
    params:
        out_dir = "preprocessed/{sample_name}",
        # Value passed to trim_galore's -q option.
        phred_cutoff = 5,
    shell:
        r"""
            trim_galore --paired -o {params.out_dir} -q {params.phred_cutoff} {input.R1} {input.R2}
        """

"""This is optimised for >150bp reads:
--outFilterMultimapScoreRange 20 : the score range below the maximum score for multimapping alignments
--outFilterScoreMinOverLread 0 : alignment will be output only if its score is higher than or equal to this value, normalised to read length
--outFilterMatchNminOverLread 0.66 : alignment will be output only if the number of matched bases is higher than or equal to this value, normalised to read length
--outFilterMismatchNmax 100 : increases the number of allowed mismatches to 100 - need to allow more mismatches for longer reads
--winAnchorMultimapNmax 200 : Maximum number of loci anchors are allowed to map
--seedPerReadNmax 100000   --seedPerWindowNmax 100 : increase the number of allowed seeds for each read and alignment window - need to store more seeds for longer reads

## Not used
--seedSearchStartLmax 30 : increases the number of seed search start position in the read - important for reads with high error rate
--seedSearchLmax 30 : similar to the above, limits the maximum length of the seeds. Presently, I do not recommend changing this parameter
--alignTranscriptsPerReadNmax 100000 --alignTranscriptsPerWindowNmax 10000 : increase the number of allowed alignments for each read and alignment window - need to store more putative alignments for longer reads

"""
# Align the trimmed read pair of one sample/lane with STARlong against
# STAR_INDEX. STAR writes its files under {params.prefix}; afterwards the
# sorted BAM, the per-gene counts, the STAR logs and the unmapped mates are
# moved to their final locations and the BAM is indexed with samtools.
rule map_starlong:
    input:
        R1 = "preprocessed/{sample_name}/{sample_name}_{lane}_R1_001_val_1.fq.gz",
        R2 = "preprocessed/{sample_name}/{sample_name}_{lane}_R2_001_val_2.fq.gz",
        index = STAR_INDEX,
    output:
        bam = "mapped/bams/{sample_name}/{sample_name}_{lane}.bam",
        counts = "mapped/STARcounts/{sample_name}/{sample_name}_{lane}.counts",
        fastq1 = "unmapped_fastq/{sample_name}/{sample_name}_{lane}_R1.fq",
        fastq2 = "unmapped_fastq/{sample_name}/{sample_name}_{lane}_R2.fq",
    params:
        # File-name prefix STAR uses for everything it writes.
        prefix = "mapped/bams/{sample_name}_{lane}",
        unmapped = "unmapped_fastq/{sample_name}",
        starlogs = "mapped/starlogs",
    threads: 16
    shell:
        r"""
        STARlong --runThreadN {threads}\
             --genomeDir {input.index}\
             --outFileNamePrefix {params.prefix} --readFilesIn {input.R1} {input.R2}\
             --outSAMtype BAM SortedByCoordinate\
             --readFilesCommand zcat\
             --outReadsUnmapped Fastx\
            --outFilterMultimapScoreRange 20\
            --outFilterScoreMinOverLread 0\
            --outFilterMatchNminOverLread 0.66\
            --outFilterMismatchNmax 100\
            --winAnchorMultimapNmax 200\
            --seedPerReadNmax 100000 --seedPerWindowNmax 100\
            --quantMode GeneCounts\
            && mv {params.prefix}Aligned.sortedByCoord.out.bam {output.bam}\
            && mv {params.prefix}ReadsPerGene.out.tab {output.counts}\
            && mkdir -p {params.starlogs} && mv {params.prefix}Log.final.out {params.prefix}Log.out {params.prefix}Log.progress.out {params.starlogs} \
            && mkdir -p {params.unmapped} && mv {params.prefix}Unmapped.out.mate1 {output.fastq1} && mv {params.prefix}Unmapped.out.mate2 {output.fastq2} && samtools index {output.bam}
        """

## See: https://software.broadinstitute.org/gatk/guide/article?id=3060
## Merging should happen post alignment
def _merge_bams_inputs(wildcards):
    """Return the per-lane BAMs that belong to the requested sample.

    Only lanes recorded for this sample in SAMPLE_LANE are requested, so a
    sample sequenced on fewer lanes than another one no longer depends on
    BAM files that can never exist.  For a name that is not a known sample
    the original behaviour (every lane seen in the data set) is kept, so such
    a request still fails on missing input exactly as before.
    """
    lanes = sorted({lane for name, lane in SAMPLE_LANE
                    if name == wildcards.sample_name})
    if not lanes:
        lanes = sorted(set(LANE_NAMES))
    return ['mapped/bams/{0}/{0}_{1}.bam'.format(wildcards.sample_name, lane)
            for lane in lanes]


# Merge all per-lane BAMs of one sample into a single BAM with bamtools.
rule merge_bams:
    input: _merge_bams_inputs
    output: 'mapped/merged_bams/{sample_name}.bam'
    run:
        # bamtools expects one '-in' flag per input file.
        cmd = ' -in '.join(input)
        shell(r'''bamtools merge -in {cmd} -out {output}''')

# Sort the merged BAM of one sample by read name (needed by the name-ordered
# counting rules).
# '-n' is passed on its own: the previous '-on' is parsed by samtools 1.x as
# '-o n' (output file "n"), which silently drops the name-sort flag.
# The temp prefix is made sample-specific so that jobs running in parallel do
# not share the same temporary file names under /tmp.
rule sort_by_name:
    input: 'mapped/merged_bams/{sample}.bam'
    output: 'mapped/merged_bams/{sample}.sortedByName.bam'
    shell:
        r'''
            samtools sort -n -T /tmp/{wildcards.sample}.sortedByName -o {output} {input}
        '''

# Count reads per gene over 'exon' features with htseq-count on the
# name-sorted BAM. The raw table goes to a temporary file; sed then removes the
# first '.<digits>' run on every line (presumably the gene-ID version suffix)
# before the final table is written.
rule count:
    input:
        "mapped/merged_bams/{sample}.sortedByName.bam"
    output:
        "mapped/HTSeq-counts/byExon/{sample}.exon.counts.tsv"
    params:
        annotation = GTF,
        # Passed to htseq-count as --minaqual.
        phred_cutoff = 5,
    shell:
        r"""
        source activate python2 && htseq-count --order=name --format=bam --mode=intersection-strict --stranded={HTSEQ_STRANDED} --minaqual={params.phred_cutoff} --type=exon --idattr=gene_id {input} {params.annotation} > {output}.temp && cat {output}.temp | sed -E 's/\.[0-9]+//' > {output} && rm {output}.temp
        """

# Same as the exon-level htseq-count rule, but counting over 'CDS' features.
# The raw table goes to a temporary file; sed then removes the first
# '.<digits>' run on every line (presumably the gene-ID version suffix).
rule count_byCDS:
    input:
        "mapped/merged_bams/{sample}.sortedByName.bam"
    output:
        "mapped/HTSeq-counts/byCDS/{sample}.CDS.counts.tsv"
    params:
        annotation = GTF,
        # Passed to htseq-count as --minaqual.
        phred_cutoff = 5,
    shell:
        r"""
        source activate python2 && htseq-count --order=name --format=bam --mode=intersection-strict --stranded={HTSEQ_STRANDED} --minaqual={params.phred_cutoff} --type=CDS --idattr=gene_id {input} {params.annotation} > {output}.temp && cat {output}.temp | sed -E 's/\.[0-9]+//' > {output} && rm {output}.temp
        """

# Run infer_experiment.py on the merged BAM of one sample, using GENE_BED_TSV
# as the reference gene model.
# NOTE(review): with '2>&1 > file' only stdout ends up in the output file,
# stderr stays on the job's original stdout - confirm this is intended.
rule infer_experiment:
    input:
        "mapped/merged_bams/{sample}.bam"
    output:
        "inferred_experiment/{sample}.txt"
    shell:
        r"""
        source activate clipseq2 && infer_experiment.py -r {GENE_BED_TSV} -i {input} 2>&1 > {output}
        """

# Differential-expression analysis on the per-sample count tables via
# do_DE_analysis.R; produces the full and the significant result tables.
# NOTE(review): no rule in this file produces the counts_strict inputs.
rule run_deseq:
    input:
        expand("mapped/counts_strict/{sample}.counts.noversion.tsv", sample=SAMPLE_NAMES)
    output:
        "mapped/DE_analysis/HTSeq/" + GENOME_BUILD + ".HTSeq.DESeq2.all.tsv",
        "mapped/DE_analysis/HTSeq/" + GENOME_BUILD + ".HTSeq.DESeq2.sig.tsv",
    params:
        basedir = "mapped/counts_strict/",
        inprefix = "counts.noversion",
        gene_annotations = GENE_NAMES,
        outprefix = "mapped/DE_analysis/HTSeq/" + GENOME_BUILD + ".HTSeq",
    shell:
        r"""
        Rscript {SRC_DIR}/do_DE_analysis.R --basedir={params.basedir} \
            --gene_annotations={params.gene_annotations} \
            --design_file={DESIGN_FILE} \
            --outprefix={params.outprefix} \
            --inprefix={params.inprefix}

        """


# Count reads over 'exon' features for all samples at once with featureCounts.
# The samples are passed in sorted order so the column order of the resulting
# matrix is the same on every run (iterating a bare set of strings is not).
# sed afterwards removes the first '.<digits>' run on every line (presumably
# the gene-ID version suffix).
rule featurecounts:
    input: expand('mapped/merged_bams/{sample}.sortedByName.bam', sample=sorted(set(SAMPLE_NAMES)))
    params:
        annotation=GTF
    output: 'mapped/featureCounts/byExon/fcounts.exon.tsv'
    threads: 16
    shell:
        r'''featureCounts {FEATURECOUNTS_S} -a {params.annotation} -o {output}.temp -t exon -g gene_id -Q 4 -T {threads} {input} && cat {output}.temp | sed -E 's/\.[0-9]+//' > {output} && rm {output}.temp
        '''

# Count reads over 'CDS' features for all samples at once with featureCounts.
# The samples are passed in sorted order so the column order of the resulting
# matrix is the same on every run (iterating a bare set of strings is not).
# sed afterwards removes the first '.<digits>' run on every line (presumably
# the gene-ID version suffix).
rule featurecounts_cds:
    input: expand('mapped/merged_bams/{sample}.sortedByName.bam', sample=sorted(set(SAMPLE_NAMES)))
    params:
        annotation=GTF
    output: 'mapped/featureCounts/byCDS/fcounts.CDS.tsv'
    threads: 16
    shell:
        r'''featureCounts {FEATURECOUNTS_S} -a {params.annotation} -o {output}.temp -t CDS -g gene_id -Q 4 -T {threads} {input} && cat {output}.temp | sed -E 's/\.[0-9]+//' > {output} && rm {output}.temp
        '''

# Differential-expression analysis on the featureCounts matrix via
# do_DE_analysis_featureCounts.R; produces the full and significant tables.
# NOTE(review): no rule in this file produces the fcounts.noversion.tsv input.
rule run_deseq_featureCounts:
    input:
        "mapped/featureCounts/fcounts.noversion.tsv"
    output:
        "mapped/DE_analysis/featureCounts/" + GENOME_BUILD + ".featureCounts.DESeq2.all.tsv",
        "mapped/DE_analysis/featureCounts/" + GENOME_BUILD + ".featureCounts.DESeq2.sig.tsv",
    params:
        gene_annotations = GENE_NAMES,
        outprefix = "mapped/DE_analysis/featureCounts/" + GENOME_BUILD + ".featureCounts",
    shell:
        r"""
        Rscript {SRC_DIR}/do_DE_analysis_featureCounts.R --counts={input} \
            --gene_annotations={params.gene_annotations} \
            --design_file={DESIGN_FILE} \
            --outprefix={params.outprefix}

        """

# Collect insert-size metrics for one merged BAM with Picard.
# The histogram PDF is written next to the metrics file as
# <output>.insertsize.pdf (not declared as a rule output).
rule run_picardmetrics:
    input:
        "mapped/merged_bams/{sample}.bam"
    output:
        "mapped/bam_metrics/{sample}.metrics"
    shell:
        r"""
        picard CollectInsertSizeMetrics I={input} H={output}.insertsize.pdf O={output}

        """

# Convert the Picard metrics file of one sample into an insert-size TSV using
# the project script collect_picard_metrics.py.
rule create_insertsize_tsv:
    input:
        "mapped/bam_metrics/{sample}.metrics"
    output:
        "mapped/bam_metrics/{sample}.insertsizes.tsv"
    shell:
        r"""
        python {SRC_DIR}/collect_picard_metrics.py {input} {output}

        """

# Convert the per-sample HTSeq counts to a master TPM table via counts_to_tpm.R.
# Count files, insert-size files and sample names are all expanded from the
# same sorted, de-duplicated sample list, so the three comma-separated
# arguments have the same length and the same order. (SAMPLE_NAMES itself holds
# one entry per sample/lane pair, i.e. repeated sample names.)
rule counts_to_tpm:
    input:
        count = expand('mapped/counts_strict/{sample}.counts.noversion.tsv', sample=sorted(set(SAMPLE_NAMES))),
        insert_size = expand('mapped/bam_metrics/{sample}.insertsizes.tsv', sample=sorted(set(SAMPLE_NAMES))),
    output: 'mapped/tpm/HTSeq/masterTPM.tsv'
    params:
        gene_lengths=GENE_LENGTHS,
        name=expand('{sample}', sample=sorted(set(SAMPLE_NAMES))),
        outprefix='mapped/tpm/HTSeq',
        gene_map=GENE_NAMES
    run:
        # The R script takes each list as one comma-separated argument.
        counts_input = ','.join(input.count)
        sizes_input = ','.join(input.insert_size)
        names = ','.join(params.name)
        shell('Rscript {SRC_DIR}/counts_to_tpm.R --counts={counts_input} --insert_sizes={sizes_input} --gene_lengths={params.gene_lengths} --inprefix={names} --gene_map={params.gene_map} --outprefix={params.outprefix}')


# Convert the featureCounts matrix to a master TPM table via
# featurecounts_to_tpm.R. Insert-size files and sample names are expanded from
# the same sorted sample list, so both comma-separated arguments line up and
# keep the same order on every run.
rule featurecounts_to_tpm:
    input:
        insert_size = expand('mapped/bam_metrics/{sample}.insertsizes.tsv', sample=sorted(set(SAMPLE_NAMES))),
        fcounts = 'mapped/featureCounts/fcounts.noversion.tsv'
    output: 'mapped/tpm/featureCounts/masterTPM.tsv'
    params:
        gene_lengths=GENE_LENGTHS,
        name=expand('{sample}', sample=sorted(set(SAMPLE_NAMES))),
        outprefix='mapped/tpm/featureCounts',
        gene_map=GENE_NAMES
    run:
        # The R script takes each list as one comma-separated argument.
        sizes_input = ','.join(input.insert_size)
        names = ','.join(params.name)
        shell('Rscript {SRC_DIR}/featurecounts_to_tpm.R --counts={input.fcounts} --insert_sizes={sizes_input} --gene_lengths={params.gene_lengths} --inprefix={names} --gene_map={params.gene_map} --outprefix={params.outprefix}')


# Draw TPM scatter plots from the HTSeq master TPM table with
# plot_tpm_scatter.py; plots are written under the given prefix.
# NOTE(review): the declared output is a directory path - confirm the Snakemake
# version in use accepts this.
rule plot_tpm:
    input:
        "mapped/tpm/HTSeq/masterTPM.tsv"
    output:
        "mapped/plots/tpm_scatter/"
    params:
        prefix = "mapped/plots/tpm_scatter/",
    shell:
        r"""
        export LC_ALL=en_US.UTF-8 && python {SRC_DIR}/plot_tpm_scatter.py --master {input} --outprefix  {params.prefix}
        """

# Post-alignment QC of one merged BAM with 'qualimap rnaseq'; the report is
# written into a per-sample directory.
rule perform_qualimap_qc:
    input:
        "mapped/merged_bams/{sample}.bam",
    output:
        "mapped/post_mapping_qualimap/{sample}/qualimapReport.html",
    params:
        outdir = "mapped/post_mapping_qualimap/{sample}",
        gtf = GTF,
    shell:
        r"""
        qualimap rnaseq -bam {input} -gtf {params.gtf} --outdir {params.outdir} --java-mem-size=8G

        """

# Run read_duplication.py (clipseq2 environment) on one merged BAM; the
# DupRate plot script under the 'output' prefix is the tracked result.
rule get_duplication_estimate:
    input:
        "mapped/merged_bams/{sample}.bam"
    output:
        "mapped/post_mapping_deduplication/{sample}/output.DupRate_plot.r"
    params:
        outprefix = "mapped/post_mapping_deduplication/{sample}/output",
    shell:
        r"""
        source activate clipseq2 && read_duplication.py -i {input} -o {params.outprefix}
        """

# Aggregate all QC results into one MultiQC report. The inputs exist only to
# make sure the individual QC steps have finished; multiqc itself scans the
# working directory ('.').
rule run_multiqc:
    input:
        expand("qc/{sample_name}_{lane}_R1_001_fastqc.html", zip, sample_name=SAMPLE_NAMES, lane=LANE_NAMES),
        expand("qc/{sample_name}_{lane}_R2_001_fastqc.html", zip, sample_name=SAMPLE_NAMES, lane=LANE_NAMES),
        expand("mapped/bam_metrics/{sample}.metrics", sample=SAMPLE_NAMES),
        #expand('mapped/post_mapping_deduplication/{sample}/output.DupRate_plot.r', sample=SAMPLE_NAMES),
        expand("mapped/post_mapping_qualimap/{sample}/qualimapReport.html", sample=SAMPLE_NAMES),
        expand("mapped/merged_bams/{sample}.bam", sample=SAMPLE_NAMES),
        #'mapped/DE_analysis/HTSeq/'+GENOME_BUILD+'.HTSeq.DESeq2.sig.tsv',
        #'mapped/DE_analysis/featureCounts/'+GENOME_BUILD+'.featureCounts.DESeq2.sig.tsv',
    output:
        "multiqc_report/multiqc_report.html"
    shell:
        "export LC_ALL=en_US.UTF-8 && multiqc -f --outdir multiqc_report ."

# Extract the alignments of one merged BAM that overlap the regions listed in
# INTERGENIC_BED. The input BAM is indexed first.
# 'samtools view' is now given the input BAM explicitly; it was missing from
# the command line before, so no reads from the sample were ever read.
rule extract_intergenic_reads:
    input:
        bam='mapped/merged_bams/{sample}.bam'
    output: 'mapped/intergenic_reads/{sample}.intergenic.bam'
    shell:
        r'''
        samtools index {input.bam} && samtools view -b -h -L {INTERGENIC_BED} {input.bam} > {output}
        '''

# Flatten the GTF annotation into the GFF that the DEXSeq counting step reads,
# using DEXSeq's dexseq_prepare_annotation.py from a fixed installation path.
rule prepare_annotation_for_dexseq:
    input:
        GTF
    output:
        "mapped/dexseq/gencode.v25.annotation.DEXSeq.gff"
    shell:
        r"""source activate python2 && python /panfs/cmb-panasas2/skchoudh/software_frozen/anaconda27/envs/python2/lib/R/library/DEXSeq/python_scripts/dexseq_prepare_annotation.py {input} {output}
        """

# Count reads for one sample with DEXSeq's dexseq_count.py on the name-sorted
# BAM, against the prepared DEXSeq GFF. The command passes '-f bam -s no
# -p yes -r name' to the script.
rule run_dexseq:
    input:
        bam = "mapped/merged_bams/{sample}.sortedByName.bam",
        gff = "mapped/dexseq/gencode.v25.annotation.DEXSeq.gff",
    output:
        "mapped/dexseq/{sample}.dexseq.counts.tsv"
    shell:
        r"""source activate python2 && python /panfs/cmb-panasas2/skchoudh/software_frozen/anaconda27/envs/python2/lib/R/library/DEXSeq/python_scripts/dexseq_count.py -f bam -s no -p yes -r name {input.gff} {input.bam} {output}
        """

# Alternative DEXSeq counting step: runs the dexseq_count.py copy under
# SRC_DIR in the clipseq2 environment, with the -s value taken from
# HTSEQ_STRANDED and the annotation from DEXSeq_GFF.
rule count_dexseq:
    input:
        "mapped/merged_bams/{sample}.sortedByName.bam"
    output:
        "mapped/counts_DEXSeq/{sample}.DEXSeq.counts.tsv"
    shell:
        r"""
        source activate clipseq2 && python {SRC_DIR}/external/DEXSeq/dexseq_count.py -p yes -s {HTSEQ_STRANDED} -r name -f bam {DEXSeq_GFF} {input} {output}
        """

# Run FPKM_count.py (clipseq2 environment) on one merged BAM with GENE_BED as
# the reference; results are written under the per-sample prefix.
rule calc_fpkm:
    input:
        "mapped/merged_bams/{sample}.bam"
    output:
        "mapped/fpkm/{sample}.FPKM.xls"
    params:
        prefix = "mapped/fpkm/{sample}",
    shell:
        r"""
        source activate clipseq2 && FPKM_count.py -i {input} -o {params.prefix} -r {GENE_BED}
        """

# Re-align the reads STARlong left unmapped (one sample/lane) against the EBV
# STAR index. STAR writes an unsorted BAM, which is coordinate-sorted with
# samtools, moved to its final location and indexed; STAR logs are collected
# in {params.starlogs}.
# The index is declared as an input (as the STARlong mapping rule does for
# STAR_INDEX), so Snakemake knows this job must wait for the index-building
# rule instead of relying on the order of targets in 'rule all'.
rule map_ebv:
    input:
        R1 = 'unmapped_fastq/{sample_name}/{sample_name}_{lane}_R1.fq',
        R2 = 'unmapped_fastq/{sample_name}/{sample_name}_{lane}_R2.fq',
        index = EBV_STAR_INDEX
    output:
        'mapped_ebv/bams/{sample_name}/{sample_name}_{lane}.bam',
    params:
        # File-name prefix STAR uses for everything it writes.
        prefix = 'mapped_ebv/{sample_name}_{lane}',
        starlogs = 'mapped_ebv/starlogs'
    threads: 16
    shell:
        r'''
        STAR --runThreadN {threads}\
             --genomeDir {input.index}\
             --outFilterMismatchNmax 2\
             --outFileNamePrefix {params.prefix} --readFilesIn {input.R1} {input.R2}\
             --outSAMtype BAM Unsorted\
             && samtools sort -o {params.prefix}Aligned.sortedByCoord.out.bam -T /tmp/{wildcards.sample_name}_{wildcards.lane} {params.prefix}Aligned.out.bam && rm {params.prefix}Aligned.out.bam\
             && mv {params.prefix}Aligned.sortedByCoord.out.bam {output} && mkdir -p {params.starlogs} && mv {params.prefix}Log.final.out {params.prefix}Log.out {params.prefix}Log.progress.out {params.starlogs}\
             && samtools index {output}
        '''


def _merge_bams_ebv_inputs(wildcards):
    """Return the per-lane EBV BAMs that belong to the requested sample.

    Only lanes recorded for this sample in SAMPLE_LANE are requested, so a
    sample sequenced on fewer lanes than another one no longer depends on
    BAM files that can never exist.  For a name that is not a known sample
    the original behaviour (every lane seen in the data set) is kept, so such
    a request still fails on missing input exactly as before.
    """
    lanes = sorted({lane for name, lane in SAMPLE_LANE
                    if name == wildcards.sample_name})
    if not lanes:
        lanes = sorted(set(LANE_NAMES))
    return ['mapped_ebv/bams/{0}/{0}_{1}.bam'.format(wildcards.sample_name, lane)
            for lane in lanes]


# Merge all per-lane EBV BAMs of one sample into a single BAM with bamtools.
rule merge_bams_ebv:
    input: _merge_bams_ebv_inputs
    output: 'mapped_ebv/merged_bams/{sample_name}.bam'
    run:
        # bamtools expects one '-in' flag per input file.
        cmd = ' -in '.join(input)
        shell(r'''bamtools merge -in {cmd} -out {output}''')

# Count EBV-mapped reads per gene over 'exon' features with htseq-count.
# The merged BAM is passed with --order=pos and EBV_GTF is the annotation.
rule count_exon_ebv:
    input:
        "mapped_ebv/merged_bams/{sample_name}.bam"
    output:
        "mapped_ebv/HTSeq-counts/byExon/{sample_name}.exon.counts.tsv"
    params:
        annotation = EBV_GTF,
        # Passed to htseq-count as --minaqual.
        phred_cutoff = 5,
    shell:
        r"""
        source activate python2 && htseq-count --order=pos --format=bam --mode=intersection-strict --stranded={HTSEQ_STRANDED} --minaqual={params.phred_cutoff} --type=exon --idattr=gene_id {input} {params.annotation} > {output}
        """

# Count EBV-mapped reads per gene over 'CDS' features with htseq-count.
# The merged BAM is passed with --order=pos and EBV_GTF is the annotation.
rule count_byCDS_ebv:
    input:
        "mapped_ebv/merged_bams/{sample_name}.bam"
    output:
        "mapped_ebv/HTSeq-counts/byCDS/{sample_name}.CDS.counts.tsv"
    params:
        annotation = EBV_GTF,
        # Passed to htseq-count as --minaqual.
        phred_cutoff = 5,
    shell:
        r"""
        source activate python2 && htseq-count --order=pos --format=bam --mode=intersection-strict --stranded={HTSEQ_STRANDED} --minaqual={params.phred_cutoff} --type=CDS --idattr=gene_id {input} {params.annotation} > {output}
        """

# Count EBV-mapped reads over 'exon' features for all samples at once with
# featureCounts. The samples are passed in sorted order so the column order of
# the resulting matrix is the same on every run (iterating a bare set of
# strings is not).
rule featurecounts_ebv:
    input: expand('mapped_ebv/merged_bams/{sample}.bam', sample=sorted(set(SAMPLE_NAMES)))
    params:
        annotation=EBV_GTF
    output: 'mapped_ebv/featureCounts/byExon/fcounts.exon.tsv'
    threads: 16
    shell:
        r'''featureCounts {FEATURECOUNTS_S} -a {params.annotation} -o {output} -t exon -g gene_id -Q 4 -T {threads} {input}
        '''

# Count EBV-mapped reads over 'CDS' features for all samples at once with
# featureCounts. The samples are passed in sorted order so the column order of
# the resulting matrix is the same on every run (iterating a bare set of
# strings is not).
rule featurecounts_cds_ebv:
    input: expand('mapped_ebv/merged_bams/{sample}.bam', sample=sorted(set(SAMPLE_NAMES)))
    params:
        annotation=EBV_GTF
    output: 'mapped_ebv/featureCounts/byCDS/fcounts.CDS.tsv'
    threads: 16
    shell:
        r'''featureCounts {FEATURECOUNTS_S} -a {params.annotation} -o {output} -t CDS -g gene_id -Q 4 -T {threads} {input}
        '''

# Concatenate the per-lane FASTQ files of one sample into a single .fastq.gz.
# The gzip files are joined with 'cat': concatenated gzip members form a valid
# gzip file, whereas the previous 'zcat' wrote uncompressed text into a file
# named *.fastq.gz, which the downstream 'zcat' could not read.
# Lanes are sorted so the concatenation order is the same on every run.
rule merge_fastqs:
    input: expand(os.path.join(RAWDATA_DIR, '') + '{{sample_name}}_{lane}.fastq.gz', lane=sorted(set(LANE_NAMES)))
    output: 'preprocessed/merged_fastq/{sample_name}.fastq.gz'
    shell:
        r'''cat {input} > {output}'''

# Write the number of reads in a merged FASTQ to a text file: the decompressed
# line count (read by 'wc -l' from the pipe) divided by 4.
rule count_fastq_reads:
    input:
        "preprocessed/merged_fastq/{sample_name}.fastq.gz"
    output:
        "pickled_data/raw_fastq_reads/{sample_name}.txt"
    shell:
        r"""zcat {input} | echo $((`wc -l`/4)) > {output}"""