diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 0a26f0ed..cdf23457 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -61,11 +61,13 @@ jobs: NXF_VER: '20.07.1' NXF_ANSI_LOG: false strategy: + fail-fast: false matrix: profile_flags: - "test --sketch_scaled false --sketch_scaled_log2 2" - "test --sketch_scaled false --sketch_num_hashes 20" - "test --sketch_scaled false --sketch_num_hashes_log2 20" + - "test_bam" - "test_bam --barcodes_file false --rename_10x_barcodes false --save_fastas false --write_barcodes_meta_csv false" - "test_bam --rename_10x_barcodes false --write_barcodes_meta_csv false" - "test_bam --skip_sig_merge" @@ -73,6 +75,9 @@ jobs: - "test_bam --barcodes_file false --rename_10x_barcodes false" - "test_bam --rename_10x_barcodes false" - "test_fastas" + - "test_constitutive_from_download_refseq" + - "test_constitutive_from_fasta" + - "test_constitutive_from_sig" - "test_protein_fastas" - "test_remove_ribo" - "test_sig_merge" diff --git a/Dockerfile b/Dockerfile index 46764af7..a0cb55b8 100755 --- a/Dockerfile +++ b/Dockerfile @@ -4,7 +4,8 @@ LABEL authors="Olga Botvinnik" \ # Install the conda environment COPY environment.yml / -RUN conda env create --quiet -f /environment.yml && conda clean -a +RUN conda install -c conda-forge mamba +RUN mamba env create -f /environment.yml && mamba clean -a # Add conda installation dir to PATH (instead of doing 'conda activate') ENV PATH /opt/conda/envs/nf-core-kmermaid-0.1.0dev/bin:$PATH @@ -12,6 +13,16 @@ ENV PATH /opt/conda/envs/nf-core-kmermaid-0.1.0dev/bin:$PATH # Dump the details of the installed packages to a file for posterity RUN conda env export --name nf-core-kmermaid-0.1.0dev > nf-core-kmermaid-0.1.0dev.yml +# Install super fast rust code to remove nuisance hashes (e.g. ribosomal) from signatures +RUN git clone -b olgabot/mut-warning https://github.com/olgabot/2021-01-27-olga-remove-protein.git +# Soft link all conda C-related libraries to their non-prefixed name +# for rust to be able to build the C libraries +RUN for f in $(ls /opt/conda/envs/nf-core-kmermaid-0.1.0dev/bin/x86_64-conda_cos6-linux-gnu*); \ + do g=$(echo $f | sed 's:x86_64-conda_cos6-linux-gnu-::') ; echo $g; ln -s $f $g ; done +RUN cd 2021-01-27-olga-remove-protein && cargo build --release +# Add "subtract" command to path +ENV PATH $HOME/2021-01-27-olga-remove-protein/target/release:$PATH + # Instruct R processes to use these empty files instead of clashing with a local version RUN touch .Rprofile RUN touch .Renviron diff --git a/bin/filter_fasta_regex.py b/bin/filter_fasta_regex.py new file mode 100755 index 00000000..ebded98a --- /dev/null +++ b/bin/filter_fasta_regex.py @@ -0,0 +1,44 @@ +#!/usr/bin/env python + +import argparse +import re + + +import screed + + +def write_records_to_fasta(records, fasta): + with open(fasta, "w") as f: + for record in records: + f.write(f'>{record["name"]}\n{record["sequence"]}\n') + + +def filter_records(fasta, pattern): + filtered_records = [] + with screed.open(fasta) as records: + for record in records: + name = record["name"] + if re.findall(pattern, name, flags=re.I): + filtered_records.append(record) + return filtered_records + + +def filter_fasta_with_regex(fasta_to_filter, out_fasta, regex): + record_subset = filter_records(fasta_to_filter, regex) + write_records_to_fasta(record_subset, out_fasta) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser( + description="""Extract sequences whose names match a pattern""" + ) + parser.add_argument("--input-fasta", type=str, help="Sequence file to filter") + parser.add_argument("--output-fasta", type=str, help="File to write") + parser.add_argument( + "--regex-pattern", + type=str, + help="Regular expression pattern to match for the names of seuqences in the file", + ) + args = parser.parse_args() + + filter_fasta_with_regex(args.input_fasta, args.output_fasta, args.regex_pattern) diff --git a/bin/scrape_software_versions.py b/bin/scrape_software_versions.py index 551861e1..80f39c09 100755 --- a/bin/scrape_software_versions.py +++ b/bin/scrape_software_versions.py @@ -14,8 +14,10 @@ "SKA": ["v_ska.txt", r"SKA Version: (\S+)"], "htslib": ["v_samtools.txt", r"htslib (\S+)"], "Sourmash": ["v_sourmash.txt", r"sourmash (\S+)"], - "SortMeRNA": ["v_sortmerna.txt", r"SortMeRNA version (\S+),"], + "Rsync": ["v_rsync.txt", r"rsync version (\S+)"], + "Rsync (Protocol)": ["v_rsync.txt", r"protocol version (\S+)"], "orpheum": ["v_orpheum.txt", r"Version: (\S+)"], + "Python": ["v_python.txt", r"Python (\S+)"], } results = OrderedDict() results["nf-core/kmermaid"] = 'N/A' @@ -25,11 +27,13 @@ results["bam2fasta"] = 'N/A' results["fastp"] = 'N/A' results["htslib"] = 'N/A' +results["orpheum"] = 'N/A' +results["Python"] = 'N/A' +results["Rsync"] = 'N/A' +results["Rsync (Protocol)"] = 'N/A' results["Samtools"] = 'N/A' results["SKA"] = 'N/A' results["Sourmash"] = 'N/A' -results["SortMeRNA"] = 'N/A' -results["orpheum"] = 'N/A' # Search each file using its regex for k, v in regexes.items(): diff --git a/bin/validate_sketch_value.py b/bin/validate_sketch_value.py index d96cff7f..497d92bd 100755 --- a/bin/validate_sketch_value.py +++ b/bin/validate_sketch_value.py @@ -20,7 +20,7 @@ def get_sketch_value(value, value_log2): if "," in value: logger.exception( f"Can only provide a single number to --sketch_num_hashes or" - f" --sketch_scaled. Provided '{value}" + f" --sketch_scaled. Provided '{value}'" ) sketch_value = int(value) else: diff --git a/conf/base.config b/conf/base.config index 07a2aa3b..01e0ffa3 100644 --- a/conf/base.config +++ b/conf/base.config @@ -54,6 +54,7 @@ process { withName: 'multiqc|get_software_versions' { memory = { check_max( 2.GB * task.attempt, 'memory' ) } + errorStrategy = "ignore" cache = false } withName: 'sourmash_compute_sketch_fastx_nucleotide|sourmash_compute_sketch_fastx_peptide' { diff --git a/conf/test.config b/conf/test.config index 4d4dced7..cf19689a 100644 --- a/conf/test.config +++ b/conf/test.config @@ -17,7 +17,6 @@ params { // Input data // samples = 'testing/samples.csv' // fastas = 'testing/fastas/*.fasta' - sketch_scaled = 2 molecules = 'dna,protein,dayhoff' // read_pairs = 'testing/fastqs/*{1,2}.fastq.gz' // sra = "SRP016501" @@ -29,4 +28,8 @@ params { ['SRR4238351', ['https://github.com/nf-core/test-datasets/raw/rnaseq/testdata/SRR4238351_subsamp.fastq.gz']], ['SRR4238355', ['https://github.com/nf-core/test-datasets/raw/rnaseq/testdata/SRR4238355_subsamp.fastq.gz']], ] + // Remove constitutively expressed genes + test_mini_refseq_download = true + constitutive_protein_sig = "https://github.com/czbiohub/test-datasets/raw/olgabot/kmermaid--housekeeping-fasta/reference/vertebrate_mammalian--205--2021-03-15.protein.fa__only_constitutive_genes.fa__molecule-protein,dayhoff__ksize-21,30,51__scaled-10__track_abundance-true.sig" + constitutive_rna_sig = "https://github.com/czbiohub/test-datasets/raw/olgabot/kmermaid--housekeeping-fasta/reference/vertebrate_mammalian--205--2021-03-15.rna.fa__only_constitutive_genes.fa__molecule-dna__ksize-21,30,51__scaled-10__track_abundance-true.sig" } diff --git a/conf/test_bam.config b/conf/test_bam.config index 8bcdb775..07579d50 100644 --- a/conf/test_bam.config +++ b/conf/test_bam.config @@ -19,7 +19,6 @@ params { 'https://github.com/nf-core/test-datasets/raw/olgabot/kmermaid--bam-unique-names/testdata/mouse_lung.bam', 'https://github.com/nf-core/test-datasets/raw/olgabot/kmermaid--bam-unique-names/testdata/mouse_brown_fat_ptprc_plus_unaligned.bam'] // Sketch Parameters - sketch_scaled = 2 molecules = 'dna,protein,dayhoff' read_pairs = false save_fastas = "fastas" @@ -28,4 +27,6 @@ params { // For bam, each fasta record represents each barcode and each should have a signature // they should not be merged, For computation on bam file using sourmash, please set true for the below flag tenx_min_umi_per_cell = 2 + constitutive_protein_sig = "https://github.com/czbiohub/test-datasets/raw/olgabot/kmermaid--housekeeping-fasta/reference/vertebrate_mammalian--205--2021-03-15.protein.fa__only_constitutive_genes.fa__molecule-protein,dayhoff__ksize-21,30,51__scaled-10__track_abundance-true.sig" + constitutive_rna_sig = "https://github.com/czbiohub/test-datasets/raw/olgabot/kmermaid--housekeeping-fasta/reference/vertebrate_mammalian--205--2021-03-15.rna.fa__only_constitutive_genes.fa__molecule-dna__ksize-21,30,51__scaled-10__track_abundance-true.sig" } diff --git a/conf/test_constitutive_from_download_refseq.config b/conf/test_constitutive_from_download_refseq.config new file mode 100644 index 00000000..886a8424 --- /dev/null +++ b/conf/test_constitutive_from_download_refseq.config @@ -0,0 +1,32 @@ +/* + * ------------------------------------------------- + * Nextflow config file for running tests + * ------------------------------------------------- + * Defines bundled input files and everything required + * to run a fast and simple test. Use as follows: + * nextflow run nf-core/kmermaid -profile test + */ + +params { + config_profile_name = 'Test profile' + config_profile_description = 'Minimal test dataset to check pipeline function' + // Limit resources so that this can run on Travis + max_cpus = 2 + max_memory = 6.GB + max_time = 48.h + + // Input data + input_paths = [ + ['SRR4050379', ['https://github.com/nf-core/test-datasets/raw/kmermaid/testdata/SRR4050379_pass_1.fastq.gz', + 'https://github.com/nf-core/test-datasets/raw/kmermaid/testdata/SRR4050379_pass_2.fastq.gz']], + ['SRR4050380', ['https://github.com/nf-core/test-datasets/raw/kmermaid/testdata/SRR4050380_pass_1.fastq.gz', + 'https://github.com/nf-core/test-datasets/raw/kmermaid/testdata/SRR4050380_pass_2.fastq.gz']], + ['SRR4238351', ['https://github.com/nf-core/test-datasets/raw/rnaseq/testdata/SRR4238351_subsamp.fastq.gz']], + ['SRR4238355', ['https://github.com/nf-core/test-datasets/raw/rnaseq/testdata/SRR4238355_subsamp.fastq.gz']], + ] + + // "Other" is the smallest refseq taxonomy subdirectory: ftp://ftp.ncbi.nlm.nih.gov/refseq/release/other/ + // Protein fasta is 453 B + refseq_taxonomy = 'vertebrate_mammalian' + test_mini_refseq_download = true +} diff --git a/conf/test_constitutive_from_fasta.config b/conf/test_constitutive_from_fasta.config new file mode 100644 index 00000000..ea757073 --- /dev/null +++ b/conf/test_constitutive_from_fasta.config @@ -0,0 +1,32 @@ +/* + * ------------------------------------------------- + * Nextflow config file for running tests + * ------------------------------------------------- + * Defines bundled input files and everything required + * to run a fast and simple test. Use as follows: + * nextflow run nf-core/kmermaid -profile test + */ + +params { + config_profile_name = 'Test profile' + config_profile_description = 'Minimal test dataset to check pipeline function' + // Limit resources so that this can run on Travis + max_cpus = 2 + max_memory = 6.GB + max_time = 48.h + + // Input data + input_paths = [ + ['SRR4050379', ['https://github.com/nf-core/test-datasets/raw/kmermaid/testdata/SRR4050379_pass_1.fastq.gz', + 'https://github.com/nf-core/test-datasets/raw/kmermaid/testdata/SRR4050379_pass_2.fastq.gz']], + ['SRR4050380', ['https://github.com/nf-core/test-datasets/raw/kmermaid/testdata/SRR4050380_pass_1.fastq.gz', + 'https://github.com/nf-core/test-datasets/raw/kmermaid/testdata/SRR4050380_pass_2.fastq.gz']], + ['SRR4238351', ['https://github.com/nf-core/test-datasets/raw/rnaseq/testdata/SRR4238351_subsamp.fastq.gz']], + ['SRR4238355', ['https://github.com/nf-core/test-datasets/raw/rnaseq/testdata/SRR4238355_subsamp.fastq.gz']], + ] + constitutive_protein_fasta = "https://github.com/czbiohub/test-datasets/raw/olgabot/kmermaid--housekeeping-fasta/reference/vertebrate_mammalian--205--2021-03-15.protein.fa__only_constitutive_genes.fa.gz" + constitutive_rna_fasta = "https://github.com/czbiohub/test-datasets/raw/olgabot/kmermaid--housekeeping-fasta/reference/vertebrate_mammalian--205--2021-03-15.rna.fa__only_constitutive_genes.fa.gz" + + translate_proteome_fasta = 'https://github.com/nf-core/test-datasets/raw/kmermaid/reference/gencode.v32.pc_translations.subsample5.randomseed0.fa' + bloomfilter_tablesize = '1e6' +} diff --git a/conf/test_constitutive_from_sig.config b/conf/test_constitutive_from_sig.config new file mode 100644 index 00000000..0e2bad4d --- /dev/null +++ b/conf/test_constitutive_from_sig.config @@ -0,0 +1,29 @@ +/* + * ------------------------------------------------- + * Nextflow config file for running tests + * ------------------------------------------------- + * Defines bundled input files and everything required + * to run a fast and simple test. Use as follows: + * nextflow run nf-core/kmermaid -profile test + */ + +params { + config_profile_name = 'Test profile' + config_profile_description = 'Minimal test dataset to check pipeline function' + // Limit resources so that this can run on Travis + max_cpus = 2 + max_memory = 6.GB + max_time = 48.h + + // Input data + input_paths = [ + ['SRR4050379', ['https://github.com/nf-core/test-datasets/raw/kmermaid/testdata/SRR4050379_pass_1.fastq.gz', + 'https://github.com/nf-core/test-datasets/raw/kmermaid/testdata/SRR4050379_pass_2.fastq.gz']], + ['SRR4050380', ['https://github.com/nf-core/test-datasets/raw/kmermaid/testdata/SRR4050380_pass_1.fastq.gz', + 'https://github.com/nf-core/test-datasets/raw/kmermaid/testdata/SRR4050380_pass_2.fastq.gz']], + ['SRR4238351', ['https://github.com/nf-core/test-datasets/raw/rnaseq/testdata/SRR4238351_subsamp.fastq.gz']], + ['SRR4238355', ['https://github.com/nf-core/test-datasets/raw/rnaseq/testdata/SRR4238355_subsamp.fastq.gz']], + ] + constitutive_protein_sig = "https://github.com/czbiohub/test-datasets/raw/olgabot/kmermaid--housekeeping-fasta/reference/vertebrate_mammalian--205--2021-03-15.protein.fa__only_constitutive_genes.fa__molecule-protein,dayhoff__ksize-21,30,51__scaled-10__track_abundance-true.sig" + constitutive_rna_sig = "https://github.com/czbiohub/test-datasets/raw/olgabot/kmermaid--housekeeping-fasta/reference/vertebrate_mammalian--205--2021-03-15.rna.fa__only_constitutive_genes.fa__molecule-dna__ksize-21,30,51__scaled-10__track_abundance-true.sig" +} diff --git a/conf/test_fastas.config b/conf/test_fastas.config index a6509d4e..b439e03c 100644 --- a/conf/test_fastas.config +++ b/conf/test_fastas.config @@ -17,7 +17,6 @@ params { // Input data // samples = 'testing/samples.csv' // fastas = 'testing/fastas/*.fasta' - sketch_scaled = 2 molecules = 'dna,protein,dayhoff' // read_pairs = 'testing/fastqs/*{1,2}.fastq.gz' // sra = "SRP016501" @@ -26,4 +25,6 @@ params { ['SRR4050380', ['https://github.com/nf-core/test-datasets/raw/olgabot/kmermaid--bam-unique-names/testdata/SRR4050380_pass_concatenated.fasta']], ] + constitutive_protein_sig = "https://github.com/czbiohub/test-datasets/raw/olgabot/kmermaid--housekeeping-fasta/reference/vertebrate_mammalian--205--2021-03-15.protein.fa__only_constitutive_genes.fa__molecule-protein,dayhoff__ksize-21,30,51__scaled-10__track_abundance-true.sig" + constitutive_rna_sig = "https://github.com/czbiohub/test-datasets/raw/olgabot/kmermaid--housekeeping-fasta/reference/vertebrate_mammalian--205--2021-03-15.rna.fa__only_constitutive_genes.fa__molecule-dna__ksize-21,30,51__scaled-10__track_abundance-true.sig" } diff --git a/conf/test_full.config b/conf/test_full.config index 5dfaeafb..ac6e6677 100644 --- a/conf/test_full.config +++ b/conf/test_full.config @@ -12,10 +12,12 @@ params { config_profile_description = 'Full test dataset to check pipeline function' // Input data for full size test - sketch_scaled = 2 molecules = 'dna,protein,dayhoff' input_paths = [ ['GM12878', ['ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR319/007/SRR3192657/SRR3192657_1.fastq.gz,ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR319/007/SRR3192657/SRR3192657_2.fastq.gz','ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR319/008/SRR3192658/SRR3192658_1.fastq.gz,ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR319/008/SRR3192658/SRR3192658_2.fastq.gz']], ['K562', ['ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR319/008/SRR3192408/SRR3192408_1.fastq.gz,ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR319/008/SRR3192408/SRR3192408_2.fastq.gz', 'ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR319/009/SRR3192409/SRR3192409_1.fastq.gz,ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR319/009/SRR3192409/SRR3192409_2.fastq.gz']] ] + constitutive_protein_sig = "https://github.com/czbiohub/test-datasets/raw/olgabot/kmermaid--housekeeping-fasta/reference/vertebrate_mammalian--205--2021-03-15.protein.fa__only_constitutive_genes.fa__molecule-protein,dayhoff__ksize-21,30,51__scaled-10__track_abundance-true.sig" + constitutive_rna_sig = "https://github.com/czbiohub/test-datasets/raw/olgabot/kmermaid--housekeeping-fasta/reference/vertebrate_mammalian--205--2021-03-15.rna.fa__only_constitutive_genes.fa__molecule-dna__ksize-21,30,51__scaled-10__track_abundance-true.sig" + } diff --git a/conf/test_protein_fastas.config b/conf/test_protein_fastas.config index ea22bcb6..91a2325d 100644 --- a/conf/test_protein_fastas.config +++ b/conf/test_protein_fastas.config @@ -26,8 +26,8 @@ params { ['https://github.com/czbiohub/test-datasets/raw/predictorthologs/testdata/bonobo_liver_ptprc__molecule-dayhoff__coding_reads_peptides.fasta']]] // Sketch Parameters - sketch_scaled = 2 molecules = 'protein,dayhoff,hp' read_pairs = false - + constitutive_protein_sig = "https://github.com/czbiohub/test-datasets/raw/olgabot/kmermaid--housekeeping-fasta/reference/vertebrate_mammalian--205--2021-03-15.protein.fa__only_constitutive_genes.fa__molecule-protein,dayhoff__ksize-21,30,51__scaled-10__track_abundance-true.sig" + constitutive_rna_sig = "https://github.com/czbiohub/test-datasets/raw/olgabot/kmermaid--housekeeping-fasta/reference/vertebrate_mammalian--205--2021-03-15.rna.fa__only_constitutive_genes.fa__molecule-dna__ksize-21,30,51__scaled-10__track_abundance-true.sig" } diff --git a/conf/test_remove_ribo.config b/conf/test_remove_ribo.config index 8aa689ac..72c2710b 100644 --- a/conf/test_remove_ribo.config +++ b/conf/test_remove_ribo.config @@ -17,7 +17,6 @@ params { // Input data // samples = 'testing/samples.csv' // fastas = 'testing/fastas/*.fasta' - sketch_scaled = 2 molecules = 'dna,protein,dayhoff' // read_pairs = 'testing/fastqs/*{1,2}.fastq.gz' // sra = "SRP016501" @@ -31,4 +30,6 @@ params { ['SRR4238351', ['https://github.com/nf-core/test-datasets/raw/rnaseq/testdata/SRR4238351_subsamp.fastq.gz']], ['SRR4238355', ['https://github.com/nf-core/test-datasets/raw/rnaseq/testdata/SRR4238355_subsamp.fastq.gz']], ] + constitutive_protein_sig = "https://github.com/czbiohub/test-datasets/raw/olgabot/kmermaid--housekeeping-fasta/reference/vertebrate_mammalian--205--2021-03-15.protein.fa__only_constitutive_genes.fa__molecule-protein,dayhoff__ksize-21,30,51__scaled-10__track_abundance-true.sig" + constitutive_rna_sig = "https://github.com/czbiohub/test-datasets/raw/olgabot/kmermaid--housekeeping-fasta/reference/vertebrate_mammalian--205--2021-03-15.rna.fa__only_constitutive_genes.fa__molecule-dna__ksize-21,30,51__scaled-10__track_abundance-true.sig" } diff --git a/conf/test_sig_merge.config b/conf/test_sig_merge.config index 21a27939..ad821450 100644 --- a/conf/test_sig_merge.config +++ b/conf/test_sig_merge.config @@ -18,7 +18,6 @@ params { bam = ['https://github.com/nf-core/test-datasets/raw/olgabot/kmermaid--bam-unique-names/testdata/mouse_lung.bam', 'https://github.com/nf-core/test-datasets/raw/olgabot/kmermaid--bam-unique-names/testdata/mouse_brown_fat_ptprc_plus_unaligned.bam'] // Sketch Parameters - sketch_scaled = 2 molecules = 'dna,protein,dayhoff' read_pairs = false save_fastas = "fastas" @@ -29,4 +28,6 @@ params { reference_proteome_fasta = 'https://github.com/nf-core/test-datasets/raw/kmermaid/reference/ptprc_bam_translations.fa' bloomfilter_tablesize = '1e6' + constitutive_protein_sig = "https://github.com/czbiohub/test-datasets/raw/olgabot/kmermaid--housekeeping-fasta/reference/vertebrate_mammalian--205--2021-03-15.protein.fa__only_constitutive_genes.fa__molecule-protein,dayhoff__ksize-21,30,51__scaled-10__track_abundance-true.sig" + constitutive_rna_sig = "https://github.com/czbiohub/test-datasets/raw/olgabot/kmermaid--housekeeping-fasta/reference/vertebrate_mammalian--205--2021-03-15.rna.fa__only_constitutive_genes.fa__molecule-dna__ksize-21,30,51__scaled-10__track_abundance-true.sig" } diff --git a/conf/test_tenx_tgz.config b/conf/test_tenx_tgz.config index 39b9b2f0..10ae33ab 100644 --- a/conf/test_tenx_tgz.config +++ b/conf/test_tenx_tgz.config @@ -20,14 +20,13 @@ params { 'https://github.com/nf-core/test-datasets/raw/olgabot/kmermaid-unaligned-tgz-v3/testdata/mouse_brown_fat_ptprc_plus_unaligned.tgz' ] // Sketch Parameters - sketch_scaled = 2 molecules = 'dna,protein,dayhoff' read_pairs = false save_fastas = "fastas" save_intermediate_files = "/tmp/" write_barcode_meta_csv = "metadata.csv" // For bam, each fasta record represents each barcode and each should have a signature - // they should not be merged, For computation on bam file using sourmash, please set true for the below flag tenx_min_umi_per_cell = 10 - shard_size = 350 + constitutive_protein_sig = "https://github.com/czbiohub/test-datasets/raw/olgabot/kmermaid--housekeeping-fasta/reference/vertebrate_mammalian--205--2021-03-15.protein.fa__only_constitutive_genes.fa__molecule-protein,dayhoff__ksize-21,30,51__scaled-10__track_abundance-true.sig" + constitutive_rna_sig = "https://github.com/czbiohub/test-datasets/raw/olgabot/kmermaid--housekeeping-fasta/reference/vertebrate_mammalian--205--2021-03-15.rna.fa__only_constitutive_genes.fa__molecule-dna__ksize-21,30,51__scaled-10__track_abundance-true.sig" } diff --git a/conf/test_translate.config b/conf/test_translate.config index c6e488a5..c4a7bccd 100644 --- a/conf/test_translate.config +++ b/conf/test_translate.config @@ -17,12 +17,15 @@ params { // Input data fastas = "https://github.com/nf-core/test-datasets/raw/kmermaid/reference/gencode.v32.pc_transcripts.subsample5.fa" // Sketch Parameters - sketch_scaled = 2 molecules = 'dna,protein,dayhoff' read_pairs = false - reference_proteome_fasta = 'https://github.com/nf-core/test-datasets/raw/kmermaid/reference/gencode.v32.pc_translations.subsample5.randomseed0.fa' + translate_proteome_fasta = 'https://github.com/nf-core/test-datasets/raw/kmermaid/reference/gencode.v32.pc_translations.subsample5.randomseed0.fa' bloomfilter_tablesize = '1e8' translate_peptide_ksize = '11' translate_peptide_molecule = 'dayhoff' + + // Remove constitutively expressed genes + constitutive_protein_sig = "https://github.com/czbiohub/test-datasets/raw/olgabot/kmermaid--housekeeping-fasta/reference/vertebrate_mammalian--205--2021-03-15.protein.fa__only_constitutive_genes.fa__molecule-protein,dayhoff__ksize-21,30,51__scaled-10__track_abundance-true.sig" + constitutive_rna_sig = "https://github.com/czbiohub/test-datasets/raw/olgabot/kmermaid--housekeeping-fasta/reference/vertebrate_mammalian--205--2021-03-15.rna.fa__only_constitutive_genes.fa__molecule-dna__ksize-21,30,51__scaled-10__track_abundance-true.sig" } diff --git a/conf/test_translate_bam.config b/conf/test_translate_bam.config index 15365382..4f8a5487 100644 --- a/conf/test_translate_bam.config +++ b/conf/test_translate_bam.config @@ -18,17 +18,18 @@ params { bam = ['https://github.com/nf-core/test-datasets/raw/olgabot/kmermaid--bam-unique-names/testdata/mouse_lung.bam', 'https://github.com/nf-core/test-datasets/raw/olgabot/kmermaid--bam-unique-names/testdata/mouse_brown_fat_ptprc_plus_unaligned.bam'] // Sketch Parameters - sketch_scaled = 2 molecules = 'dna,protein,dayhoff' read_pairs = false save_fastas = "fastas" write_barcode_meta_csv = "metadata.csv" // For bam, each fasta record represents each barcode and each should have a signature // they should not be merged, For computation on bam file using sourmash, please set true for the below flag - tenx_min_umi_per_cell = 5 + tenx_min_umi_per_cell = 2 - reference_proteome_fasta = 'https://github.com/nf-core/test-datasets/raw/kmermaid/reference/ptprc_bam_translations.fa' + translate_proteome_fasta = 'https://github.com/nf-core/test-datasets/raw/kmermaid/reference/ptprc_bam_translations.fa' bloomfilter_tablesize = '1e6' translate_peptide_ksize = '11' translate_peptide_molecule = 'dayhoff' + constitutive_protein_sig = "https://github.com/czbiohub/test-datasets/raw/olgabot/kmermaid--housekeeping-fasta/reference/vertebrate_mammalian--205--2021-03-15.protein.fa__only_constitutive_genes.fa__molecule-protein,dayhoff__ksize-21,30,51__scaled-10__track_abundance-true.sig" + constitutive_rna_sig = "https://github.com/czbiohub/test-datasets/raw/olgabot/kmermaid--housekeeping-fasta/reference/vertebrate_mammalian--205--2021-03-15.rna.fa__only_constitutive_genes.fa__molecule-dna__ksize-21,30,51__scaled-10__track_abundance-true.sig" } diff --git a/environment.yml b/environment.yml index f056cbf3..093d9644 100644 --- a/environment.yml +++ b/environment.yml @@ -7,6 +7,7 @@ channels: - defaults - anaconda dependencies: + - conda-forge::cmake=3.19.6 - conda-forge::python=3.7.3 - conda-forge::markdown=3.1.1 - conda-forge::pymdown-extensions=6.0 @@ -14,6 +15,7 @@ dependencies: - conda-forge::tqdm=4.43.0 - conda-forge::gxx_linux-64=7.3.0 - conda-forge::s3fs=0.4.2 + - conda-forge::rust=1.48.0 - bioconda::sourmash=3.5.0 - bioconda::samtools=1.10 - bioconda::screed=1.0.4 @@ -33,8 +35,8 @@ dependencies: - ska=1.0 - sphinx=2.3.1 - jupyter=1.0.0 - - sortmerna=2.1b # for metatranscriptomics - ripgrep=12.1.1 + - rsync=3.2.3 - pip: - bam2fasta==1.0.8 - orpheum==1.0.4 \ No newline at end of file diff --git a/environment_osx.yml b/environment_osx.yml index ad794f08..07d462c2 100644 --- a/environment_osx.yml +++ b/environment_osx.yml @@ -33,8 +33,9 @@ dependencies: - ska=1.0 - sphinx=2.3.1 - jupyter=1.0.0 - - sortmerna=2.1b # for metatranscriptomics - ripgrep=12.1.1 + - conda-forge::rust=1.48.0 + - rsync=3.2.3 - pip: - bam2fasta==1.0.8 - - orpheum==1.0.4 \ No newline at end of file + - orpheum==1.0.4 diff --git a/main.nf b/main.nf index 94756edf..0d4890a9 100644 --- a/main.nf +++ b/main.nf @@ -120,7 +120,7 @@ def helpMessage() { to new name, e.g. with channel or cell annotation label Translate RNA-seq reads into protein-coding sequences options: - --reference_proteome_fasta Path to a well-curated fasta file of protein sequences. Used to filter for coding reads + --translate_proteome_fasta Path to a well-curated fasta file of protein sequences. Used to filter for coding reads --translate_peptide_ksize K-mer size to use for translating RNA into protein. Default: 9, which is good for 'protein'. If using dayhoff, suggest 15 --translate_peptide_molecule Which molecular encoding to use for translating. Default: "protein" @@ -324,10 +324,10 @@ if (params.protein_fastas){ ch_protein_fastas = Channel.empty() } -if (params.reference_proteome_fasta) { -Channel.fromPath(params.reference_proteome_fasta, checkIfExists: true) - .ifEmpty { exit 1, "Reference proteome file not found: ${params.reference_proteome_fasta}" } - .set{ ch_reference_proteome_fasta } +if (params.translate_proteome_fasta) { +Channel.fromPath(params.translate_proteome_fasta, checkIfExists: true) + .ifEmpty { exit 1, "Reference proteome file not found: ${params.translate_proteome_fasta}" } + .set{ ch_translate_proteome_fasta } } //////////////////////////////////////////////////// @@ -388,12 +388,12 @@ if (!protein_input) { if (params.subsample && params.skip_trimming ) { subsample_reads_ch_unchecked .ifEmpty{ exit 1, "No reads provided! Check read input files" } - .set { subsample_ch_reads_for_ribosomal_removal } + .set { subsample_ch_reads_to_translate } } if (params.skip_trimming && !(params.bam || params.tenx_tgz)) { reads_ch_unchecked .ifEmpty{ exit 1, "No reads provided! Check read input files" } - .set { ch_reads_for_ribosomal_removal } + .set { ch_reads_to_translate } ch_read_files_trimming_to_check_size = Channel.empty() } else if (params.bam || params.tenx_tgz) { ch_non_bam_reads_unchecked @@ -409,11 +409,11 @@ if (!protein_input) { // Since there exists protein input, don't check if these are empty if (params.subsample) { subsample_reads_ch_unchecked - .set { subsample_ch_reads_for_ribosomal_removal } + .set { subsample_ch_reads_to_translate } } if (params.skip_trimming) { reads_ch_unchecked - .set { ch_reads_for_ribosomal_removal } + .set { ch_reads_to_translate } ch_read_files_trimming_to_check_size = Channel.empty() } else if (!have_nucleotide_fasta_input) { ch_read_files_trimming_unchecked @@ -432,15 +432,6 @@ if (params.split_kmer){ params.ksizes = '21,27,33,51' } -// Get rRNA databases -// Default is set to bundled DB list in `assets/rrna-db-defaults.txt` - -rRNA_database = file(params.rrna_database_manifest) -if (rRNA_database.isEmpty()) {exit 1, "File ${rRNA_database.getName()} is empty!"} -Channel - .from( rRNA_database.readLines() ) - .map { row -> file(row) } - .set { sortmerna_fasta } // --- Parse Translate parameters --- save_translate_csv = params.save_translate_csv @@ -450,18 +441,35 @@ save_translate_json = params.save_translate_json // --- Parse the Sourmash parameters ---- ksizes = params.ksizes?.toString().tokenize(',') Channel.from(params.ksizes?.toString().tokenize(',')) - .into { ch_ksizes_for_compare_peptide; ch_ksizes_for_compare_nucleotide } + .into { ch_ksizes_for_nucleotide; ch_ksizes_for_peptide; ch_ksizes_for_compare_peptide; ch_ksizes_for_compare_nucleotide } molecules = params.molecules?.toString().tokenize(',') +nucleotide_molecules = molecules.findAll { it == "dna" } peptide_molecules = molecules.findAll { it != "dna" } +// have_protein_input = params.translate_proteome_fasta || params.protein_fastas || protein_input +// peptide_molecules = peptide_molecules_comma_separated = peptide_molecules.join(",") peptide_molecule_flags = peptide_molecules.collect { it -> "--${it}" }.join ( " " ) Channel.from( molecules ) .set { ch_molecules } +Channel.from( nucleotide_molecules ) + .into { ch_nucleotide_molecules; ch_nucleotide_molecules_for_subtract; ch_nucleotide_molecules_for_compare } + Channel.from( peptide_molecules ) - .into { ch_peptide_molecules; ch_peptide_molecules_for_compare } + .into { ch_peptide_molecules; ch_peptide_molecules_for_subtract; ch_peptide_molecules_for_compare } + + +ch_peptide_molecules + .combine( ch_ksizes_for_peptide ) + .set { ch_sourmash_params_peptide } + +ch_nucleotide_molecules + .combine( ch_ksizes_for_nucleotide ) + .mix ( ch_sourmash_params_peptide ) + .dump ( tag: 'ch_sourmash_params' ) + .into { ch_sourmash_params_for_compare ; ch_sourmash_params_for_subtract } // Parse sketch value and style parameters sketch_num_hashes = params.sketch_num_hashes @@ -531,6 +539,75 @@ else { barcode_metadata_folder = "barcode_metadata" } + +////////////////////////////////////////////////////////// +/* -- Parse constitutive K-mer removal parameters -- */ +///////////////////////////////////////////////////////// +constitutive_protein_fasta = params.constitutive_protein_fasta +constitutive_rna_fasta = params.constitutive_rna_fasta + +constitutive_protein_sig = params.constitutive_protein_sig +constitutive_rna_sig = params.constitutive_rna_sig + +have_constitutive_fastas = constitutive_protein_fasta && constitutive_rna_fasta +have_constitutive_sigs = constitutive_protein_sig && constitutive_rna_sig +need_refseq_download = (!have_constitutive_fastas) && (!have_constitutive_sigs) + +if (have_constitutive_fastas) { + Channel.from( + ["protein", file(constitutive_protein_fasta)], + ["rna", file(constitutive_rna_fasta)]) + .into { ch_constitutive_fasta; ch_refseq_moltype_to_fasta } + + ch_refseq_moltype_to_fasta + // Check if protein molecules were even specified + .filter{ + it[0] == "protein" ? peptide_molecules.size() > 0 : nucleotide_molecules.size() > 0 + } + // Take only the first item, the molecule type + .map{ it[0] } + .set{ ch_refseq_moltypes_to_download } +} else { + // Don't look at the fastas, only check the parsed molecule types + Channel.from(['protein', 'rna']) + .filter{ + it == "protein" ? peptide_molecules.size() > 0 : nucleotide_molecules.size() > 0 + } + .set{ ch_refseq_moltypes_to_download } +} + +if (have_constitutive_sigs) { + // Use sourmash moltypes of "protein,dayhoff" instead of the original protein + // as used for the fastas as that's what matches the sourmash outputs + Channel.from( + ["protein,dayhoff", file(constitutive_protein_sig)], + ["dna", file(constitutive_rna_sig)]) + .set { ch_constitutive_sig } + + // Refseq molecule types are "protein" and "rna" + Channel.from( + ["protein", file(constitutive_protein_sig)], + ["rna", file(constitutive_rna_sig)]) + .set { ch_refseq_moltype_to_sig } + + ch_refseq_moltype_to_sig + // Check if protein molecules were even specified + .filter{ + it[0]== "protein" ? peptide_molecules.size() > 0 : nucleotide_molecules.size() > 0 + } + // Take only the first item, the molecule type + .map{ it[0] } + .set{ ch_refseq_moltypes_to_download } +} + + +// Parse refseq taxonomy group to download +constitutive_refseq_taxonomy = params.constitutive_refseq_taxonomy +///////////////////////////////////////////////////////////// +/* -- END: Parse constitutive K-mer removal parameters -- */ +///////////////////////////////////////////////////////////// + + // Has the run name been specified by the user? // this has the bonus effect of catching both -name and --name custom_runName = params.name @@ -588,16 +665,22 @@ if (params.sketch_num_hashes_log2) summary['Sketch Sizes (log2)'] = params. if (params.sketch_scaled) summary['Sketch scaled'] = params.sketch_scaled if (params.sketch_scaled_log2) summary['Sketch scaled (log2)'] = params.sketch_scaled_log2 // 10x parameters -if(params.tenx_tgz) summary["10x .tgz"] = params.tenx_tgz -if(params.tenx_tgz) summary["10x SAM tags"] = params.tenx_tags -if(params.tenx_tgz) summary["10x Cell pattern"] = params.tenx_cell_barcode_pattern -if(params.tenx_tgz) summary["10x UMI pattern"] = params.tenx_molecular_barcode_pattern -if(params.tenx_tgz) summary['Min UMI/cell'] = params.tenx_min_umi_per_cell -// Extract coding parameters -if(params.reference_proteome_fasta) summary["Peptide fasta"] = params.reference_proteome_fasta -if(params.reference_proteome_fasta) summary['Peptide ksize'] = params.translate_peptide_ksize -if(params.reference_proteome_fasta) summary['Peptide molecule'] = params.translate_peptide_molecule -if(params.reference_proteome_fasta) summary['Bloom filter table size'] = params.bloomfilter_tablesize +if(params.tenx_tgz || params.bam) summary["10x .tgz"] = params.tenx_tgz +if(params.tenx_tgz || params.bam) summary["10x SAM tags"] = params.tenx_tags +if(params.tenx_tgz || params.bam) summary["10x Cell pattern"] = params.tenx_cell_barcode_pattern +if(params.tenx_tgz || params.bam) summary["10x UMI pattern"] = params.tenx_molecular_barcode_pattern +if(params.tenx_tgz || params.bam) summary['Min UMI/cell'] = params.tenx_min_umi_per_cell +// Orpheum Translate parameters +if(params.translate_proteome_fasta) summary["Orpheum Translate Peptide fasta"] = params.translate_proteome_fasta +if(params.translate_proteome_fasta) summary['Orpheum Translate Peptide ksize'] = params.translate_peptide_ksize +if(params.translate_proteome_fasta) summary['Orpheum Translate Peptide molecule'] = params.translate_peptide_molecule +if(params.translate_proteome_fasta) summary['Oprheum Translate Bloom filter table size'] = params.bloomfilter_tablesize +// constitutive k-mer removal paramters +if(params.constitutive_protein_fasta) summary["Constitutive Peptide fasta"] = params.constitutive_protein_fasta +if(params.constitutive_rna_fasta) summary["Constitutive RNA fasta"] = params.constitutive_rna_fasta +if(params.constitutive_protein_sig) summary["Constitutive Peptide K-mer Signature"] = params.constitutive_protein_sig +if(params.constitutive_rna_sig) summary["Constitutive RNA K-mer Signature"] = params.constitutive_rna_sig +if(need_refseq_download) summary["Constitutive GBenes' Refseq Taxonomy"] = params.constitutive_refseq_taxonomy // Resource information summary['Max Resources'] = "$params.max_memory memory, $params.max_cpus cpus, $params.max_time time per job" if(workflow.containerEngine) summary['Container'] = "$workflow.containerEngine - $workflow.container" @@ -669,10 +752,11 @@ process get_software_versions { bam2fasta info &> v_bam2fasta.txt fastp --version &> v_fastp.txt samtools --version &> v_samtools.txt + rsync --version &> v_rsync.txt ska version &> v_ska.txt - sortmerna --version &> v_sortmerna.txt sourmash -v &> v_sourmash.txt pip show orpheum &> v_orpheum.txt + python --version &> v_python.txt scrape_software_versions.py &> software_versions_mqc.yaml """ } @@ -743,7 +827,7 @@ if ( !params.split_kmer && have_sketch_value ) { -if (params.reference_proteome_fasta){ +if (params.translate_proteome_fasta){ process make_protein_index { tag "${peptides}__${bloom_id}" label "low_memory" @@ -751,7 +835,7 @@ if (params.reference_proteome_fasta){ publishDir "${params.outdir}/protein_index", mode: params.publish_dir_mode input: - file(peptides) from ch_reference_proteome_fasta + file(peptides) from ch_translate_proteome_fasta translate_peptide_ksize translate_peptide_molecule @@ -853,8 +937,8 @@ if (params.tenx_tgz || params.bam) { // Put fastqs from aligned and unaligned reads into a single channel tenx_reads_aligned_concatenation_ch .mix( tenx_reads_unaligned_ch ) - .dump(tag: "tenx_ch_reads_for_ribosomal_removal") - .set{ tenx_ch_reads_for_ribosomal_removal } + .dump(tag: "tenx_ch_reads_to_translate") + .set{ tenx_ch_reads_to_translate } if ((params.tenx_min_umi_per_cell > 0) || !params.barcodes_file) { process count_umis_per_cell { @@ -900,14 +984,14 @@ if (params.tenx_tgz || params.bam) { good_barcodes_ch = tenx_bam_barcodes_ch } - tenx_ch_reads_for_ribosomal_removal + tenx_ch_reads_to_translate .combine( good_barcodes_ch, by: 0 ) - .dump( tag: 'tenx_ch_reads_for_ribosomal_removal__combine__good_barcodes_ch' ) + .dump( tag: 'tenx_ch_reads_to_translate__combine__good_barcodes_ch' ) .map{ it -> [it[0], it[1], it[2], it[3].splitText()] } .transpose() - .dump( tag: 'tenx_ch_reads_for_ribosomal_removal__combine__good_barcodes_ch__transpose' ) + .dump( tag: 'tenx_ch_reads_to_translate__combine__good_barcodes_ch__transpose' ) .map{ it -> [it[0], it[1], it[2], it[3].replaceAll("\\s+", "") ] } - .dump( tag: 'tenx_ch_reads_for_ribosomal_removal__combine__good_barcodes_ch__transpose__no_newlines' ) + .dump( tag: 'tenx_ch_reads_to_translate__combine__good_barcodes_ch__transpose__no_newlines' ) .set{ tenx_reads_with_good_barcodes_ch } process extract_per_cell_fastqs { @@ -949,8 +1033,8 @@ if (params.tenx_tgz || params.bam) { // // Filtering out fastq.gz files less than 200 bytes (arbitary number) // // ~200 bytes is about the size of a file with a single read or less // // We can't use .size() > 0 because it's fastq.gz is gzipped content - // per_channel_cell_ch_reads_for_ribosomal_removal - // .dump(tag: 'per_channel_cell_ch_reads_for_ribosomal_removal') + // per_channel_cell_ch_reads_to_translate + // .dump(tag: 'per_channel_cell_ch_reads_to_translate') // .flatten() // .filter{ it -> it.size() > 200 } // each item is just a single file, no need to do it[1] // .map{ it -> tuple(it.simpleName, file(it)) } @@ -960,7 +1044,7 @@ if (params.tenx_tgz || params.bam) { if (params.skip_trimming) { ch_non_bam_reads .concat(per_cell_fastqs_ch) - .set { ch_reads_for_ribosomal_removal } + .set { ch_reads_to_translate } } else { ch_non_bam_reads .mix ( per_cell_fastqs_ch ) @@ -1053,13 +1137,13 @@ if ( have_nucleotide_input ) { ch_reads_trimmed .concat( fastas_ch ) .dump ( tag: 'trimmed_reads__concat_fastas' ) - .set { subsample_ch_reads_for_ribosomal_removal } + .set { subsample_ch_reads_to_translate } } else { // Concatenate trimmed reads with fastas for signature generation - ch_reads_for_ribosomal_removal = ch_reads_trimmed.mix(fastas_ch) + ch_reads_to_translate = ch_reads_trimmed.concat(fastas_ch) } } else { - ch_reads_for_ribosomal_removal = fastas_ch + ch_reads_to_translate = fastas_ch ch_fastp_results = Channel.from(false) } @@ -1069,10 +1153,10 @@ if (params.subsample) { publishDir "${params.outdir}/seqtk/", mode: params.publish_dir_mode input: - set val(id), file(reads) from subsample_ch_reads_for_ribosomal_removal + set val(id), file(reads) from subsample_ch_reads_to_translate output: - set val(id), file("*_${params.subsample}.fastq.gz") into ch_reads_for_ribosomal_removal + set val(id), file("*_${params.subsample}.fastq.gz") into ch_reads_to_translate script: read1 = reads[0] @@ -1087,101 +1171,8 @@ if (params.subsample) { } } -/* - * STEP 2+ - SortMeRNA - remove rRNA sequences on request - */ -if (!params.remove_ribo_rna) { - ch_reads_for_ribosomal_removal - .set { ch_reads_to_translate } - sortmerna_logs = Channel.empty() -} else { - process sortmerna_index { - label 'mid_memory_long' - label 'mid_cpu' - tag "${fasta.baseName}" - - input: - file(fasta) from sortmerna_fasta - - output: - val("${fasta.baseName}") into sortmerna_db_name - file("$fasta") into sortmerna_db_fasta - file("${fasta.baseName}*") into sortmerna_db - - script: - """ - indexdb_rna --ref $fasta,${fasta.baseName} -m 3072 -v - """ - } - - process sortmerna { - label 'mid_memory_long' - label 'mid_cpu' - tag "$name" - publishDir "${params.outdir}/SortMeRNA", mode: "${params.publish_dir_mode}", - saveAs: {filename -> - if (filename.indexOf("_rRNA_report.txt") > 0) "logs/$filename" - else if (params.save_non_rrna_reads) "reads/$filename" - else null - } - - input: - set val(name), file(reads) from ch_reads_for_ribosomal_removal - val(db_name) from sortmerna_db_name.collect() - file(db_fasta) from sortmerna_db_fasta.collect() - file(db) from sortmerna_db.collect() - - output: - set val(name), file("*.fq.gz") into ch_reads_to_translate - file "*_rRNA_report.txt" into sortmerna_logs - - script: - //concatenate reference files: ${db_fasta},${db_name}:${db_fasta},${db_name}:... - def Refs = '' - for (i=0; i single end - if (reads[1] == null) { - """ - gzip -d --force < ${reads} > all-reads.fastq - sortmerna --ref ${Refs} \ - --reads all-reads.fastq \ - --num_alignments 1 \ - -a ${task.cpus} \ - --fastx \ - --aligned rRNA-reads \ - --other non-rRNA-reads \ - --log -v - gzip --force < non-rRNA-reads.fastq > ${name}.fq.gz - mv rRNA-reads.log ${name}_rRNA_report.txt - """ - } else { - """ - gzip -d --force < ${reads[0]} > reads-fw.fq - gzip -d --force < ${reads[1]} > reads-rv.fq - merge-paired-reads.sh reads-fw.fq reads-rv.fq all-reads.fastq - sortmerna --ref ${Refs} \ - --reads all-reads.fastq \ - --num_alignments 1 \ - -a ${task.cpus} \ - --fastx --paired_in \ - --aligned rRNA-reads \ - --other non-rRNA-reads \ - --log -v - unmerge-paired-reads.sh non-rRNA-reads.fastq non-rRNA-reads-fw.fq non-rRNA-reads-rv.fq - gzip < non-rRNA-reads-fw.fq > ${name}-fw.fq.gz - gzip < non-rRNA-reads-rv.fq > ${name}-rv.fq.gz - mv rRNA-reads.log ${name}_rRNA_report.txt - """ - } - } - } - - - - if (params.reference_proteome_fasta){ + if (params.translate_proteome_fasta){ process translate { tag "${sample_id}" label "low_memory_long" @@ -1375,7 +1366,7 @@ if (!have_nucleotide_input) { } -if (!params.skip_compute && (protein_input || params.reference_proteome_fasta)){ +if (!params.skip_compute && (protein_input || params.translate_proteome_fasta)){ process sourmash_compute_sketch_fastx_peptide { tag "${sig_id}" @@ -1524,7 +1515,6 @@ if ((params.bam || params.tenx_tgz) && !params.skip_compute && !params.skip_sig_ } - ch_sourmash_sketches_merged_to_view .dump( tag: "ch_sourmash_sketches_to_view" ) @@ -1545,6 +1535,274 @@ if ((params.bam || params.tenx_tgz) && !params.skip_compute && !params.skip_sig_ ch_sourmash_sig_describe_merged = Channel.empty() } + +if (!params.skip_remove_constitutive_genes) { + /////////////////////////////////////////////////////////////////////////////// + /////////////////////////////////////////////////////////////////////////////// + /* -- -- */ + /* -- REMOVE K-MERS FROM constitutive GENES -- */ + /* -- -- */ + /////////////////////////////////////////////////////////////////////////////// + /////////////////////////////////////////////////////////////////////////////// + + + + /////////////////////////////////////////////////////////////////////////////// + /////////////////////////////////////////////////////////////////////////////// + /* -- -- */ + /* -- DOWNLOAD NUCLEOTIDE AND PROTEIN SEQS FROM REFSEQ -- */ + /* -- -- */ + /////////////////////////////////////////////////////////////////////////////// + /////////////////////////////////////////////////////////////////////////////// + /* + * STEP 6 - rsync to download refeseq + */ + if (need_refseq_download){ + // No fastas provided for removing constitutive genes + process download_refseq { + tag "${constitutive_refseq_taxonomy}--${refseq_moltype}" + label "process_low" + publishDir "${params.outdir}/reference/ncbi_refseq/", mode: 'copy' + + input: + val refseq_moltype from ch_refseq_moltypes_to_download + + output: + set val(refseq_moltype), file("${constitutive_refseq_taxonomy}--*.${refseq_moltype}.fa.gz") into ch_refseq_fasta_to_filter + + script: + output_fasta = "${constitutive_refseq_taxonomy}--\$RELEASE_NUMBER--\$DATE.${refseq_moltype}.fa.gz" + include_fasta = params.test_mini_refseq_download ? "${constitutive_refseq_taxonomy}.1.${refseq_moltype}.f*a.gz" : "*${refseq_moltype}.f*a.gz" + """ + rsync \\ + --prune-empty-dirs \\ + --archive \\ + --verbose \\ + --recursive \\ + --include '${include_fasta}' \\ + --exclude '/*' \\ + rsync://ftp.ncbi.nlm.nih.gov/refseq/release/${constitutive_refseq_taxonomy}/ . + wget https://ftp.ncbi.nlm.nih.gov/refseq/release/RELEASE_NUMBER + DATE=\$(date +'%Y-%m-%d') + RELEASE_NUMBER=\$(cat RELEASE_NUMBER) + zcat ${constitutive_refseq_taxonomy}.*.${refseq_moltype}*.gz | gzip -c - > ${output_fasta} + """ + } + + /////////////////////////////////////////////////////////////////////////////// + /////////////////////////////////////////////////////////////////////////////// + /* -- -- */ + /* -- REMOVE K-MERS FROM constitutive GENES -- */ + /* -- -- */ + /////////////////////////////////////////////////////////////////////////////// + /////////////////////////////////////////////////////////////////////////////// + /* + * STEP 7 - Get only constitutive genes from + */ + // Keep genes whose names match constitutive gene regular expression pattern + process extract_fasta_constitutive { + tag "${fasta.baseName}" + label "process_low" + publishDir "${params.outdir}/reference/constitutive_genes/", mode: 'copy' + + input: + set val(refseq_moltype), file(fasta) from ch_refseq_fasta_to_filter + + output: + set val(refseq_moltype), file(output_fasta_gz) into ch_constitutive_fasta, ch_constitutive_fasta_to_view + + script: + output_fasta = "${fasta.baseName}__only_constitutive_genes.fa" + output_fasta_gz = "${fasta.baseName}__only_constitutive_genes.fa.gz" + """ + filter_fasta_regex.py \\ + --input-fasta ${fasta} \\ + --output-fasta ${output_fasta} \\ + --regex-pattern '${params.constitutive_gene_regex}' + gzip -c ${output_fasta} > ${output_fasta_gz} + """ + } + + ch_constitutive_fasta_to_view + .dump( tag: 'ch_constitutive_fasta' ) + } + + if (!have_constitutive_sigs) { + /////////////////////////////////////////////////////////////////////////////// + /////////////////////////////////////////////////////////////////////////////// + /* -- -- */ + /* -- COMPUTE constitutive GENE K-MER SIGNATURE -- */ + /* -- -- */ + /////////////////////////////////////////////////////////////////////////////// + /////////////////////////////////////////////////////////////////////////////// + /* + * STEP 8 - Compute constitutive Gene K-mer Signature + */ + // No fastas provided for removing constitutive genes + process compute_constitutive_kmer_sig { + tag "${fasta.baseName}" + label "process_low" + publishDir "${params.outdir}/reference/constitutive_genes/", mode: 'copy' + + input: + val track_abundance + val sketch_value_parsed + val sketch_style_parsed + set val(refseq_moltype), file(fasta) from ch_constitutive_fasta + + output: + set val(sourmash_moltypes), file(sig) into ch_constitutive_sig + + script: + is_protein = refseq_moltype == "protein" + sourmash_moltype = is_protein ? "protein,dayhoff" : 'dna' + sourmash_moltypes = tuple(sourmash_moltype.split(",")) + sketch_id = make_sketch_id( + sourmash_moltype, + params.ksizes, + sketch_value_parsed[0], + track_abundance, + sketch_style_parsed[0] + ) + + sketch_value_flag = make_sketch_value_flag( + sketch_style_parsed[0], + sketch_value_parsed[0] + ) + moltype_flags = is_protein ? '--protein --dayhoff --input-is-protein' : '--dna' + track_abundance_flag = track_abundance ? '--track-abundance' : '' + sig_id = "${fasta.baseName}__${sketch_id}" + sig = "${sig_id}.sig" + csv = "${sig_id}.csv" + """ + sourmash compute \\ + ${sketch_value_flag} \\ + --ksizes ${params.ksizes} \\ + ${moltype_flags} \\ + ${track_abundance_flag} \\ + --output ${sig} \\ + --name '${fasta.baseName}' \\ + ${fasta} + sourmash sig describe --csv ${csv} ${sig} + """ + } + } + + + ch_sourmash_sketches_merged + // index 2: moltypes + // index 4: signature + .map { tuple( tuple(it[2].split(",")), it[4] ) } + .transpose() + .dump( tag: 'ch_sourmash_sketches_moltype_to_sig' ) + .groupTuple( by: 0 ) + .dump( tag: 'ch_sourmash_sketches_moltype_to_sig__groupTuple' ) + .set { ch_sourmash_sketches_moltype_to_sigs } + + ch_constitutive_sig + .dump( tag: 'ch_constitutive_sig' ) + .transpose() + .dump( tag: 'ch_constitutive_sig__transposed' ) + .combine( ch_sourmash_params_for_subtract, by: 0) + .dump( tag: 'ch_constitutive_sig__transposed__combined' ) + .combine ( ch_sourmash_sketches_moltype_to_sigs, by: 0 ) + .dump( tag: 'ch_constitutive_sig__transposed__combined_joined' ) + .into { ch_subtract_params_with_sigs; ch_subtract_params_to_sigs_for_siglist } + + ch_subtract_params_to_sigs_for_siglist + .dump ( tag: 'ch_subtract_params_to_sigs_for_siglist' ) + .transpose() + .dump ( tag: 'ch_subtract_params_to_sigs_for_siglist__transpose') + .collectFile() { it -> + [ "${it[0]}__${it[2]}.txt", "${it[3].getFileName()}\n"] + } + .dump ( tag: 'ch_subtract_params_to_sigs_for_siglist__transpose__collectfile' ) + .map { [ tuple( it.baseName.split('__') ), it] } + .map { [ it[0][0], it[0][1], it[1] ] } + // .dump ( tag: 'ch_subtract_params_to_sigs_for_siglist__transpose__collectfile__map' ) + // .transpose() + .dump ( tag: 'ch_subtract_params_with_siglist' ) + .set { ch_subtract_params_with_siglist } + + ch_subtract_params_with_sigs + // Reorder so molecule (it[0]) and ksize (it[2]) are first + .map{ [it[0], it[2], it[1], it[3]] } + .dump ( tag: 'ch_subtract_params_with_sigs__map' ) + .combine( ch_subtract_params_with_siglist, by: [0, 1] ) + .dump( tag: 'ch_sigs_with_constitutive_sig_to_subtract' ) + .set { ch_sigs_with_constitutive_sig_to_subtract } + + + // /////////////////////////////////////////////////////////////////////////////// + // /////////////////////////////////////////////////////////////////////////////// + // /* -- -- */ + // /* -- REMOVE K-MERS FROM constitutive GENES -- */ + // /* -- -- */ + // /////////////////////////////////////////////////////////////////////////////// + // /////////////////////////////////////////////////////////////////////////////// + // /* + // * STEP 9 - Remove constitutive gene k-mers from single cells + // */ + process subtract_constitutive_kmers { + tag "${subtract_id}" + label "process_medium" + publishDir "${params.outdir}/sketches_subtract_constitutive_kmers/${subtract_id}", mode: 'copy' + + input: + val sketch_value_parsed + val sketch_style_parsed + set val(molecule), val(ksize), file(constitutive_sig), file(sigs), file(siglist) from ch_sigs_with_constitutive_sig_to_subtract + + output: + set val(molecule), val(ksize), file("subtracted/*.sig") into ch_sigs_constitutive_removed + + script: + subtract_id = "${molecule}__k-${ksize}" + sketch_value_flag = make_sketch_value_flag( + sketch_style_parsed[0], + sketch_value_parsed[0] + ) + track_abundance_flag = track_abundance ? '--track-abundance' : '' + + """ + subtract \\ + ${track_abundance_flag} \\ + ${sketch_value_flag} \\ + --ksize ${ksize} \\ + --encoding ${molecule} \\ + --output subtracted/ \\ + ${constitutive_sig} \\ + ${siglist} + """ + } + + ch_sigs_constitutive_removed + // .groupTuple( by: [0, 1] ) + .transpose( by: 2 ) + .set{ ch_sourmash_sketches_to_compare } + +} else { + ch_sourmash_sketches_merged + .map { [tuple(it[2].split(",")), it[4]] } + .dump(tag: 'ch_sourmash_sketches_merged__map_split' ) + .transpose() + .dump(tag: 'ch_sourmash_sketches_merged__map_split__tranpose' ) + // Perform cartesian product on the molecules with compare params + .combine( ch_sourmash_params_for_compare, by: 0) + .dump(tag: 'ch_sourmash_sketches_merged__map_split__combine' ) + // .groupTuple(by: [0, 2]) + .dump(tag: 'ch_sourmash_sketches_to_compare' ) + // Reorder so signature files are last + // moltype, ksize, signature file + .map { [it[0], it[2], it[1]] } + .set { ch_sourmash_sketches_to_compare } + + ch_sourmash_sig_describe_merged = Channel.empty() +} + + + + if (params.split_kmer){ process ska_compare_sketches { tag "${sketch_id}" @@ -1566,57 +1824,23 @@ if (params.split_kmer){ } // If skip_compute is true, skip compare must be specified as true as well if (!params.split_kmer && !params.skip_compare && !params.skip_compute) { - // // Combine peptide and nucleotide sketches - // sourmash_sketches_nucleotide - // .collect() - // // Set as a list so that combine does cartesian product of all signatures - // .map { it -> [it] } - // .combine( ch_ksizes_for_compare_nucleotide ) - // .dump( tag: 'sourmash_sketches_nucleotide__ksizes' ) - // .map { x -> [x[0], x[1], 'dna'] } - // .dump( tag: 'sourmash_sketches_nucleotide__ksizes__molecules' ) - // .set { sourmash_sketches_nucleotide_for_compare } - - // sourmash_sketches_peptide - // .collect() - // // Set as a list so that combine does cartesian product of all signatures - // .map { it -> [it] } - // .combine( ch_ksizes_for_compare_petide ) - // .dump( tag: 'sourmash_sketches_peptide__ksizes' ) - // .combine( ch_peptide_molecules ) - // .dump( tag: 'sourmash_sketches_peptide__ksizes__molecules' ) - // .set { sourmash_sketches_peptide_for_compare } - - // sourmash_sketches_peptide_for_compare - // .mix ( sourmash_sketches_nucleotide_for_compare ) - // .set { ch_sourmash_sketches_to_compare } - - // ch_sourmash_sketches_to_compare = Channel.empty() - - ch_peptide_molecules_for_compare - .combine( ch_ksizes_for_compare_peptide ) - .set { ch_sourmash_compare_params_peptide } - - Channel.from("dna") - .combine( ch_ksizes_for_compare_nucleotide ) - .mix ( ch_sourmash_compare_params_peptide ) - .set { ch_sourmash_compare_params_both } + ch_sourmash_compare_sketch_params_to_sketches = Channel.create() - ch_sourmash_sketches_merged - // Drop first index (index 0) which is the cell id - // Drop the second index (index 1) which is the sketch id - // Keep only moltype - // Drop ksize - .map { [tuple(it[2].split(",")), it[4]] } - .dump(tag: 'ch_sourmash_sketches_merged__map_split' ) - .transpose() - .dump(tag: 'ch_sourmash_sketches_merged__map_split__tranpose' ) - // Perform cartesian product on the molecules with compare params - .combine( ch_sourmash_compare_params_both, by: 0) - .dump(tag: 'ch_sourmash_sketches_merged__map_split__combine' ) - .groupTuple(by: [0, 2]) - .dump(tag: 'ch_sourmash_sketches_to_compare' ) - .set { ch_sourmash_sketches_to_compare } + ch_sourmash_sketches_to_compare + .tap ( ch_sourmash_compare_sketch_params_to_sketches ) + .dump( tag: 'ch_compare_params_to_sigs_for_siglist__transpose' ) + .collectFile() { it -> + [ "${it[0]}__${it[1]}.txt", "${it[2].getFileName()}\n"] + } + .dump ( tag: 'ch_compare_params_to_sigs_for_siglist__transpose__collectfile' ) + .map { [ tuple( it.baseName.split('__') ), it] } + .map { [ it[0][0], it[0][1], it[1] ] } + .dump ( tag: 'ch_compare_params_with_siglist' ) + .combine( ch_sourmash_compare_sketch_params_to_sketches, by: [0, 1] ) + .dump( tag: 'ch_compare_params_with_siglist__add_sketches' ) + .groupTuple( by: [0, 1, 2] ) + .dump ( tag: 'ch_compare_params_with_siglist__add_sketches__groupTuple' ) + .set { ch_sourmash_params_to_siglist_sketches } process sourmash_compare_sketches { // Combine peptide and nucleotide sketches @@ -1624,8 +1848,8 @@ if (!params.split_kmer && !params.skip_compare && !params.skip_compute) { publishDir "${params.outdir}/compare_sketches", mode: 'copy' input: - // Weird order but that's how it shakes out with the groupTuple - set val(molecule), file("*.sig"), val(ksize) from ch_sourmash_sketches_to_compare + // file(sigs) is necessary to stage all the signature files present in file(siglist) + set val(molecule), val(ksize), file(siglist), file(sigs) from ch_sourmash_params_to_siglist_sketches output: file(csv) @@ -1640,7 +1864,7 @@ if (!params.split_kmer && !params.skip_compare && !params.skip_compute) { --${molecule} \\ --csv ${csv} \\ ${processes} \\ - --traverse-directory . + --from-file ${siglist} # Use --traverse-directory instead of all the files explicitly to avoid # "too many arguments" error for bash when there are lots of samples """ @@ -1662,7 +1886,6 @@ if (!params.skip_multiqc){ file ("sourmash_describe_peptides/") from ch_sourmash_sig_describe_peptides.collect().ifEmpty([]) file ("sourmash_describe_nucleotides/") from ch_sourmash_sig_describe_nucleotides.collect().ifEmpty([]) file ('fastp/*') from ch_fastp_results.collect().ifEmpty([]) - file ('sortmerna/*') from sortmerna_logs.collect().ifEmpty([]) file ('software_versions/*') from ch_software_versions_yaml.collect() file workflow_summary from ch_workflow_summary.collectFile(name: "workflow_summary_mqc.yaml") diff --git a/nextflow.config b/nextflow.config index 589d186f..7bdc2902 100644 --- a/nextflow.config +++ b/nextflow.config @@ -16,7 +16,7 @@ params { fastas = false protein_fastas = false sra = false - + bam = false input = false // Parsing 10x bam files @@ -34,7 +34,7 @@ params { // Number of hashes from each sample sketch_num_hashes = false sketch_num_hashes_log2 = false - sketch_scaled = false + sketch_scaled = 10 sketch_scaled_log2 = false skip_sig_merge = false @@ -44,24 +44,30 @@ params { // Computing sketches skip_compute = false + // Skip trimming of adapters and poly-X sequences skip_trimming = false // translate options translate_peptide_ksize = 8 translate_peptide_molecule = 'protein' translate_jaccard_threshold = 0.05 - reference_proteome_fasta = false + translate_proteome_fasta = false bloomfilter_tablesize = '1e8' // Saving the translate results for each dataset makes it take extra long // Recommended for debugging purposes only save_translate_csv = false save_translate_json = false - - // Ribosomal RNA removal - remove_ribo_rna = false - save_non_rrna_reads = false - rrna_database_manifest = "${baseDir}/assets/rrna-db-defaults.txt" + // constitutive gene k-mer removal + skip_remove_constitutive_genes = false + constitutive_gene_regex = "ribosom|mito|ubiqui|ferritin|cytochrome|eukaryotic translation|heat shock|NADH|NADPH" + constitutive_refseq_taxonomy = 'vertebrate_mammalian' + // For testing purposes --> use a small refseq dataset + test_mini_refseq_download = false + constitutive_protein_fasta = false + constitutive_rna_fasta = false + constitutive_protein_sig = false + constitutive_rna_sig = false // ska options split_kmer = false @@ -71,7 +77,7 @@ params { save_fastas = "fastas" tenx_min_umi_per_cell = '0' write_barcode_meta_csv = false - bam = false + // 10x optional input parameters set using the below pattern // https://github.com/nextflow-io/patterns/blob/master/docs/optional-input.adoc @@ -150,6 +156,9 @@ profiles { test_ska { includeConfig 'conf/test_ska.config' } test_bam { includeConfig 'conf/test_bam.config' } test_fastas { includeConfig 'conf/test_fastas.config' } + test_constitutive_from_download_refseq { includeConfig 'conf/test_constitutive_from_download_refseq.config' } + test_constitutive_from_fasta { includeConfig 'conf/test_constitutive_from_fasta.config' } + test_constitutive_from_sig { includeConfig 'conf/test_constitutive_from_sig.config' } test_protein_fastas { includeConfig 'conf/test_protein_fastas.config' } test_remove_ribo { includeConfig 'conf/test_remove_ribo.config' } test_sig_merge { includeConfig 'conf/test_sig_merge.config' } diff --git a/scratch.nf b/scratch.nf new file mode 100644 index 00000000..617d336d --- /dev/null +++ b/scratch.nf @@ -0,0 +1,10 @@ +housekeeping_protein_fasta = false +housekeeping_rna_fasta = true + +ch_refseq_moltype_to_fasta = Channel.from(["protein", housekeeping_protein_fasta], ["rna", housekeeping_rna_fasta]) +ch_refseq_moltype_to_fasta + // filter if the second item, the fasta is false + .filter{ !it[1] } + // Take only the first item, the molecule type + .map{ it[0] } + .println() diff --git a/siglist.txt b/siglist.txt new file mode 100644 index 00000000..43fc713a --- /dev/null +++ b/siglist.txt @@ -0,0 +1,4 @@ +SRR4238351__molecule-dna__ksize-21,30,51__scaled-2__track_abundance-true.sig +SRR4238355__molecule-dna__ksize-21,30,51__scaled-2__track_abundance-true.sig +SRR4050380__molecule-dna__ksize-21,30,51__scaled-2__track_abundance-true.sig +SRR4050379__molecule-dna__ksize-21,30,51__scaled-2__track_abundance-true.sig