diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 0a26f0ed..cdf23457 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -61,11 +61,13 @@ jobs:
NXF_VER: '20.07.1'
NXF_ANSI_LOG: false
strategy:
+ fail-fast: false
matrix:
profile_flags:
- "test --sketch_scaled false --sketch_scaled_log2 2"
- "test --sketch_scaled false --sketch_num_hashes 20"
- "test --sketch_scaled false --sketch_num_hashes_log2 20"
+ - "test_bam"
- "test_bam --barcodes_file false --rename_10x_barcodes false --save_fastas false --write_barcodes_meta_csv false"
- "test_bam --rename_10x_barcodes false --write_barcodes_meta_csv false"
- "test_bam --skip_sig_merge"
@@ -73,6 +75,9 @@ jobs:
- "test_bam --barcodes_file false --rename_10x_barcodes false"
- "test_bam --rename_10x_barcodes false"
- "test_fastas"
+ - "test_constitutive_from_download_refseq"
+ - "test_constitutive_from_fasta"
+ - "test_constitutive_from_sig"
- "test_protein_fastas"
- "test_remove_ribo"
- "test_sig_merge"
diff --git a/Dockerfile b/Dockerfile
index 46764af7..a0cb55b8 100755
--- a/Dockerfile
+++ b/Dockerfile
@@ -4,7 +4,8 @@ LABEL authors="Olga Botvinnik" \
# Install the conda environment
COPY environment.yml /
-RUN conda env create --quiet -f /environment.yml && conda clean -a
+RUN conda install -c conda-forge mamba
+RUN mamba env create -f /environment.yml && mamba clean -a
# Add conda installation dir to PATH (instead of doing 'conda activate')
ENV PATH /opt/conda/envs/nf-core-kmermaid-0.1.0dev/bin:$PATH
@@ -12,6 +13,16 @@ ENV PATH /opt/conda/envs/nf-core-kmermaid-0.1.0dev/bin:$PATH
# Dump the details of the installed packages to a file for posterity
RUN conda env export --name nf-core-kmermaid-0.1.0dev > nf-core-kmermaid-0.1.0dev.yml
+# Install super fast rust code to remove nuisance hashes (e.g. ribosomal) from signatures
+RUN git clone -b olgabot/mut-warning https://github.com/olgabot/2021-01-27-olga-remove-protein.git
+# Soft link all conda C-related libraries to their non-prefixed name
+# for rust to be able to build the C libraries
+RUN for f in $(ls /opt/conda/envs/nf-core-kmermaid-0.1.0dev/bin/x86_64-conda_cos6-linux-gnu*); \
+ do g=$(echo $f | sed 's:x86_64-conda_cos6-linux-gnu-::') ; echo $g; ln -s $f $g ; done
+RUN cd 2021-01-27-olga-remove-protein && cargo build --release
+# Add "subtract" command to path
+ENV PATH $HOME/2021-01-27-olga-remove-protein/target/release:$PATH
+
# Instruct R processes to use these empty files instead of clashing with a local version
RUN touch .Rprofile
RUN touch .Renviron
diff --git a/bin/filter_fasta_regex.py b/bin/filter_fasta_regex.py
new file mode 100755
index 00000000..ebded98a
--- /dev/null
+++ b/bin/filter_fasta_regex.py
@@ -0,0 +1,44 @@
+#!/usr/bin/env python
+
+import argparse
+import re
+
+
+import screed
+
+
+def write_records_to_fasta(records, fasta):
+ with open(fasta, "w") as f:
+ for record in records:
+ f.write(f'>{record["name"]}\n{record["sequence"]}\n')
+
+
+def filter_records(fasta, pattern):
+ filtered_records = []
+ with screed.open(fasta) as records:
+ for record in records:
+ name = record["name"]
+ if re.findall(pattern, name, flags=re.I):
+ filtered_records.append(record)
+ return filtered_records
+
+
+def filter_fasta_with_regex(fasta_to_filter, out_fasta, regex):
+ record_subset = filter_records(fasta_to_filter, regex)
+ write_records_to_fasta(record_subset, out_fasta)
+
+
+if __name__ == "__main__":
+ parser = argparse.ArgumentParser(
+ description="""Extract sequences whose names match a pattern"""
+ )
+ parser.add_argument("--input-fasta", type=str, help="Sequence file to filter")
+ parser.add_argument("--output-fasta", type=str, help="File to write")
+ parser.add_argument(
+ "--regex-pattern",
+ type=str,
+ help="Regular expression pattern to match for the names of seuqences in the file",
+ )
+ args = parser.parse_args()
+
+ filter_fasta_with_regex(args.input_fasta, args.output_fasta, args.regex_pattern)
diff --git a/bin/scrape_software_versions.py b/bin/scrape_software_versions.py
index 551861e1..80f39c09 100755
--- a/bin/scrape_software_versions.py
+++ b/bin/scrape_software_versions.py
@@ -14,8 +14,10 @@
"SKA": ["v_ska.txt", r"SKA Version: (\S+)"],
"htslib": ["v_samtools.txt", r"htslib (\S+)"],
"Sourmash": ["v_sourmash.txt", r"sourmash (\S+)"],
- "SortMeRNA": ["v_sortmerna.txt", r"SortMeRNA version (\S+),"],
+ "Rsync": ["v_rsync.txt", r"rsync version (\S+)"],
+ "Rsync (Protocol)": ["v_rsync.txt", r"protocol version (\S+)"],
"orpheum": ["v_orpheum.txt", r"Version: (\S+)"],
+ "Python": ["v_python.txt", r"Python (\S+)"],
}
results = OrderedDict()
results["nf-core/kmermaid"] = 'N/A'
@@ -25,11 +27,13 @@
results["bam2fasta"] = 'N/A'
results["fastp"] = 'N/A'
results["htslib"] = 'N/A'
+results["orpheum"] = 'N/A'
+results["Python"] = 'N/A'
+results["Rsync"] = 'N/A'
+results["Rsync (Protocol)"] = 'N/A'
results["Samtools"] = 'N/A'
results["SKA"] = 'N/A'
results["Sourmash"] = 'N/A'
-results["SortMeRNA"] = 'N/A'
-results["orpheum"] = 'N/A'
# Search each file using its regex
for k, v in regexes.items():
diff --git a/bin/validate_sketch_value.py b/bin/validate_sketch_value.py
index d96cff7f..497d92bd 100755
--- a/bin/validate_sketch_value.py
+++ b/bin/validate_sketch_value.py
@@ -20,7 +20,7 @@ def get_sketch_value(value, value_log2):
if "," in value:
logger.exception(
f"Can only provide a single number to --sketch_num_hashes or"
- f" --sketch_scaled. Provided '{value}"
+ f" --sketch_scaled. Provided '{value}'"
)
sketch_value = int(value)
else:
diff --git a/conf/base.config b/conf/base.config
index 07a2aa3b..01e0ffa3 100644
--- a/conf/base.config
+++ b/conf/base.config
@@ -54,6 +54,7 @@ process {
withName: 'multiqc|get_software_versions' {
memory = { check_max( 2.GB * task.attempt, 'memory' ) }
+ errorStrategy = "ignore"
cache = false
}
withName: 'sourmash_compute_sketch_fastx_nucleotide|sourmash_compute_sketch_fastx_peptide' {
diff --git a/conf/test.config b/conf/test.config
index 4d4dced7..cf19689a 100644
--- a/conf/test.config
+++ b/conf/test.config
@@ -17,7 +17,6 @@ params {
// Input data
// samples = 'testing/samples.csv'
// fastas = 'testing/fastas/*.fasta'
- sketch_scaled = 2
molecules = 'dna,protein,dayhoff'
// read_pairs = 'testing/fastqs/*{1,2}.fastq.gz'
// sra = "SRP016501"
@@ -29,4 +28,8 @@ params {
['SRR4238351', ['https://github.com/nf-core/test-datasets/raw/rnaseq/testdata/SRR4238351_subsamp.fastq.gz']],
['SRR4238355', ['https://github.com/nf-core/test-datasets/raw/rnaseq/testdata/SRR4238355_subsamp.fastq.gz']],
]
+ // Remove constitutively expressed genes
+ test_mini_refseq_download = true
+ constitutive_protein_sig = "https://github.com/czbiohub/test-datasets/raw/olgabot/kmermaid--housekeeping-fasta/reference/vertebrate_mammalian--205--2021-03-15.protein.fa__only_constitutive_genes.fa__molecule-protein,dayhoff__ksize-21,30,51__scaled-10__track_abundance-true.sig"
+ constitutive_rna_sig = "https://github.com/czbiohub/test-datasets/raw/olgabot/kmermaid--housekeeping-fasta/reference/vertebrate_mammalian--205--2021-03-15.rna.fa__only_constitutive_genes.fa__molecule-dna__ksize-21,30,51__scaled-10__track_abundance-true.sig"
}
diff --git a/conf/test_bam.config b/conf/test_bam.config
index 8bcdb775..07579d50 100644
--- a/conf/test_bam.config
+++ b/conf/test_bam.config
@@ -19,7 +19,6 @@ params {
'https://github.com/nf-core/test-datasets/raw/olgabot/kmermaid--bam-unique-names/testdata/mouse_lung.bam',
'https://github.com/nf-core/test-datasets/raw/olgabot/kmermaid--bam-unique-names/testdata/mouse_brown_fat_ptprc_plus_unaligned.bam']
// Sketch Parameters
- sketch_scaled = 2
molecules = 'dna,protein,dayhoff'
read_pairs = false
save_fastas = "fastas"
@@ -28,4 +27,6 @@ params {
// For bam, each fasta record represents each barcode and each should have a signature
// they should not be merged, For computation on bam file using sourmash, please set true for the below flag
tenx_min_umi_per_cell = 2
+ constitutive_protein_sig = "https://github.com/czbiohub/test-datasets/raw/olgabot/kmermaid--housekeeping-fasta/reference/vertebrate_mammalian--205--2021-03-15.protein.fa__only_constitutive_genes.fa__molecule-protein,dayhoff__ksize-21,30,51__scaled-10__track_abundance-true.sig"
+ constitutive_rna_sig = "https://github.com/czbiohub/test-datasets/raw/olgabot/kmermaid--housekeeping-fasta/reference/vertebrate_mammalian--205--2021-03-15.rna.fa__only_constitutive_genes.fa__molecule-dna__ksize-21,30,51__scaled-10__track_abundance-true.sig"
}
diff --git a/conf/test_constitutive_from_download_refseq.config b/conf/test_constitutive_from_download_refseq.config
new file mode 100644
index 00000000..886a8424
--- /dev/null
+++ b/conf/test_constitutive_from_download_refseq.config
@@ -0,0 +1,32 @@
+/*
+ * -------------------------------------------------
+ * Nextflow config file for running tests
+ * -------------------------------------------------
+ * Defines bundled input files and everything required
+ * to run a fast and simple test. Use as follows:
+ * nextflow run nf-core/kmermaid -profile test
+ */
+
+params {
+ config_profile_name = 'Test profile'
+ config_profile_description = 'Minimal test dataset to check pipeline function'
+ // Limit resources so that this can run on Travis
+ max_cpus = 2
+ max_memory = 6.GB
+ max_time = 48.h
+
+ // Input data
+ input_paths = [
+ ['SRR4050379', ['https://github.com/nf-core/test-datasets/raw/kmermaid/testdata/SRR4050379_pass_1.fastq.gz',
+ 'https://github.com/nf-core/test-datasets/raw/kmermaid/testdata/SRR4050379_pass_2.fastq.gz']],
+ ['SRR4050380', ['https://github.com/nf-core/test-datasets/raw/kmermaid/testdata/SRR4050380_pass_1.fastq.gz',
+ 'https://github.com/nf-core/test-datasets/raw/kmermaid/testdata/SRR4050380_pass_2.fastq.gz']],
+ ['SRR4238351', ['https://github.com/nf-core/test-datasets/raw/rnaseq/testdata/SRR4238351_subsamp.fastq.gz']],
+ ['SRR4238355', ['https://github.com/nf-core/test-datasets/raw/rnaseq/testdata/SRR4238355_subsamp.fastq.gz']],
+ ]
+
+ // "Other" is the smallest refseq taxonomy subdirectory: ftp://ftp.ncbi.nlm.nih.gov/refseq/release/other/
+ // Protein fasta is 453 B
+ refseq_taxonomy = 'vertebrate_mammalian'
+ test_mini_refseq_download = true
+}
diff --git a/conf/test_constitutive_from_fasta.config b/conf/test_constitutive_from_fasta.config
new file mode 100644
index 00000000..ea757073
--- /dev/null
+++ b/conf/test_constitutive_from_fasta.config
@@ -0,0 +1,32 @@
+/*
+ * -------------------------------------------------
+ * Nextflow config file for running tests
+ * -------------------------------------------------
+ * Defines bundled input files and everything required
+ * to run a fast and simple test. Use as follows:
+ * nextflow run nf-core/kmermaid -profile test
+ */
+
+params {
+ config_profile_name = 'Test profile'
+ config_profile_description = 'Minimal test dataset to check pipeline function'
+ // Limit resources so that this can run on Travis
+ max_cpus = 2
+ max_memory = 6.GB
+ max_time = 48.h
+
+ // Input data
+ input_paths = [
+ ['SRR4050379', ['https://github.com/nf-core/test-datasets/raw/kmermaid/testdata/SRR4050379_pass_1.fastq.gz',
+ 'https://github.com/nf-core/test-datasets/raw/kmermaid/testdata/SRR4050379_pass_2.fastq.gz']],
+ ['SRR4050380', ['https://github.com/nf-core/test-datasets/raw/kmermaid/testdata/SRR4050380_pass_1.fastq.gz',
+ 'https://github.com/nf-core/test-datasets/raw/kmermaid/testdata/SRR4050380_pass_2.fastq.gz']],
+ ['SRR4238351', ['https://github.com/nf-core/test-datasets/raw/rnaseq/testdata/SRR4238351_subsamp.fastq.gz']],
+ ['SRR4238355', ['https://github.com/nf-core/test-datasets/raw/rnaseq/testdata/SRR4238355_subsamp.fastq.gz']],
+ ]
+ constitutive_protein_fasta = "https://github.com/czbiohub/test-datasets/raw/olgabot/kmermaid--housekeeping-fasta/reference/vertebrate_mammalian--205--2021-03-15.protein.fa__only_constitutive_genes.fa.gz"
+ constitutive_rna_fasta = "https://github.com/czbiohub/test-datasets/raw/olgabot/kmermaid--housekeeping-fasta/reference/vertebrate_mammalian--205--2021-03-15.rna.fa__only_constitutive_genes.fa.gz"
+
+ translate_proteome_fasta = 'https://github.com/nf-core/test-datasets/raw/kmermaid/reference/gencode.v32.pc_translations.subsample5.randomseed0.fa'
+ bloomfilter_tablesize = '1e6'
+}
diff --git a/conf/test_constitutive_from_sig.config b/conf/test_constitutive_from_sig.config
new file mode 100644
index 00000000..0e2bad4d
--- /dev/null
+++ b/conf/test_constitutive_from_sig.config
@@ -0,0 +1,29 @@
+/*
+ * -------------------------------------------------
+ * Nextflow config file for running tests
+ * -------------------------------------------------
+ * Defines bundled input files and everything required
+ * to run a fast and simple test. Use as follows:
+ * nextflow run nf-core/kmermaid -profile test
+ */
+
+params {
+ config_profile_name = 'Test profile'
+ config_profile_description = 'Minimal test dataset to check pipeline function'
+ // Limit resources so that this can run on Travis
+ max_cpus = 2
+ max_memory = 6.GB
+ max_time = 48.h
+
+ // Input data
+ input_paths = [
+ ['SRR4050379', ['https://github.com/nf-core/test-datasets/raw/kmermaid/testdata/SRR4050379_pass_1.fastq.gz',
+ 'https://github.com/nf-core/test-datasets/raw/kmermaid/testdata/SRR4050379_pass_2.fastq.gz']],
+ ['SRR4050380', ['https://github.com/nf-core/test-datasets/raw/kmermaid/testdata/SRR4050380_pass_1.fastq.gz',
+ 'https://github.com/nf-core/test-datasets/raw/kmermaid/testdata/SRR4050380_pass_2.fastq.gz']],
+ ['SRR4238351', ['https://github.com/nf-core/test-datasets/raw/rnaseq/testdata/SRR4238351_subsamp.fastq.gz']],
+ ['SRR4238355', ['https://github.com/nf-core/test-datasets/raw/rnaseq/testdata/SRR4238355_subsamp.fastq.gz']],
+ ]
+ constitutive_protein_sig = "https://github.com/czbiohub/test-datasets/raw/olgabot/kmermaid--housekeeping-fasta/reference/vertebrate_mammalian--205--2021-03-15.protein.fa__only_constitutive_genes.fa__molecule-protein,dayhoff__ksize-21,30,51__scaled-10__track_abundance-true.sig"
+ constitutive_rna_sig = "https://github.com/czbiohub/test-datasets/raw/olgabot/kmermaid--housekeeping-fasta/reference/vertebrate_mammalian--205--2021-03-15.rna.fa__only_constitutive_genes.fa__molecule-dna__ksize-21,30,51__scaled-10__track_abundance-true.sig"
+}
diff --git a/conf/test_fastas.config b/conf/test_fastas.config
index a6509d4e..b439e03c 100644
--- a/conf/test_fastas.config
+++ b/conf/test_fastas.config
@@ -17,7 +17,6 @@ params {
// Input data
// samples = 'testing/samples.csv'
// fastas = 'testing/fastas/*.fasta'
- sketch_scaled = 2
molecules = 'dna,protein,dayhoff'
// read_pairs = 'testing/fastqs/*{1,2}.fastq.gz'
// sra = "SRP016501"
@@ -26,4 +25,6 @@ params {
['SRR4050380', ['https://github.com/nf-core/test-datasets/raw/olgabot/kmermaid--bam-unique-names/testdata/SRR4050380_pass_concatenated.fasta']],
]
+ constitutive_protein_sig = "https://github.com/czbiohub/test-datasets/raw/olgabot/kmermaid--housekeeping-fasta/reference/vertebrate_mammalian--205--2021-03-15.protein.fa__only_constitutive_genes.fa__molecule-protein,dayhoff__ksize-21,30,51__scaled-10__track_abundance-true.sig"
+ constitutive_rna_sig = "https://github.com/czbiohub/test-datasets/raw/olgabot/kmermaid--housekeeping-fasta/reference/vertebrate_mammalian--205--2021-03-15.rna.fa__only_constitutive_genes.fa__molecule-dna__ksize-21,30,51__scaled-10__track_abundance-true.sig"
}
diff --git a/conf/test_full.config b/conf/test_full.config
index 5dfaeafb..ac6e6677 100644
--- a/conf/test_full.config
+++ b/conf/test_full.config
@@ -12,10 +12,12 @@ params {
config_profile_description = 'Full test dataset to check pipeline function'
// Input data for full size test
- sketch_scaled = 2
molecules = 'dna,protein,dayhoff'
input_paths = [
['GM12878', ['ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR319/007/SRR3192657/SRR3192657_1.fastq.gz,ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR319/007/SRR3192657/SRR3192657_2.fastq.gz','ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR319/008/SRR3192658/SRR3192658_1.fastq.gz,ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR319/008/SRR3192658/SRR3192658_2.fastq.gz']],
['K562', ['ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR319/008/SRR3192408/SRR3192408_1.fastq.gz,ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR319/008/SRR3192408/SRR3192408_2.fastq.gz', 'ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR319/009/SRR3192409/SRR3192409_1.fastq.gz,ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR319/009/SRR3192409/SRR3192409_2.fastq.gz']]
]
+ constitutive_protein_sig = "https://github.com/czbiohub/test-datasets/raw/olgabot/kmermaid--housekeeping-fasta/reference/vertebrate_mammalian--205--2021-03-15.protein.fa__only_constitutive_genes.fa__molecule-protein,dayhoff__ksize-21,30,51__scaled-10__track_abundance-true.sig"
+ constitutive_rna_sig = "https://github.com/czbiohub/test-datasets/raw/olgabot/kmermaid--housekeeping-fasta/reference/vertebrate_mammalian--205--2021-03-15.rna.fa__only_constitutive_genes.fa__molecule-dna__ksize-21,30,51__scaled-10__track_abundance-true.sig"
+
}
diff --git a/conf/test_protein_fastas.config b/conf/test_protein_fastas.config
index ea22bcb6..91a2325d 100644
--- a/conf/test_protein_fastas.config
+++ b/conf/test_protein_fastas.config
@@ -26,8 +26,8 @@ params {
['https://github.com/czbiohub/test-datasets/raw/predictorthologs/testdata/bonobo_liver_ptprc__molecule-dayhoff__coding_reads_peptides.fasta']]]
// Sketch Parameters
- sketch_scaled = 2
molecules = 'protein,dayhoff,hp'
read_pairs = false
-
+ constitutive_protein_sig = "https://github.com/czbiohub/test-datasets/raw/olgabot/kmermaid--housekeeping-fasta/reference/vertebrate_mammalian--205--2021-03-15.protein.fa__only_constitutive_genes.fa__molecule-protein,dayhoff__ksize-21,30,51__scaled-10__track_abundance-true.sig"
+ constitutive_rna_sig = "https://github.com/czbiohub/test-datasets/raw/olgabot/kmermaid--housekeeping-fasta/reference/vertebrate_mammalian--205--2021-03-15.rna.fa__only_constitutive_genes.fa__molecule-dna__ksize-21,30,51__scaled-10__track_abundance-true.sig"
}
diff --git a/conf/test_remove_ribo.config b/conf/test_remove_ribo.config
index 8aa689ac..72c2710b 100644
--- a/conf/test_remove_ribo.config
+++ b/conf/test_remove_ribo.config
@@ -17,7 +17,6 @@ params {
// Input data
// samples = 'testing/samples.csv'
// fastas = 'testing/fastas/*.fasta'
- sketch_scaled = 2
molecules = 'dna,protein,dayhoff'
// read_pairs = 'testing/fastqs/*{1,2}.fastq.gz'
// sra = "SRP016501"
@@ -31,4 +30,6 @@ params {
['SRR4238351', ['https://github.com/nf-core/test-datasets/raw/rnaseq/testdata/SRR4238351_subsamp.fastq.gz']],
['SRR4238355', ['https://github.com/nf-core/test-datasets/raw/rnaseq/testdata/SRR4238355_subsamp.fastq.gz']],
]
+ constitutive_protein_sig = "https://github.com/czbiohub/test-datasets/raw/olgabot/kmermaid--housekeeping-fasta/reference/vertebrate_mammalian--205--2021-03-15.protein.fa__only_constitutive_genes.fa__molecule-protein,dayhoff__ksize-21,30,51__scaled-10__track_abundance-true.sig"
+ constitutive_rna_sig = "https://github.com/czbiohub/test-datasets/raw/olgabot/kmermaid--housekeeping-fasta/reference/vertebrate_mammalian--205--2021-03-15.rna.fa__only_constitutive_genes.fa__molecule-dna__ksize-21,30,51__scaled-10__track_abundance-true.sig"
}
diff --git a/conf/test_sig_merge.config b/conf/test_sig_merge.config
index 21a27939..ad821450 100644
--- a/conf/test_sig_merge.config
+++ b/conf/test_sig_merge.config
@@ -18,7 +18,6 @@ params {
bam = ['https://github.com/nf-core/test-datasets/raw/olgabot/kmermaid--bam-unique-names/testdata/mouse_lung.bam',
'https://github.com/nf-core/test-datasets/raw/olgabot/kmermaid--bam-unique-names/testdata/mouse_brown_fat_ptprc_plus_unaligned.bam']
// Sketch Parameters
- sketch_scaled = 2
molecules = 'dna,protein,dayhoff'
read_pairs = false
save_fastas = "fastas"
@@ -29,4 +28,6 @@ params {
reference_proteome_fasta = 'https://github.com/nf-core/test-datasets/raw/kmermaid/reference/ptprc_bam_translations.fa'
bloomfilter_tablesize = '1e6'
+ constitutive_protein_sig = "https://github.com/czbiohub/test-datasets/raw/olgabot/kmermaid--housekeeping-fasta/reference/vertebrate_mammalian--205--2021-03-15.protein.fa__only_constitutive_genes.fa__molecule-protein,dayhoff__ksize-21,30,51__scaled-10__track_abundance-true.sig"
+ constitutive_rna_sig = "https://github.com/czbiohub/test-datasets/raw/olgabot/kmermaid--housekeeping-fasta/reference/vertebrate_mammalian--205--2021-03-15.rna.fa__only_constitutive_genes.fa__molecule-dna__ksize-21,30,51__scaled-10__track_abundance-true.sig"
}
diff --git a/conf/test_tenx_tgz.config b/conf/test_tenx_tgz.config
index 39b9b2f0..10ae33ab 100644
--- a/conf/test_tenx_tgz.config
+++ b/conf/test_tenx_tgz.config
@@ -20,14 +20,13 @@ params {
'https://github.com/nf-core/test-datasets/raw/olgabot/kmermaid-unaligned-tgz-v3/testdata/mouse_brown_fat_ptprc_plus_unaligned.tgz'
]
// Sketch Parameters
- sketch_scaled = 2
molecules = 'dna,protein,dayhoff'
read_pairs = false
save_fastas = "fastas"
save_intermediate_files = "/tmp/"
write_barcode_meta_csv = "metadata.csv"
// For bam, each fasta record represents each barcode and each should have a signature
- // they should not be merged, For computation on bam file using sourmash, please set true for the below flag
tenx_min_umi_per_cell = 10
- shard_size = 350
+ constitutive_protein_sig = "https://github.com/czbiohub/test-datasets/raw/olgabot/kmermaid--housekeeping-fasta/reference/vertebrate_mammalian--205--2021-03-15.protein.fa__only_constitutive_genes.fa__molecule-protein,dayhoff__ksize-21,30,51__scaled-10__track_abundance-true.sig"
+ constitutive_rna_sig = "https://github.com/czbiohub/test-datasets/raw/olgabot/kmermaid--housekeeping-fasta/reference/vertebrate_mammalian--205--2021-03-15.rna.fa__only_constitutive_genes.fa__molecule-dna__ksize-21,30,51__scaled-10__track_abundance-true.sig"
}
diff --git a/conf/test_translate.config b/conf/test_translate.config
index c6e488a5..c4a7bccd 100644
--- a/conf/test_translate.config
+++ b/conf/test_translate.config
@@ -17,12 +17,15 @@ params {
// Input data
fastas = "https://github.com/nf-core/test-datasets/raw/kmermaid/reference/gencode.v32.pc_transcripts.subsample5.fa"
// Sketch Parameters
- sketch_scaled = 2
molecules = 'dna,protein,dayhoff'
read_pairs = false
- reference_proteome_fasta = 'https://github.com/nf-core/test-datasets/raw/kmermaid/reference/gencode.v32.pc_translations.subsample5.randomseed0.fa'
+ translate_proteome_fasta = 'https://github.com/nf-core/test-datasets/raw/kmermaid/reference/gencode.v32.pc_translations.subsample5.randomseed0.fa'
bloomfilter_tablesize = '1e8'
translate_peptide_ksize = '11'
translate_peptide_molecule = 'dayhoff'
+
+ // Remove constitutively expressed genes
+ constitutive_protein_sig = "https://github.com/czbiohub/test-datasets/raw/olgabot/kmermaid--housekeeping-fasta/reference/vertebrate_mammalian--205--2021-03-15.protein.fa__only_constitutive_genes.fa__molecule-protein,dayhoff__ksize-21,30,51__scaled-10__track_abundance-true.sig"
+ constitutive_rna_sig = "https://github.com/czbiohub/test-datasets/raw/olgabot/kmermaid--housekeeping-fasta/reference/vertebrate_mammalian--205--2021-03-15.rna.fa__only_constitutive_genes.fa__molecule-dna__ksize-21,30,51__scaled-10__track_abundance-true.sig"
}
diff --git a/conf/test_translate_bam.config b/conf/test_translate_bam.config
index 15365382..4f8a5487 100644
--- a/conf/test_translate_bam.config
+++ b/conf/test_translate_bam.config
@@ -18,17 +18,18 @@ params {
bam = ['https://github.com/nf-core/test-datasets/raw/olgabot/kmermaid--bam-unique-names/testdata/mouse_lung.bam',
'https://github.com/nf-core/test-datasets/raw/olgabot/kmermaid--bam-unique-names/testdata/mouse_brown_fat_ptprc_plus_unaligned.bam']
// Sketch Parameters
- sketch_scaled = 2
molecules = 'dna,protein,dayhoff'
read_pairs = false
save_fastas = "fastas"
write_barcode_meta_csv = "metadata.csv"
// For bam, each fasta record represents each barcode and each should have a signature
// they should not be merged, For computation on bam file using sourmash, please set true for the below flag
- tenx_min_umi_per_cell = 5
+ tenx_min_umi_per_cell = 2
- reference_proteome_fasta = 'https://github.com/nf-core/test-datasets/raw/kmermaid/reference/ptprc_bam_translations.fa'
+ translate_proteome_fasta = 'https://github.com/nf-core/test-datasets/raw/kmermaid/reference/ptprc_bam_translations.fa'
bloomfilter_tablesize = '1e6'
translate_peptide_ksize = '11'
translate_peptide_molecule = 'dayhoff'
+ constitutive_protein_sig = "https://github.com/czbiohub/test-datasets/raw/olgabot/kmermaid--housekeeping-fasta/reference/vertebrate_mammalian--205--2021-03-15.protein.fa__only_constitutive_genes.fa__molecule-protein,dayhoff__ksize-21,30,51__scaled-10__track_abundance-true.sig"
+ constitutive_rna_sig = "https://github.com/czbiohub/test-datasets/raw/olgabot/kmermaid--housekeeping-fasta/reference/vertebrate_mammalian--205--2021-03-15.rna.fa__only_constitutive_genes.fa__molecule-dna__ksize-21,30,51__scaled-10__track_abundance-true.sig"
}
diff --git a/environment.yml b/environment.yml
index f056cbf3..093d9644 100644
--- a/environment.yml
+++ b/environment.yml
@@ -7,6 +7,7 @@ channels:
- defaults
- anaconda
dependencies:
+ - conda-forge::cmake=3.19.6
- conda-forge::python=3.7.3
- conda-forge::markdown=3.1.1
- conda-forge::pymdown-extensions=6.0
@@ -14,6 +15,7 @@ dependencies:
- conda-forge::tqdm=4.43.0
- conda-forge::gxx_linux-64=7.3.0
- conda-forge::s3fs=0.4.2
+ - conda-forge::rust=1.48.0
- bioconda::sourmash=3.5.0
- bioconda::samtools=1.10
- bioconda::screed=1.0.4
@@ -33,8 +35,8 @@ dependencies:
- ska=1.0
- sphinx=2.3.1
- jupyter=1.0.0
- - sortmerna=2.1b # for metatranscriptomics
- ripgrep=12.1.1
+ - rsync=3.2.3
- pip:
- bam2fasta==1.0.8
- orpheum==1.0.4
\ No newline at end of file
diff --git a/environment_osx.yml b/environment_osx.yml
index ad794f08..07d462c2 100644
--- a/environment_osx.yml
+++ b/environment_osx.yml
@@ -33,8 +33,9 @@ dependencies:
- ska=1.0
- sphinx=2.3.1
- jupyter=1.0.0
- - sortmerna=2.1b # for metatranscriptomics
- ripgrep=12.1.1
+ - conda-forge::rust=1.48.0
+ - rsync=3.2.3
- pip:
- bam2fasta==1.0.8
- - orpheum==1.0.4
\ No newline at end of file
+ - orpheum==1.0.4
diff --git a/main.nf b/main.nf
index 94756edf..0d4890a9 100644
--- a/main.nf
+++ b/main.nf
@@ -120,7 +120,7 @@ def helpMessage() {
to new name, e.g. with channel or cell annotation label
Translate RNA-seq reads into protein-coding sequences options:
- --reference_proteome_fasta Path to a well-curated fasta file of protein sequences. Used to filter for coding reads
+ --translate_proteome_fasta Path to a well-curated fasta file of protein sequences. Used to filter for coding reads
--translate_peptide_ksize K-mer size to use for translating RNA into protein.
Default: 9, which is good for 'protein'. If using dayhoff, suggest 15
--translate_peptide_molecule Which molecular encoding to use for translating. Default: "protein"
@@ -324,10 +324,10 @@ if (params.protein_fastas){
ch_protein_fastas = Channel.empty()
}
-if (params.reference_proteome_fasta) {
-Channel.fromPath(params.reference_proteome_fasta, checkIfExists: true)
- .ifEmpty { exit 1, "Reference proteome file not found: ${params.reference_proteome_fasta}" }
- .set{ ch_reference_proteome_fasta }
+if (params.translate_proteome_fasta) {
+Channel.fromPath(params.translate_proteome_fasta, checkIfExists: true)
+ .ifEmpty { exit 1, "Reference proteome file not found: ${params.translate_proteome_fasta}" }
+ .set{ ch_translate_proteome_fasta }
}
////////////////////////////////////////////////////
@@ -388,12 +388,12 @@ if (!protein_input) {
if (params.subsample && params.skip_trimming ) {
subsample_reads_ch_unchecked
.ifEmpty{ exit 1, "No reads provided! Check read input files" }
- .set { subsample_ch_reads_for_ribosomal_removal }
+ .set { subsample_ch_reads_to_translate }
}
if (params.skip_trimming && !(params.bam || params.tenx_tgz)) {
reads_ch_unchecked
.ifEmpty{ exit 1, "No reads provided! Check read input files" }
- .set { ch_reads_for_ribosomal_removal }
+ .set { ch_reads_to_translate }
ch_read_files_trimming_to_check_size = Channel.empty()
} else if (params.bam || params.tenx_tgz) {
ch_non_bam_reads_unchecked
@@ -409,11 +409,11 @@ if (!protein_input) {
// Since there exists protein input, don't check if these are empty
if (params.subsample) {
subsample_reads_ch_unchecked
- .set { subsample_ch_reads_for_ribosomal_removal }
+ .set { subsample_ch_reads_to_translate }
}
if (params.skip_trimming) {
reads_ch_unchecked
- .set { ch_reads_for_ribosomal_removal }
+ .set { ch_reads_to_translate }
ch_read_files_trimming_to_check_size = Channel.empty()
} else if (!have_nucleotide_fasta_input) {
ch_read_files_trimming_unchecked
@@ -432,15 +432,6 @@ if (params.split_kmer){
params.ksizes = '21,27,33,51'
}
-// Get rRNA databases
-// Default is set to bundled DB list in `assets/rrna-db-defaults.txt`
-
-rRNA_database = file(params.rrna_database_manifest)
-if (rRNA_database.isEmpty()) {exit 1, "File ${rRNA_database.getName()} is empty!"}
-Channel
- .from( rRNA_database.readLines() )
- .map { row -> file(row) }
- .set { sortmerna_fasta }
// --- Parse Translate parameters ---
save_translate_csv = params.save_translate_csv
@@ -450,18 +441,35 @@ save_translate_json = params.save_translate_json
// --- Parse the Sourmash parameters ----
ksizes = params.ksizes?.toString().tokenize(',')
Channel.from(params.ksizes?.toString().tokenize(','))
- .into { ch_ksizes_for_compare_peptide; ch_ksizes_for_compare_nucleotide }
+ .into { ch_ksizes_for_nucleotide; ch_ksizes_for_peptide; ch_ksizes_for_compare_peptide; ch_ksizes_for_compare_nucleotide }
molecules = params.molecules?.toString().tokenize(',')
+nucleotide_molecules = molecules.findAll { it == "dna" }
peptide_molecules = molecules.findAll { it != "dna" }
+// have_protein_input = params.translate_proteome_fasta || params.protein_fastas || protein_input
+// peptide_molecules =
peptide_molecules_comma_separated = peptide_molecules.join(",")
peptide_molecule_flags = peptide_molecules.collect { it -> "--${it}" }.join ( " " )
Channel.from( molecules )
.set { ch_molecules }
+Channel.from( nucleotide_molecules )
+ .into { ch_nucleotide_molecules; ch_nucleotide_molecules_for_subtract; ch_nucleotide_molecules_for_compare }
+
Channel.from( peptide_molecules )
- .into { ch_peptide_molecules; ch_peptide_molecules_for_compare }
+ .into { ch_peptide_molecules; ch_peptide_molecules_for_subtract; ch_peptide_molecules_for_compare }
+
+
+ch_peptide_molecules
+ .combine( ch_ksizes_for_peptide )
+ .set { ch_sourmash_params_peptide }
+
+ch_nucleotide_molecules
+ .combine( ch_ksizes_for_nucleotide )
+ .mix ( ch_sourmash_params_peptide )
+ .dump ( tag: 'ch_sourmash_params' )
+ .into { ch_sourmash_params_for_compare ; ch_sourmash_params_for_subtract }
// Parse sketch value and style parameters
sketch_num_hashes = params.sketch_num_hashes
@@ -531,6 +539,75 @@ else {
barcode_metadata_folder = "barcode_metadata"
}
+
+//////////////////////////////////////////////////////////
+/* -- Parse constitutive K-mer removal parameters -- */
+/////////////////////////////////////////////////////////
+constitutive_protein_fasta = params.constitutive_protein_fasta
+constitutive_rna_fasta = params.constitutive_rna_fasta
+
+constitutive_protein_sig = params.constitutive_protein_sig
+constitutive_rna_sig = params.constitutive_rna_sig
+
+have_constitutive_fastas = constitutive_protein_fasta && constitutive_rna_fasta
+have_constitutive_sigs = constitutive_protein_sig && constitutive_rna_sig
+need_refseq_download = (!have_constitutive_fastas) && (!have_constitutive_sigs)
+
+if (have_constitutive_fastas) {
+ Channel.from(
+ ["protein", file(constitutive_protein_fasta)],
+ ["rna", file(constitutive_rna_fasta)])
+ .into { ch_constitutive_fasta; ch_refseq_moltype_to_fasta }
+
+ ch_refseq_moltype_to_fasta
+ // Check if protein molecules were even specified
+ .filter{
+ it[0] == "protein" ? peptide_molecules.size() > 0 : nucleotide_molecules.size() > 0
+ }
+ // Take only the first item, the molecule type
+ .map{ it[0] }
+ .set{ ch_refseq_moltypes_to_download }
+} else {
+ // Don't look at the fastas, only check the parsed molecule types
+ Channel.from(['protein', 'rna'])
+ .filter{
+ it == "protein" ? peptide_molecules.size() > 0 : nucleotide_molecules.size() > 0
+ }
+ .set{ ch_refseq_moltypes_to_download }
+}
+
+if (have_constitutive_sigs) {
+ // Use sourmash moltypes of "protein,dayhoff" instead of the original protein
+ // as used for the fastas as that's what matches the sourmash outputs
+ Channel.from(
+ ["protein,dayhoff", file(constitutive_protein_sig)],
+ ["dna", file(constitutive_rna_sig)])
+ .set { ch_constitutive_sig }
+
+ // Refseq molecule types are "protein" and "rna"
+ Channel.from(
+ ["protein", file(constitutive_protein_sig)],
+ ["rna", file(constitutive_rna_sig)])
+ .set { ch_refseq_moltype_to_sig }
+
+ ch_refseq_moltype_to_sig
+ // Check if protein molecules were even specified
+ .filter{
+ it[0]== "protein" ? peptide_molecules.size() > 0 : nucleotide_molecules.size() > 0
+ }
+ // Take only the first item, the molecule type
+ .map{ it[0] }
+ .set{ ch_refseq_moltypes_to_download }
+}
+
+
+// Parse refseq taxonomy group to download
+constitutive_refseq_taxonomy = params.constitutive_refseq_taxonomy
+/////////////////////////////////////////////////////////////
+/* -- END: Parse constitutive K-mer removal parameters -- */
+/////////////////////////////////////////////////////////////
+
+
// Has the run name been specified by the user?
// this has the bonus effect of catching both -name and --name
custom_runName = params.name
@@ -588,16 +665,22 @@ if (params.sketch_num_hashes_log2) summary['Sketch Sizes (log2)'] = params.
if (params.sketch_scaled) summary['Sketch scaled'] = params.sketch_scaled
if (params.sketch_scaled_log2) summary['Sketch scaled (log2)'] = params.sketch_scaled_log2
// 10x parameters
-if(params.tenx_tgz) summary["10x .tgz"] = params.tenx_tgz
-if(params.tenx_tgz) summary["10x SAM tags"] = params.tenx_tags
-if(params.tenx_tgz) summary["10x Cell pattern"] = params.tenx_cell_barcode_pattern
-if(params.tenx_tgz) summary["10x UMI pattern"] = params.tenx_molecular_barcode_pattern
-if(params.tenx_tgz) summary['Min UMI/cell'] = params.tenx_min_umi_per_cell
-// Extract coding parameters
-if(params.reference_proteome_fasta) summary["Peptide fasta"] = params.reference_proteome_fasta
-if(params.reference_proteome_fasta) summary['Peptide ksize'] = params.translate_peptide_ksize
-if(params.reference_proteome_fasta) summary['Peptide molecule'] = params.translate_peptide_molecule
-if(params.reference_proteome_fasta) summary['Bloom filter table size'] = params.bloomfilter_tablesize
+if(params.tenx_tgz || params.bam) summary["10x .tgz"] = params.tenx_tgz
+if(params.tenx_tgz || params.bam) summary["10x SAM tags"] = params.tenx_tags
+if(params.tenx_tgz || params.bam) summary["10x Cell pattern"] = params.tenx_cell_barcode_pattern
+if(params.tenx_tgz || params.bam) summary["10x UMI pattern"] = params.tenx_molecular_barcode_pattern
+if(params.tenx_tgz || params.bam) summary['Min UMI/cell'] = params.tenx_min_umi_per_cell
+// Orpheum Translate parameters
+if(params.translate_proteome_fasta) summary["Orpheum Translate Peptide fasta"] = params.translate_proteome_fasta
+if(params.translate_proteome_fasta) summary['Orpheum Translate Peptide ksize'] = params.translate_peptide_ksize
+if(params.translate_proteome_fasta) summary['Orpheum Translate Peptide molecule'] = params.translate_peptide_molecule
+if(params.translate_proteome_fasta) summary['Oprheum Translate Bloom filter table size'] = params.bloomfilter_tablesize
+// constitutive k-mer removal paramters
+if(params.constitutive_protein_fasta) summary["Constitutive Peptide fasta"] = params.constitutive_protein_fasta
+if(params.constitutive_rna_fasta) summary["Constitutive RNA fasta"] = params.constitutive_rna_fasta
+if(params.constitutive_protein_sig) summary["Constitutive Peptide K-mer Signature"] = params.constitutive_protein_sig
+if(params.constitutive_rna_sig) summary["Constitutive RNA K-mer Signature"] = params.constitutive_rna_sig
+if(need_refseq_download) summary["Constitutive GBenes' Refseq Taxonomy"] = params.constitutive_refseq_taxonomy
// Resource information
summary['Max Resources'] = "$params.max_memory memory, $params.max_cpus cpus, $params.max_time time per job"
if(workflow.containerEngine) summary['Container'] = "$workflow.containerEngine - $workflow.container"
@@ -669,10 +752,11 @@ process get_software_versions {
bam2fasta info &> v_bam2fasta.txt
fastp --version &> v_fastp.txt
samtools --version &> v_samtools.txt
+ rsync --version &> v_rsync.txt
ska version &> v_ska.txt
- sortmerna --version &> v_sortmerna.txt
sourmash -v &> v_sourmash.txt
pip show orpheum &> v_orpheum.txt
+ python --version &> v_python.txt
scrape_software_versions.py &> software_versions_mqc.yaml
"""
}
@@ -743,7 +827,7 @@ if ( !params.split_kmer && have_sketch_value ) {
-if (params.reference_proteome_fasta){
+if (params.translate_proteome_fasta){
process make_protein_index {
tag "${peptides}__${bloom_id}"
label "low_memory"
@@ -751,7 +835,7 @@ if (params.reference_proteome_fasta){
publishDir "${params.outdir}/protein_index", mode: params.publish_dir_mode
input:
- file(peptides) from ch_reference_proteome_fasta
+ file(peptides) from ch_translate_proteome_fasta
translate_peptide_ksize
translate_peptide_molecule
@@ -853,8 +937,8 @@ if (params.tenx_tgz || params.bam) {
// Put fastqs from aligned and unaligned reads into a single channel
tenx_reads_aligned_concatenation_ch
.mix( tenx_reads_unaligned_ch )
- .dump(tag: "tenx_ch_reads_for_ribosomal_removal")
- .set{ tenx_ch_reads_for_ribosomal_removal }
+ .dump(tag: "tenx_ch_reads_to_translate")
+ .set{ tenx_ch_reads_to_translate }
if ((params.tenx_min_umi_per_cell > 0) || !params.barcodes_file) {
process count_umis_per_cell {
@@ -900,14 +984,14 @@ if (params.tenx_tgz || params.bam) {
good_barcodes_ch = tenx_bam_barcodes_ch
}
- tenx_ch_reads_for_ribosomal_removal
+ tenx_ch_reads_to_translate
.combine( good_barcodes_ch, by: 0 )
- .dump( tag: 'tenx_ch_reads_for_ribosomal_removal__combine__good_barcodes_ch' )
+ .dump( tag: 'tenx_ch_reads_to_translate__combine__good_barcodes_ch' )
.map{ it -> [it[0], it[1], it[2], it[3].splitText()] }
.transpose()
- .dump( tag: 'tenx_ch_reads_for_ribosomal_removal__combine__good_barcodes_ch__transpose' )
+ .dump( tag: 'tenx_ch_reads_to_translate__combine__good_barcodes_ch__transpose' )
.map{ it -> [it[0], it[1], it[2], it[3].replaceAll("\\s+", "") ] }
- .dump( tag: 'tenx_ch_reads_for_ribosomal_removal__combine__good_barcodes_ch__transpose__no_newlines' )
+ .dump( tag: 'tenx_ch_reads_to_translate__combine__good_barcodes_ch__transpose__no_newlines' )
.set{ tenx_reads_with_good_barcodes_ch }
process extract_per_cell_fastqs {
@@ -949,8 +1033,8 @@ if (params.tenx_tgz || params.bam) {
// // Filtering out fastq.gz files less than 200 bytes (arbitary number)
// // ~200 bytes is about the size of a file with a single read or less
// // We can't use .size() > 0 because it's fastq.gz is gzipped content
- // per_channel_cell_ch_reads_for_ribosomal_removal
- // .dump(tag: 'per_channel_cell_ch_reads_for_ribosomal_removal')
+ // per_channel_cell_ch_reads_to_translate
+ // .dump(tag: 'per_channel_cell_ch_reads_to_translate')
// .flatten()
// .filter{ it -> it.size() > 200 } // each item is just a single file, no need to do it[1]
// .map{ it -> tuple(it.simpleName, file(it)) }
@@ -960,7 +1044,7 @@ if (params.tenx_tgz || params.bam) {
if (params.skip_trimming) {
ch_non_bam_reads
.concat(per_cell_fastqs_ch)
- .set { ch_reads_for_ribosomal_removal }
+ .set { ch_reads_to_translate }
} else {
ch_non_bam_reads
.mix ( per_cell_fastqs_ch )
@@ -1053,13 +1137,13 @@ if ( have_nucleotide_input ) {
ch_reads_trimmed
.concat( fastas_ch )
.dump ( tag: 'trimmed_reads__concat_fastas' )
- .set { subsample_ch_reads_for_ribosomal_removal }
+ .set { subsample_ch_reads_to_translate }
} else {
// Concatenate trimmed reads with fastas for signature generation
- ch_reads_for_ribosomal_removal = ch_reads_trimmed.mix(fastas_ch)
+ ch_reads_to_translate = ch_reads_trimmed.concat(fastas_ch)
}
} else {
- ch_reads_for_ribosomal_removal = fastas_ch
+ ch_reads_to_translate = fastas_ch
ch_fastp_results = Channel.from(false)
}
@@ -1069,10 +1153,10 @@ if (params.subsample) {
publishDir "${params.outdir}/seqtk/", mode: params.publish_dir_mode
input:
- set val(id), file(reads) from subsample_ch_reads_for_ribosomal_removal
+ set val(id), file(reads) from subsample_ch_reads_to_translate
output:
- set val(id), file("*_${params.subsample}.fastq.gz") into ch_reads_for_ribosomal_removal
+ set val(id), file("*_${params.subsample}.fastq.gz") into ch_reads_to_translate
script:
read1 = reads[0]
@@ -1087,101 +1171,8 @@ if (params.subsample) {
}
}
-/*
- * STEP 2+ - SortMeRNA - remove rRNA sequences on request
- */
-if (!params.remove_ribo_rna) {
- ch_reads_for_ribosomal_removal
- .set { ch_reads_to_translate }
- sortmerna_logs = Channel.empty()
-} else {
- process sortmerna_index {
- label 'mid_memory_long'
- label 'mid_cpu'
- tag "${fasta.baseName}"
-
- input:
- file(fasta) from sortmerna_fasta
-
- output:
- val("${fasta.baseName}") into sortmerna_db_name
- file("$fasta") into sortmerna_db_fasta
- file("${fasta.baseName}*") into sortmerna_db
-
- script:
- """
- indexdb_rna --ref $fasta,${fasta.baseName} -m 3072 -v
- """
- }
-
- process sortmerna {
- label 'mid_memory_long'
- label 'mid_cpu'
- tag "$name"
- publishDir "${params.outdir}/SortMeRNA", mode: "${params.publish_dir_mode}",
- saveAs: {filename ->
- if (filename.indexOf("_rRNA_report.txt") > 0) "logs/$filename"
- else if (params.save_non_rrna_reads) "reads/$filename"
- else null
- }
-
- input:
- set val(name), file(reads) from ch_reads_for_ribosomal_removal
- val(db_name) from sortmerna_db_name.collect()
- file(db_fasta) from sortmerna_db_fasta.collect()
- file(db) from sortmerna_db.collect()
-
- output:
- set val(name), file("*.fq.gz") into ch_reads_to_translate
- file "*_rRNA_report.txt" into sortmerna_logs
-
- script:
- //concatenate reference files: ${db_fasta},${db_name}:${db_fasta},${db_name}:...
- def Refs = ''
- for (i=0; i single end
- if (reads[1] == null) {
- """
- gzip -d --force < ${reads} > all-reads.fastq
- sortmerna --ref ${Refs} \
- --reads all-reads.fastq \
- --num_alignments 1 \
- -a ${task.cpus} \
- --fastx \
- --aligned rRNA-reads \
- --other non-rRNA-reads \
- --log -v
- gzip --force < non-rRNA-reads.fastq > ${name}.fq.gz
- mv rRNA-reads.log ${name}_rRNA_report.txt
- """
- } else {
- """
- gzip -d --force < ${reads[0]} > reads-fw.fq
- gzip -d --force < ${reads[1]} > reads-rv.fq
- merge-paired-reads.sh reads-fw.fq reads-rv.fq all-reads.fastq
- sortmerna --ref ${Refs} \
- --reads all-reads.fastq \
- --num_alignments 1 \
- -a ${task.cpus} \
- --fastx --paired_in \
- --aligned rRNA-reads \
- --other non-rRNA-reads \
- --log -v
- unmerge-paired-reads.sh non-rRNA-reads.fastq non-rRNA-reads-fw.fq non-rRNA-reads-rv.fq
- gzip < non-rRNA-reads-fw.fq > ${name}-fw.fq.gz
- gzip < non-rRNA-reads-rv.fq > ${name}-rv.fq.gz
- mv rRNA-reads.log ${name}_rRNA_report.txt
- """
- }
- }
- }
-
-
-
- if (params.reference_proteome_fasta){
+ if (params.translate_proteome_fasta){
process translate {
tag "${sample_id}"
label "low_memory_long"
@@ -1375,7 +1366,7 @@ if (!have_nucleotide_input) {
}
-if (!params.skip_compute && (protein_input || params.reference_proteome_fasta)){
+if (!params.skip_compute && (protein_input || params.translate_proteome_fasta)){
process sourmash_compute_sketch_fastx_peptide {
tag "${sig_id}"
@@ -1524,7 +1515,6 @@ if ((params.bam || params.tenx_tgz) && !params.skip_compute && !params.skip_sig_
}
-
ch_sourmash_sketches_merged_to_view
.dump( tag: "ch_sourmash_sketches_to_view" )
@@ -1545,6 +1535,274 @@ if ((params.bam || params.tenx_tgz) && !params.skip_compute && !params.skip_sig_
ch_sourmash_sig_describe_merged = Channel.empty()
}
+
+if (!params.skip_remove_constitutive_genes) {
+ ///////////////////////////////////////////////////////////////////////////////
+ ///////////////////////////////////////////////////////////////////////////////
+ /* -- -- */
+ /* -- REMOVE K-MERS FROM constitutive GENES -- */
+ /* -- -- */
+ ///////////////////////////////////////////////////////////////////////////////
+ ///////////////////////////////////////////////////////////////////////////////
+
+
+
+ ///////////////////////////////////////////////////////////////////////////////
+ ///////////////////////////////////////////////////////////////////////////////
+ /* -- -- */
+ /* -- DOWNLOAD NUCLEOTIDE AND PROTEIN SEQS FROM REFSEQ -- */
+ /* -- -- */
+ ///////////////////////////////////////////////////////////////////////////////
+ ///////////////////////////////////////////////////////////////////////////////
+ /*
+ * STEP 6 - rsync to download refeseq
+ */
+ if (need_refseq_download){
+ // No fastas provided for removing constitutive genes
+ process download_refseq {
+ tag "${constitutive_refseq_taxonomy}--${refseq_moltype}"
+ label "process_low"
+ publishDir "${params.outdir}/reference/ncbi_refseq/", mode: 'copy'
+
+ input:
+ val refseq_moltype from ch_refseq_moltypes_to_download
+
+ output:
+ set val(refseq_moltype), file("${constitutive_refseq_taxonomy}--*.${refseq_moltype}.fa.gz") into ch_refseq_fasta_to_filter
+
+ script:
+ output_fasta = "${constitutive_refseq_taxonomy}--\$RELEASE_NUMBER--\$DATE.${refseq_moltype}.fa.gz"
+ include_fasta = params.test_mini_refseq_download ? "${constitutive_refseq_taxonomy}.1.${refseq_moltype}.f*a.gz" : "*${refseq_moltype}.f*a.gz"
+ """
+ rsync \\
+ --prune-empty-dirs \\
+ --archive \\
+ --verbose \\
+ --recursive \\
+ --include '${include_fasta}' \\
+ --exclude '/*' \\
+ rsync://ftp.ncbi.nlm.nih.gov/refseq/release/${constitutive_refseq_taxonomy}/ .
+ wget https://ftp.ncbi.nlm.nih.gov/refseq/release/RELEASE_NUMBER
+ DATE=\$(date +'%Y-%m-%d')
+ RELEASE_NUMBER=\$(cat RELEASE_NUMBER)
+ zcat ${constitutive_refseq_taxonomy}.*.${refseq_moltype}*.gz | gzip -c - > ${output_fasta}
+ """
+ }
+
+ ///////////////////////////////////////////////////////////////////////////////
+ ///////////////////////////////////////////////////////////////////////////////
+ /* -- -- */
+ /* -- REMOVE K-MERS FROM constitutive GENES -- */
+ /* -- -- */
+ ///////////////////////////////////////////////////////////////////////////////
+ ///////////////////////////////////////////////////////////////////////////////
+ /*
+ * STEP 7 - Get only constitutive genes from
+ */
+ // Keep genes whose names match constitutive gene regular expression pattern
+ process extract_fasta_constitutive {
+ tag "${fasta.baseName}"
+ label "process_low"
+ publishDir "${params.outdir}/reference/constitutive_genes/", mode: 'copy'
+
+ input:
+ set val(refseq_moltype), file(fasta) from ch_refseq_fasta_to_filter
+
+ output:
+ set val(refseq_moltype), file(output_fasta_gz) into ch_constitutive_fasta, ch_constitutive_fasta_to_view
+
+ script:
+ output_fasta = "${fasta.baseName}__only_constitutive_genes.fa"
+ output_fasta_gz = "${fasta.baseName}__only_constitutive_genes.fa.gz"
+ """
+ filter_fasta_regex.py \\
+ --input-fasta ${fasta} \\
+ --output-fasta ${output_fasta} \\
+ --regex-pattern '${params.constitutive_gene_regex}'
+ gzip -c ${output_fasta} > ${output_fasta_gz}
+ """
+ }
+
+ ch_constitutive_fasta_to_view
+ .dump( tag: 'ch_constitutive_fasta' )
+ }
+
+ if (!have_constitutive_sigs) {
+ ///////////////////////////////////////////////////////////////////////////////
+ ///////////////////////////////////////////////////////////////////////////////
+ /* -- -- */
+ /* -- COMPUTE constitutive GENE K-MER SIGNATURE -- */
+ /* -- -- */
+ ///////////////////////////////////////////////////////////////////////////////
+ ///////////////////////////////////////////////////////////////////////////////
+ /*
+ * STEP 8 - Compute constitutive Gene K-mer Signature
+ */
+ // No fastas provided for removing constitutive genes
+ process compute_constitutive_kmer_sig {
+ tag "${fasta.baseName}"
+ label "process_low"
+ publishDir "${params.outdir}/reference/constitutive_genes/", mode: 'copy'
+
+ input:
+ val track_abundance
+ val sketch_value_parsed
+ val sketch_style_parsed
+ set val(refseq_moltype), file(fasta) from ch_constitutive_fasta
+
+ output:
+ set val(sourmash_moltypes), file(sig) into ch_constitutive_sig
+
+ script:
+ is_protein = refseq_moltype == "protein"
+ sourmash_moltype = is_protein ? "protein,dayhoff" : 'dna'
+ sourmash_moltypes = tuple(sourmash_moltype.split(","))
+ sketch_id = make_sketch_id(
+ sourmash_moltype,
+ params.ksizes,
+ sketch_value_parsed[0],
+ track_abundance,
+ sketch_style_parsed[0]
+ )
+
+ sketch_value_flag = make_sketch_value_flag(
+ sketch_style_parsed[0],
+ sketch_value_parsed[0]
+ )
+ moltype_flags = is_protein ? '--protein --dayhoff --input-is-protein' : '--dna'
+ track_abundance_flag = track_abundance ? '--track-abundance' : ''
+ sig_id = "${fasta.baseName}__${sketch_id}"
+ sig = "${sig_id}.sig"
+ csv = "${sig_id}.csv"
+ """
+ sourmash compute \\
+ ${sketch_value_flag} \\
+ --ksizes ${params.ksizes} \\
+ ${moltype_flags} \\
+ ${track_abundance_flag} \\
+ --output ${sig} \\
+ --name '${fasta.baseName}' \\
+ ${fasta}
+ sourmash sig describe --csv ${csv} ${sig}
+ """
+ }
+ }
+
+
+ ch_sourmash_sketches_merged
+ // index 2: moltypes
+ // index 4: signature
+ .map { tuple( tuple(it[2].split(",")), it[4] ) }
+ .transpose()
+ .dump( tag: 'ch_sourmash_sketches_moltype_to_sig' )
+ .groupTuple( by: 0 )
+ .dump( tag: 'ch_sourmash_sketches_moltype_to_sig__groupTuple' )
+ .set { ch_sourmash_sketches_moltype_to_sigs }
+
+ ch_constitutive_sig
+ .dump( tag: 'ch_constitutive_sig' )
+ .transpose()
+ .dump( tag: 'ch_constitutive_sig__transposed' )
+ .combine( ch_sourmash_params_for_subtract, by: 0)
+ .dump( tag: 'ch_constitutive_sig__transposed__combined' )
+ .combine ( ch_sourmash_sketches_moltype_to_sigs, by: 0 )
+ .dump( tag: 'ch_constitutive_sig__transposed__combined_joined' )
+ .into { ch_subtract_params_with_sigs; ch_subtract_params_to_sigs_for_siglist }
+
+ ch_subtract_params_to_sigs_for_siglist
+ .dump ( tag: 'ch_subtract_params_to_sigs_for_siglist' )
+ .transpose()
+ .dump ( tag: 'ch_subtract_params_to_sigs_for_siglist__transpose')
+ .collectFile() { it ->
+ [ "${it[0]}__${it[2]}.txt", "${it[3].getFileName()}\n"]
+ }
+ .dump ( tag: 'ch_subtract_params_to_sigs_for_siglist__transpose__collectfile' )
+ .map { [ tuple( it.baseName.split('__') ), it] }
+ .map { [ it[0][0], it[0][1], it[1] ] }
+ // .dump ( tag: 'ch_subtract_params_to_sigs_for_siglist__transpose__collectfile__map' )
+ // .transpose()
+ .dump ( tag: 'ch_subtract_params_with_siglist' )
+ .set { ch_subtract_params_with_siglist }
+
+ ch_subtract_params_with_sigs
+ // Reorder so molecule (it[0]) and ksize (it[2]) are first
+ .map{ [it[0], it[2], it[1], it[3]] }
+ .dump ( tag: 'ch_subtract_params_with_sigs__map' )
+ .combine( ch_subtract_params_with_siglist, by: [0, 1] )
+ .dump( tag: 'ch_sigs_with_constitutive_sig_to_subtract' )
+ .set { ch_sigs_with_constitutive_sig_to_subtract }
+
+
+ // ///////////////////////////////////////////////////////////////////////////////
+ // ///////////////////////////////////////////////////////////////////////////////
+ // /* -- -- */
+ // /* -- REMOVE K-MERS FROM constitutive GENES -- */
+ // /* -- -- */
+ // ///////////////////////////////////////////////////////////////////////////////
+ // ///////////////////////////////////////////////////////////////////////////////
+ // /*
+ // * STEP 9 - Remove constitutive gene k-mers from single cells
+ // */
+ process subtract_constitutive_kmers {
+ tag "${subtract_id}"
+ label "process_medium"
+ publishDir "${params.outdir}/sketches_subtract_constitutive_kmers/${subtract_id}", mode: 'copy'
+
+ input:
+ val sketch_value_parsed
+ val sketch_style_parsed
+ set val(molecule), val(ksize), file(constitutive_sig), file(sigs), file(siglist) from ch_sigs_with_constitutive_sig_to_subtract
+
+ output:
+ set val(molecule), val(ksize), file("subtracted/*.sig") into ch_sigs_constitutive_removed
+
+ script:
+ subtract_id = "${molecule}__k-${ksize}"
+ sketch_value_flag = make_sketch_value_flag(
+ sketch_style_parsed[0],
+ sketch_value_parsed[0]
+ )
+ track_abundance_flag = track_abundance ? '--track-abundance' : ''
+
+ """
+ subtract \\
+ ${track_abundance_flag} \\
+ ${sketch_value_flag} \\
+ --ksize ${ksize} \\
+ --encoding ${molecule} \\
+ --output subtracted/ \\
+ ${constitutive_sig} \\
+ ${siglist}
+ """
+ }
+
+ ch_sigs_constitutive_removed
+ // .groupTuple( by: [0, 1] )
+ .transpose( by: 2 )
+ .set{ ch_sourmash_sketches_to_compare }
+
+} else {
+ ch_sourmash_sketches_merged
+ .map { [tuple(it[2].split(",")), it[4]] }
+ .dump(tag: 'ch_sourmash_sketches_merged__map_split' )
+ .transpose()
+ .dump(tag: 'ch_sourmash_sketches_merged__map_split__tranpose' )
+ // Perform cartesian product on the molecules with compare params
+ .combine( ch_sourmash_params_for_compare, by: 0)
+ .dump(tag: 'ch_sourmash_sketches_merged__map_split__combine' )
+ // .groupTuple(by: [0, 2])
+ .dump(tag: 'ch_sourmash_sketches_to_compare' )
+ // Reorder so signature files are last
+ // moltype, ksize, signature file
+ .map { [it[0], it[2], it[1]] }
+ .set { ch_sourmash_sketches_to_compare }
+
+ ch_sourmash_sig_describe_merged = Channel.empty()
+}
+
+
+
+
if (params.split_kmer){
process ska_compare_sketches {
tag "${sketch_id}"
@@ -1566,57 +1824,23 @@ if (params.split_kmer){
}
// If skip_compute is true, skip compare must be specified as true as well
if (!params.split_kmer && !params.skip_compare && !params.skip_compute) {
- // // Combine peptide and nucleotide sketches
- // sourmash_sketches_nucleotide
- // .collect()
- // // Set as a list so that combine does cartesian product of all signatures
- // .map { it -> [it] }
- // .combine( ch_ksizes_for_compare_nucleotide )
- // .dump( tag: 'sourmash_sketches_nucleotide__ksizes' )
- // .map { x -> [x[0], x[1], 'dna'] }
- // .dump( tag: 'sourmash_sketches_nucleotide__ksizes__molecules' )
- // .set { sourmash_sketches_nucleotide_for_compare }
-
- // sourmash_sketches_peptide
- // .collect()
- // // Set as a list so that combine does cartesian product of all signatures
- // .map { it -> [it] }
- // .combine( ch_ksizes_for_compare_petide )
- // .dump( tag: 'sourmash_sketches_peptide__ksizes' )
- // .combine( ch_peptide_molecules )
- // .dump( tag: 'sourmash_sketches_peptide__ksizes__molecules' )
- // .set { sourmash_sketches_peptide_for_compare }
-
- // sourmash_sketches_peptide_for_compare
- // .mix ( sourmash_sketches_nucleotide_for_compare )
- // .set { ch_sourmash_sketches_to_compare }
-
- // ch_sourmash_sketches_to_compare = Channel.empty()
-
- ch_peptide_molecules_for_compare
- .combine( ch_ksizes_for_compare_peptide )
- .set { ch_sourmash_compare_params_peptide }
-
- Channel.from("dna")
- .combine( ch_ksizes_for_compare_nucleotide )
- .mix ( ch_sourmash_compare_params_peptide )
- .set { ch_sourmash_compare_params_both }
+ ch_sourmash_compare_sketch_params_to_sketches = Channel.create()
- ch_sourmash_sketches_merged
- // Drop first index (index 0) which is the cell id
- // Drop the second index (index 1) which is the sketch id
- // Keep only moltype
- // Drop ksize
- .map { [tuple(it[2].split(",")), it[4]] }
- .dump(tag: 'ch_sourmash_sketches_merged__map_split' )
- .transpose()
- .dump(tag: 'ch_sourmash_sketches_merged__map_split__tranpose' )
- // Perform cartesian product on the molecules with compare params
- .combine( ch_sourmash_compare_params_both, by: 0)
- .dump(tag: 'ch_sourmash_sketches_merged__map_split__combine' )
- .groupTuple(by: [0, 2])
- .dump(tag: 'ch_sourmash_sketches_to_compare' )
- .set { ch_sourmash_sketches_to_compare }
+ ch_sourmash_sketches_to_compare
+ .tap ( ch_sourmash_compare_sketch_params_to_sketches )
+ .dump( tag: 'ch_compare_params_to_sigs_for_siglist__transpose' )
+ .collectFile() { it ->
+ [ "${it[0]}__${it[1]}.txt", "${it[2].getFileName()}\n"]
+ }
+ .dump ( tag: 'ch_compare_params_to_sigs_for_siglist__transpose__collectfile' )
+ .map { [ tuple( it.baseName.split('__') ), it] }
+ .map { [ it[0][0], it[0][1], it[1] ] }
+ .dump ( tag: 'ch_compare_params_with_siglist' )
+ .combine( ch_sourmash_compare_sketch_params_to_sketches, by: [0, 1] )
+ .dump( tag: 'ch_compare_params_with_siglist__add_sketches' )
+ .groupTuple( by: [0, 1, 2] )
+ .dump ( tag: 'ch_compare_params_with_siglist__add_sketches__groupTuple' )
+ .set { ch_sourmash_params_to_siglist_sketches }
process sourmash_compare_sketches {
// Combine peptide and nucleotide sketches
@@ -1624,8 +1848,8 @@ if (!params.split_kmer && !params.skip_compare && !params.skip_compute) {
publishDir "${params.outdir}/compare_sketches", mode: 'copy'
input:
- // Weird order but that's how it shakes out with the groupTuple
- set val(molecule), file("*.sig"), val(ksize) from ch_sourmash_sketches_to_compare
+ // file(sigs) is necessary to stage all the signature files present in file(siglist)
+ set val(molecule), val(ksize), file(siglist), file(sigs) from ch_sourmash_params_to_siglist_sketches
output:
file(csv)
@@ -1640,7 +1864,7 @@ if (!params.split_kmer && !params.skip_compare && !params.skip_compute) {
--${molecule} \\
--csv ${csv} \\
${processes} \\
- --traverse-directory .
+ --from-file ${siglist}
# Use --traverse-directory instead of all the files explicitly to avoid
# "too many arguments" error for bash when there are lots of samples
"""
@@ -1662,7 +1886,6 @@ if (!params.skip_multiqc){
file ("sourmash_describe_peptides/") from ch_sourmash_sig_describe_peptides.collect().ifEmpty([])
file ("sourmash_describe_nucleotides/") from ch_sourmash_sig_describe_nucleotides.collect().ifEmpty([])
file ('fastp/*') from ch_fastp_results.collect().ifEmpty([])
- file ('sortmerna/*') from sortmerna_logs.collect().ifEmpty([])
file ('software_versions/*') from ch_software_versions_yaml.collect()
file workflow_summary from ch_workflow_summary.collectFile(name: "workflow_summary_mqc.yaml")
diff --git a/nextflow.config b/nextflow.config
index 589d186f..7bdc2902 100644
--- a/nextflow.config
+++ b/nextflow.config
@@ -16,7 +16,7 @@ params {
fastas = false
protein_fastas = false
sra = false
-
+ bam = false
input = false
// Parsing 10x bam files
@@ -34,7 +34,7 @@ params {
// Number of hashes from each sample
sketch_num_hashes = false
sketch_num_hashes_log2 = false
- sketch_scaled = false
+ sketch_scaled = 10
sketch_scaled_log2 = false
skip_sig_merge = false
@@ -44,24 +44,30 @@ params {
// Computing sketches
skip_compute = false
+ // Skip trimming of adapters and poly-X sequences
skip_trimming = false
// translate options
translate_peptide_ksize = 8
translate_peptide_molecule = 'protein'
translate_jaccard_threshold = 0.05
- reference_proteome_fasta = false
+ translate_proteome_fasta = false
bloomfilter_tablesize = '1e8'
// Saving the translate results for each dataset makes it take extra long
// Recommended for debugging purposes only
save_translate_csv = false
save_translate_json = false
-
- // Ribosomal RNA removal
- remove_ribo_rna = false
- save_non_rrna_reads = false
- rrna_database_manifest = "${baseDir}/assets/rrna-db-defaults.txt"
+ // constitutive gene k-mer removal
+ skip_remove_constitutive_genes = false
+ constitutive_gene_regex = "ribosom|mito|ubiqui|ferritin|cytochrome|eukaryotic translation|heat shock|NADH|NADPH"
+ constitutive_refseq_taxonomy = 'vertebrate_mammalian'
+ // For testing purposes --> use a small refseq dataset
+ test_mini_refseq_download = false
+ constitutive_protein_fasta = false
+ constitutive_rna_fasta = false
+ constitutive_protein_sig = false
+ constitutive_rna_sig = false
// ska options
split_kmer = false
@@ -71,7 +77,7 @@ params {
save_fastas = "fastas"
tenx_min_umi_per_cell = '0'
write_barcode_meta_csv = false
- bam = false
+
// 10x optional input parameters set using the below pattern
// https://github.com/nextflow-io/patterns/blob/master/docs/optional-input.adoc
@@ -150,6 +156,9 @@ profiles {
test_ska { includeConfig 'conf/test_ska.config' }
test_bam { includeConfig 'conf/test_bam.config' }
test_fastas { includeConfig 'conf/test_fastas.config' }
+ test_constitutive_from_download_refseq { includeConfig 'conf/test_constitutive_from_download_refseq.config' }
+ test_constitutive_from_fasta { includeConfig 'conf/test_constitutive_from_fasta.config' }
+ test_constitutive_from_sig { includeConfig 'conf/test_constitutive_from_sig.config' }
test_protein_fastas { includeConfig 'conf/test_protein_fastas.config' }
test_remove_ribo { includeConfig 'conf/test_remove_ribo.config' }
test_sig_merge { includeConfig 'conf/test_sig_merge.config' }
diff --git a/scratch.nf b/scratch.nf
new file mode 100644
index 00000000..617d336d
--- /dev/null
+++ b/scratch.nf
@@ -0,0 +1,10 @@
+housekeeping_protein_fasta = false
+housekeeping_rna_fasta = true
+
+ch_refseq_moltype_to_fasta = Channel.from(["protein", housekeeping_protein_fasta], ["rna", housekeeping_rna_fasta])
+ch_refseq_moltype_to_fasta
+ // filter if the second item, the fasta is false
+ .filter{ !it[1] }
+ // Take only the first item, the molecule type
+ .map{ it[0] }
+ .println()
diff --git a/siglist.txt b/siglist.txt
new file mode 100644
index 00000000..43fc713a
--- /dev/null
+++ b/siglist.txt
@@ -0,0 +1,4 @@
+SRR4238351__molecule-dna__ksize-21,30,51__scaled-2__track_abundance-true.sig
+SRR4238355__molecule-dna__ksize-21,30,51__scaled-2__track_abundance-true.sig
+SRR4050380__molecule-dna__ksize-21,30,51__scaled-2__track_abundance-true.sig
+SRR4050379__molecule-dna__ksize-21,30,51__scaled-2__track_abundance-true.sig