From 7b147ba9a5d146718becdc1577f01f3883910c24 Mon Sep 17 00:00:00 2001 From: Olga Botvinnik Date: Tue, 9 Mar 2021 12:41:11 -0800 Subject: [PATCH 01/43] Remove SortMeRNA from requirements --- environment.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/environment.yml b/environment.yml index 403f1a6b..37c5b912 100644 --- a/environment.yml +++ b/environment.yml @@ -12,7 +12,7 @@ dependencies: - conda-forge::pymdown-extensions=6.0 - conda-forge::pygments=2.5.2 - conda-forge::tqdm=4.43.0 - - conda-forge::gxx_linux-64=7.3.0 + # - conda-forge::gxx_linux-64=7.3.0 - conda-forge::s3fs=0.4.2 - bioconda::sourmash=3.5.0 - bioconda::samtools=1.10 @@ -33,8 +33,8 @@ dependencies: - ska=1.0 - sphinx=2.3.1 - jupyter=1.0.0 - - sortmerna=2.1b # for metatranscriptomics - ripgrep=12.1.1 + - conda-forge::rust=1.48.0 - pip: - bam2fasta==1.0.8 - sencha==1.0.3 \ No newline at end of file From b3d415cebdeb8c815e5b5ff6bdbf3382acaf1106 Mon Sep 17 00:00:00 2001 From: Olga Botvinnik Date: Tue, 9 Mar 2021 12:41:40 -0800 Subject: [PATCH 02/43] Add Luiz's remove-many code --- Dockerfile | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/Dockerfile b/Dockerfile index 46764af7..e516001e 100755 --- a/Dockerfile +++ b/Dockerfile @@ -12,6 +12,12 @@ ENV PATH /opt/conda/envs/nf-core-kmermaid-0.1.0dev/bin:$PATH # Dump the details of the installed packages to a file for posterity RUN conda env export --name nf-core-kmermaid-0.1.0dev > nf-core-kmermaid-0.1.0dev.yml +# Install super fast rust code to remove nuisance hashes (e.g. ribosomal) from signatures +RUN git clone https://github.com/luizirber/2021-01-27-olga-remove-protein/ +RUN cd 2021-01-27-olga-remove-protein && cargo build --release +# Add "subtract" command to path +ENV PATH $HOME/2021-01-27-olga-remove-protein/target/release:$PATH + # Instruct R processes to use these empty files instead of clashing with a local version RUN touch .Rprofile RUN touch .Renviron From 1a4855594f13d4f7e6fcdece0242e1d3268b5a1b Mon Sep 17 00:00:00 2001 From: Olga Botvinnik Date: Tue, 9 Mar 2021 12:41:59 -0800 Subject: [PATCH 03/43] Remove SortMeRNA --- main.nf | 251 +++++++++++++++++++++++++++++--------------------------- 1 file changed, 131 insertions(+), 120 deletions(-) diff --git a/main.nf b/main.nf index 8b9bf3d6..c0d1900c 100644 --- a/main.nf +++ b/main.nf @@ -386,12 +386,12 @@ if (!protein_input) { if (params.subsample && params.skip_trimming ) { subsample_reads_ch_unchecked .ifEmpty{ exit 1, "No reads provided! Check read input files" } - .set { subsample_ch_reads_for_ribosomal_removal } + .set { subsample_ch_reads_to_translate } } if (params.skip_trimming && !(params.bam || params.tenx_tgz)) { reads_ch_unchecked .ifEmpty{ exit 1, "No reads provided! Check read input files" } - .set { ch_reads_for_ribosomal_removal } + .set { ch_reads_to_translate } ch_read_files_trimming_to_check_size = Channel.empty() } else if (params.bam || params.tenx_tgz) { ch_non_bam_reads_unchecked @@ -407,11 +407,11 @@ if (!protein_input) { // Since there exists protein input, don't check if these are empty if (params.subsample) { subsample_reads_ch_unchecked - .set { subsample_ch_reads_for_ribosomal_removal } + .set { subsample_ch_reads_to_translate } } if (params.skip_trimming) { reads_ch_unchecked - .set { ch_reads_for_ribosomal_removal } + .set { ch_reads_to_translate } ch_read_files_trimming_to_check_size = Channel.empty() } else if (!have_nucleotide_fasta_input) { ch_read_files_trimming_unchecked @@ -430,15 +430,6 @@ if (params.split_kmer){ params.ksizes = '21,27,33,51' } -// Get rRNA databases -// Default is set to bundled DB list in `assets/rrna-db-defaults.txt` - -rRNA_database = file(params.rrna_database_manifest) -if (rRNA_database.isEmpty()) {exit 1, "File ${rRNA_database.getName()} is empty!"} -Channel - .from( rRNA_database.readLines() ) - .map { row -> file(row) } - .set { sortmerna_fasta } // --- Parse Translate parameters --- save_translate_csv = params.save_translate_csv @@ -524,6 +515,26 @@ else { barcode_metadata_folder = "barcode_metadata" } + +////////////////////////////////////////////////////////// +/* -- Parse Housekeeping K-mer removal parameters -- */ +///////////////////////////////////////////////////////// +housekeeping_protein_fasta = params.housekeeping_protein_fasta +housekeeping_rna_fasta = params.housekeeping_rna_fasta + +need_refseq_download = !housekeeping_protein_fasta && !housekeeping_rna_fasta + +ch_refseq_moltype_to_fasta = Channel.from(["protein", housekeeping_protein_fasta], ["rna", housekeeping_rna_fasta]) +ch_refseq_moltype_to_fasta + // filter if the second item, the fasta is false + .filter{ !it[1] } + // Take only the first item, the molecule type + .map{ it[0] } + .set{ ch_refseq_moltypes_to_download } + +// Parse refseq taxonomy group to download +refseq_taxonomy = params.refseq_taxonomy + // Has the run name been specified by the user? // this has the bonus effect of catching both -name and --name custom_runName = params.name @@ -851,8 +862,8 @@ if (params.tenx_tgz || params.bam) { // Put fastqs from aligned and unaligned reads into a single channel tenx_reads_aligned_concatenation_ch .mix( tenx_reads_unaligned_ch ) - .dump(tag: "tenx_ch_reads_for_ribosomal_removal") - .set{ tenx_ch_reads_for_ribosomal_removal } + .dump(tag: "tenx_ch_reads_to_translate") + .set{ tenx_ch_reads_to_translate } if ((params.tenx_min_umi_per_cell > 0) || !params.barcodes_file) { process count_umis_per_cell { @@ -898,14 +909,14 @@ if (params.tenx_tgz || params.bam) { good_barcodes_ch = tenx_bam_barcodes_ch } - tenx_ch_reads_for_ribosomal_removal + tenx_ch_reads_to_translate .combine( good_barcodes_ch, by: 0 ) - .dump( tag: 'tenx_ch_reads_for_ribosomal_removal__combine__good_barcodes_ch' ) + .dump( tag: 'tenx_ch_reads_to_translate__combine__good_barcodes_ch' ) .map{ it -> [it[0], it[1], it[2], it[3].splitText()] } .transpose() - .dump( tag: 'tenx_ch_reads_for_ribosomal_removal__combine__good_barcodes_ch__transpose' ) + .dump( tag: 'tenx_ch_reads_to_translate__combine__good_barcodes_ch__transpose' ) .map{ it -> [it[0], it[1], it[2], it[3].replaceAll("\\s+", "") ] } - .dump( tag: 'tenx_ch_reads_for_ribosomal_removal__combine__good_barcodes_ch__transpose__no_newlines' ) + .dump( tag: 'tenx_ch_reads_to_translate__combine__good_barcodes_ch__transpose__no_newlines' ) .set{ tenx_reads_with_good_barcodes_ch } process extract_per_cell_fastqs { @@ -949,8 +960,8 @@ if (params.tenx_tgz || params.bam) { // // Filtering out fastq.gz files less than 200 bytes (arbitary number) // // ~200 bytes is about the size of a file with a single read or less // // We can't use .size() > 0 because it's fastq.gz is gzipped content - // per_channel_cell_ch_reads_for_ribosomal_removal - // .dump(tag: 'per_channel_cell_ch_reads_for_ribosomal_removal') + // per_channel_cell_ch_reads_to_translate + // .dump(tag: 'per_channel_cell_ch_reads_to_translate') // .flatten() // .filter{ it -> it.size() > 200 } // each item is just a single file, no need to do it[1] // .map{ it -> tuple(it.simpleName, file(it)) } @@ -960,7 +971,7 @@ if (params.tenx_tgz || params.bam) { if (params.skip_trimming) { ch_non_bam_reads .concat(per_cell_fastqs_ch) - .set { ch_reads_for_ribosomal_removal } + .set { ch_reads_to_translate } } else { ch_non_bam_reads .mix ( per_cell_fastqs_ch ) @@ -1053,10 +1064,10 @@ if ( have_nucleotide_input ) { ch_reads_trimmed .concat( fastas_ch ) .dump ( tag: 'trimmed_reads__concat_fastas' ) - .set { subsample_ch_reads_for_ribosomal_removal } + .set { subsample_ch_reads_to_translate } } else { // Concatenate trimmed reads with fastas for signature generation - ch_reads_for_ribosomal_removal = ch_reads_trimmed.concat(fastas_ch) + ch_reads_to_translate = ch_reads_trimmed.concat(fastas_ch) } } else { ch_fastp_results = Channel.from(false) @@ -1068,10 +1079,10 @@ if (params.subsample) { publishDir "${params.outdir}/seqtk/", mode: params.publish_dir_mode input: - set val(id), file(reads) from subsample_ch_reads_for_ribosomal_removal + set val(id), file(reads) from subsample_ch_reads_to_translate output: - set val(id), file("*_${params.subsample}.fastq.gz") into ch_reads_for_ribosomal_removal + set val(id), file("*_${params.subsample}.fastq.gz") into ch_reads_to_translate script: read1 = reads[0] @@ -1086,99 +1097,6 @@ if (params.subsample) { } } -/* - * STEP 2+ - SortMeRNA - remove rRNA sequences on request - */ -if (!params.remove_ribo_rna) { - ch_reads_for_ribosomal_removal - .set { ch_reads_to_translate } - sortmerna_logs = Channel.empty() -} else { - process sortmerna_index { - label 'mid_memory_long' - label 'mid_cpu' - tag "${fasta.baseName}" - - input: - file(fasta) from sortmerna_fasta - - output: - val("${fasta.baseName}") into sortmerna_db_name - file("$fasta") into sortmerna_db_fasta - file("${fasta.baseName}*") into sortmerna_db - - script: - """ - indexdb_rna --ref $fasta,${fasta.baseName} -m 3072 -v - """ - } - - process sortmerna { - label 'mid_memory_long' - label 'mid_cpu' - tag "$name" - publishDir "${params.outdir}/SortMeRNA", mode: "${params.publish_dir_mode}", - saveAs: {filename -> - if (filename.indexOf("_rRNA_report.txt") > 0) "logs/$filename" - else if (params.save_non_rrna_reads) "reads/$filename" - else null - } - - input: - set val(name), file(reads) from ch_reads_for_ribosomal_removal - val(db_name) from sortmerna_db_name.collect() - file(db_fasta) from sortmerna_db_fasta.collect() - file(db) from sortmerna_db.collect() - - output: - set val(name), file("*.fq.gz") into ch_reads_to_translate - file "*_rRNA_report.txt" into sortmerna_logs - - - script: - //concatenate reference files: ${db_fasta},${db_name}:${db_fasta},${db_name}:... - def Refs = '' - for (i=0; i single end - if (reads[1] == null) { - """ - gzip -d --force < ${reads} > all-reads.fastq - sortmerna --ref ${Refs} \ - --reads all-reads.fastq \ - --num_alignments 1 \ - -a ${task.cpus} \ - --fastx \ - --aligned rRNA-reads \ - --other non-rRNA-reads \ - --log -v - gzip --force < non-rRNA-reads.fastq > ${name}.fq.gz - mv rRNA-reads.log ${name}_rRNA_report.txt - """ - } else { - """ - gzip -d --force < ${reads[0]} > reads-fw.fq - gzip -d --force < ${reads[1]} > reads-rv.fq - merge-paired-reads.sh reads-fw.fq reads-rv.fq all-reads.fastq - sortmerna --ref ${Refs} \ - --reads all-reads.fastq \ - --num_alignments 1 \ - -a ${task.cpus} \ - --fastx --paired_in \ - --aligned rRNA-reads \ - --other non-rRNA-reads \ - --log -v - unmerge-paired-reads.sh non-rRNA-reads.fastq non-rRNA-reads-fw.fq non-rRNA-reads-rv.fq - gzip < non-rRNA-reads-fw.fq > ${name}-fw.fq.gz - gzip < non-rRNA-reads-rv.fq > ${name}-rv.fq.gz - mv rRNA-reads.log ${name}_rRNA_report.txt - """ - } - } - } - - if (params.reference_proteome_fasta){ process translate { @@ -1399,6 +1317,100 @@ if (!params.skip_compute && (protein_input || params.reference_proteome_fasta)){ sourmash_sketches_peptide = Channel.empty() } + +if (!params.skip_remove_housekeeping_genes) { + /////////////////////////////////////////////////////////////////////////////// + /////////////////////////////////////////////////////////////////////////////// + /* -- -- */ + /* -- REMOVE K-MERS FROM HOUSEKEEPING GENES -- */ + /* -- -- */ + /////////////////////////////////////////////////////////////////////////////// + /////////////////////////////////////////////////////////////////////////////// + + + + /////////////////////////////////////////////////////////////////////////////// + /////////////////////////////////////////////////////////////////////////////// + /* -- -- */ + /* -- DOWNLOAD NUCLEOTIDE AND PROTEIN SEQS FROM REFSEQ -- */ + /* -- -- */ + /////////////////////////////////////////////////////////////////////////////// + /////////////////////////////////////////////////////////////////////////////// + /* + * STEP 6 - rsync to download refeseq + */ + if (need_refseq_download){ + // No fastas provided for removing housekeeping genes + process download_refseq { + tag "${refseq_taxonomy}" + label "process_low" + publishDir "${params.outdir}/ncbi_refseq/", mode: 'copy' + + input: + val refseq_moltype from ch_refseq_moltypes_to_download + + output: + set val(refseq_moltype), file(output_fasta) into ch_refseq_fasta_to_filter + + script: + output_fasta = "${refseq_taxonomy}--\$RELEASE_NUMBER--\$DATE.${refseq_moltype}.fa.gz" + """ + rsync \\ + --prune-empty-dirs \\ + --archive \\ + --verbose \\ + --recursive \\ + --include '*${refseq_moltype}.faa.gz' \\ + --exclude '/*' \\ + rsync://ftp.ncbi.nlm.nih.gov/refseq/release/${refseq_taxonomy}/ . + wget https://ftp.ncbi.nlm.nih.gov/refseq/release/RELEASE_NUMBER + DATE=\$(date +'%Y-%M-%d') + RELEASE_NUMBER=\$(cat RELEASE_NUMBER) + zcat *.${refseq_moltype}.faa.gz | gzip -c - > ${output_fasta} + """ + } + } + + /////////////////////////////////////////////////////////////////////////////// + /////////////////////////////////////////////////////////////////////////////// + /* -- -- */ + /* -- REMOVE K-MERS FROM HOUSEKEEPING GENES -- */ + /* -- -- */ + /////////////////////////////////////////////////////////////////////////////// + /////////////////////////////////////////////////////////////////////////////// + /* + * STEP 7 - Filter fastas from refseq + */ + if (need_refseq_download){ + // No fastas provided for removing housekeeping genes + process filter_fasta_housekeeping { + tag "${refseq_taxonomy}" + label "process_low" + publishDir "${params.outdir}/ncbi_refseq/", mode: 'copy' + + input: + set val(refseq_moltype), file(fasta) from ch_refseq_fasta_to_filter + + output: + set val(refseq_moltype), file(output_fasta_gz) into (ch_houskeeping_fasta) + + script: + output_fasta = "${fasta.basename}__only_housekeeping_genes.fa" + output_fasta_gz = "${fasta.basename}__only_housekeeping_genes.fa.gz" + """ + filter_fasta_regex.py \\ + --input-fasta ${fasta} \\ + --output-fasta ${output_fasta} \\ + --regex-pattern '${params.housekeeping_gene_regex}' + gzip -c ${output_fasta} > ${output_fasta_gz} + """ + } + } +} + + + + if (params.split_kmer){ process ska_compare_sketches { tag "${sketch_id}" @@ -1481,7 +1493,6 @@ if (!params.skip_multiqc){ input: file multiqc_config from ch_multiqc_config file ('fastp/*') from ch_fastp_results.collect().ifEmpty([]) - file ('sortmerna/*') from sortmerna_logs.collect().ifEmpty([]) file ('software_versions/*') from ch_software_versions_yaml.collect() file workflow_summary from ch_workflow_summary.collectFile(name: "workflow_summary_mqc.yaml") From edd1dd111f75755b99ed242bdf22e8156cbe3a0f Mon Sep 17 00:00:00 2001 From: Olga Botvinnik Date: Tue, 9 Mar 2021 12:42:10 -0800 Subject: [PATCH 04/43] Add parameters for housekeeping gene removal --- .vscode/settings.json | 6 +++++ bin/filter_fasta_regex.py | 44 ++++++++++++++++++++++++++++++++ conf/test_download_refseq.config | 39 ++++++++++++++++++++++++++++ nextflow.config | 13 ++++++---- scratch.nf | 10 ++++++++ 5 files changed, 107 insertions(+), 5 deletions(-) create mode 100644 .vscode/settings.json create mode 100644 bin/filter_fasta_regex.py create mode 100644 conf/test_download_refseq.config create mode 100644 scratch.nf diff --git a/.vscode/settings.json b/.vscode/settings.json new file mode 100644 index 00000000..3022f6a1 --- /dev/null +++ b/.vscode/settings.json @@ -0,0 +1,6 @@ +{ + "python.linting.pylintEnabled": false, + "python.linting.flake8Enabled": true, + "python.linting.enabled": true, + "python.formatting.provider": "black" +} \ No newline at end of file diff --git a/bin/filter_fasta_regex.py b/bin/filter_fasta_regex.py new file mode 100644 index 00000000..97cfb764 --- /dev/null +++ b/bin/filter_fasta_regex.py @@ -0,0 +1,44 @@ +import argparse +import re + + +import screed + + +def write_records_to_fasta(records, fasta): + with open(fasta, "w") as f: + for record in records: + f.write(f'>{record["name"]}\n{record["sequence"]}\n') + + +def filter_records(fasta, pattern): + filtered_records = [] + with screed.open(fasta) as records: + for record in records: + name = record["name"] + if re.findall(pattern, name, flags=re.I): + filtered_records.append(record) + return filtered_records + + +def filter_fasta_with_regex(fasta_to_filter, out_fasta, regex): + record_subset = filter_records(fasta_to_filter, regex) + write_records_to_fasta(record_subset, out_fasta) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser( + description="""Extract sequences whose names match a pattern""" + ) + parser.add_argument("--input-fasta", type=str, help="Sequence file to filter") + parser.add_argument("--output-fasta", type=str, help="File to write") + parser.add_argument( + "--regex-pattern", + type=str, + help="Regular expression pattern to match for the names of seuqences in the file", + ) + args = parser.parse_args() + + filter_fasta_with_regex( + parser.input_fasta, parser.output_fasta, parser.regex_pattern + ) diff --git a/conf/test_download_refseq.config b/conf/test_download_refseq.config new file mode 100644 index 00000000..a7db4d8d --- /dev/null +++ b/conf/test_download_refseq.config @@ -0,0 +1,39 @@ +/* + * ------------------------------------------------- + * Nextflow config file for running tests + * ------------------------------------------------- + * Defines bundled input files and everything required + * to run a fast and simple test. Use as follows: + * nextflow run nf-core/kmermaid -profile test + */ + +params { + config_profile_name = 'Test profile' + config_profile_description = 'Minimal test dataset to check pipeline function' + // Limit resources so that this can run on Travis + max_cpus = 2 + max_memory = 6.GB + max_time = 48.h + // Input data + // samples = 'testing/samples.csv' + // fastas = 'testing/fastas/*.fasta' + ksizes = '3,9' + sketch_num_hashes_log2 = '2,4' + molecules = 'dna,protein,dayhoff' + // read_pairs = 'testing/fastqs/*{1,2}.fastq.gz' + // sra = "SRP016501" + remove_ribo_rna = true + save_non_rrna_reads = true + input_paths = [ + ['SRR4050379', ['https://github.com/nf-core/test-datasets/raw/kmermaid/testdata/SRR4050379_pass_1.fastq.gz', + 'https://github.com/nf-core/test-datasets/raw/kmermaid/testdata/SRR4050379_pass_2.fastq.gz']], + ['SRR4050380', ['https://github.com/nf-core/test-datasets/raw/kmermaid/testdata/SRR4050380_pass_1.fastq.gz', + 'https://github.com/nf-core/test-datasets/raw/kmermaid/testdata/SRR4050380_pass_2.fastq.gz']], + ['SRR4238351', ['https://github.com/nf-core/test-datasets/raw/rnaseq/testdata/SRR4238351_subsamp.fastq.gz']], + ['SRR4238355', ['https://github.com/nf-core/test-datasets/raw/rnaseq/testdata/SRR4238355_subsamp.fastq.gz']], + ] + + // "Other" is the smallest refseq taxonomy subdirectory: ftp://ftp.ncbi.nlm.nih.gov/refseq/release/other/ + // Protein fasta is 453 B + refseq_taxonomy = 'other' +} diff --git a/nextflow.config b/nextflow.config index 46d3ca99..b35df235 100644 --- a/nextflow.config +++ b/nextflow.config @@ -42,6 +42,7 @@ params { // Computing sketches skip_compute = false + // Skip trimming of adapters and poly-X sequences skip_trimming = false // translate options @@ -55,11 +56,12 @@ params { save_translate_csv = false save_translate_json = false - - // Ribosomal RNA removal - remove_ribo_rna = false - save_non_rrna_reads = false - rrna_database_manifest = "${baseDir}/assets/rrna-db-defaults.txt" + // Housekeeping gene k-mer removal + skip_remove_housekeeping_genes = false + housekeeping_gene_regex = "ribosom|mito|ubiqui|ferritin|cytochrome|eukaryotic translation|heat shock|NADH|NADPH" + refseq_taxonomy = 'vertebrate_mammalian' + housekeeping_protein_fasta = false + housekeeping_rna_fasta = false // ska options split_kmer = false @@ -145,6 +147,7 @@ profiles { podman.enabled = true } test { includeConfig 'conf/test.config' } + test_download_refseq { includeConfig 'conf/test_download_refseq.config' } test_full { includeConfig 'conf/test_full.config' } test_ska { includeConfig 'conf/test_ska.config' } test_bam { includeConfig 'conf/test_bam.config' } diff --git a/scratch.nf b/scratch.nf new file mode 100644 index 00000000..617d336d --- /dev/null +++ b/scratch.nf @@ -0,0 +1,10 @@ +housekeeping_protein_fasta = false +housekeeping_rna_fasta = true + +ch_refseq_moltype_to_fasta = Channel.from(["protein", housekeeping_protein_fasta], ["rna", housekeeping_rna_fasta]) +ch_refseq_moltype_to_fasta + // filter if the second item, the fasta is false + .filter{ !it[1] } + // Take only the first item, the molecule type + .map{ it[0] } + .println() From 5212de1395fa7c38dc379f6671f078e0272b8196 Mon Sep 17 00:00:00 2001 From: Olga Botvinnik Date: Wed, 10 Mar 2021 08:48:06 -0800 Subject: [PATCH 05/43] Add mini refseq download option for testing --- nextflow.config | 2 ++ 1 file changed, 2 insertions(+) diff --git a/nextflow.config b/nextflow.config index b35df235..95986d63 100644 --- a/nextflow.config +++ b/nextflow.config @@ -60,6 +60,8 @@ params { skip_remove_housekeeping_genes = false housekeeping_gene_regex = "ribosom|mito|ubiqui|ferritin|cytochrome|eukaryotic translation|heat shock|NADH|NADPH" refseq_taxonomy = 'vertebrate_mammalian' + // For testing purposes --> use a small refseq dataset + test_mini_refseq_download = false housekeeping_protein_fasta = false housekeeping_rna_fasta = false From f27b037f23690c38d594afaa78b0240860ecd444 Mon Sep 17 00:00:00 2001 From: Olga Botvinnik Date: Wed, 10 Mar 2021 08:48:41 -0800 Subject: [PATCH 06/43] Add raw quote strings around nf-core lint --- .github/workflows/linting.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.github/workflows/linting.yml b/.github/workflows/linting.yml index 6f2be6b0..a3b25f18 100644 --- a/.github/workflows/linting.yml +++ b/.github/workflows/linting.yml @@ -53,7 +53,7 @@ jobs: run: | python -m pip install --upgrade pip pip install nf-core - +{% raw %} - name: Run nf-core lint env: GITHUB_COMMENTS_URL: ${{ github.event.pull_request.comments_url }} @@ -74,3 +74,4 @@ jobs: lint_log.txt lint_results.md PR_number.txt +{% endraw %} \ No newline at end of file From c1e9c593c9305a8993f2c8ac7f41c7f6f6e66366 Mon Sep 17 00:00:00 2001 From: Olga Botvinnik Date: Wed, 10 Mar 2021 08:48:54 -0800 Subject: [PATCH 07/43] Get fasta filtering working --- bin/filter_fasta_regex.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) mode change 100644 => 100755 bin/filter_fasta_regex.py diff --git a/bin/filter_fasta_regex.py b/bin/filter_fasta_regex.py old mode 100644 new mode 100755 index 97cfb764..ebded98a --- a/bin/filter_fasta_regex.py +++ b/bin/filter_fasta_regex.py @@ -1,3 +1,5 @@ +#!/usr/bin/env python + import argparse import re @@ -39,6 +41,4 @@ def filter_fasta_with_regex(fasta_to_filter, out_fasta, regex): ) args = parser.parse_args() - filter_fasta_with_regex( - parser.input_fasta, parser.output_fasta, parser.regex_pattern - ) + filter_fasta_with_regex(args.input_fasta, args.output_fasta, args.regex_pattern) From 19e9801ff2bd2f8c475d0f505554d6252641af34 Mon Sep 17 00:00:00 2001 From: Olga Botvinnik Date: Wed, 10 Mar 2021 08:49:06 -0800 Subject: [PATCH 08/43] Update test params for download refseq --- conf/test_download_refseq.config | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/conf/test_download_refseq.config b/conf/test_download_refseq.config index a7db4d8d..cebb0acf 100644 --- a/conf/test_download_refseq.config +++ b/conf/test_download_refseq.config @@ -35,5 +35,6 @@ params { // "Other" is the smallest refseq taxonomy subdirectory: ftp://ftp.ncbi.nlm.nih.gov/refseq/release/other/ // Protein fasta is 453 B - refseq_taxonomy = 'other' + refseq_taxonomy = 'vertebrate_mammalian' + test_mini_refseq_download = true } From a8616105553a6bed23794b4d1e0ecfd65afe6c52 Mon Sep 17 00:00:00 2001 From: Olga Botvinnik Date: Wed, 10 Mar 2021 08:50:01 -0800 Subject: [PATCH 09/43] Add Rsync, return gxx linux --- environment.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/environment.yml b/environment.yml index 37c5b912..685d964e 100644 --- a/environment.yml +++ b/environment.yml @@ -12,7 +12,7 @@ dependencies: - conda-forge::pymdown-extensions=6.0 - conda-forge::pygments=2.5.2 - conda-forge::tqdm=4.43.0 - # - conda-forge::gxx_linux-64=7.3.0 + - conda-forge::gxx_linux-64=7.3.0 - conda-forge::s3fs=0.4.2 - bioconda::sourmash=3.5.0 - bioconda::samtools=1.10 @@ -35,6 +35,7 @@ dependencies: - jupyter=1.0.0 - ripgrep=12.1.1 - conda-forge::rust=1.48.0 + - rsync=3.2.3 - pip: - bam2fasta==1.0.8 - sencha==1.0.3 \ No newline at end of file From caf44503e6c2cd124a430c3ce307bf10a7cfa7b4 Mon Sep 17 00:00:00 2001 From: Olga Botvinnik Date: Wed, 10 Mar 2021 08:50:12 -0800 Subject: [PATCH 10/43] Add osx environment yml --- environment_osx.yml | 41 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 41 insertions(+) create mode 100644 environment_osx.yml diff --git a/environment_osx.yml b/environment_osx.yml new file mode 100644 index 00000000..6de5700e --- /dev/null +++ b/environment_osx.yml @@ -0,0 +1,41 @@ +# You can use this file to create a conda environment for this pipeline: +# conda env create -f environment.yml +name: nf-core-kmermaid-0.1.0dev +channels: + - bioconda + - conda-forge + - defaults + - anaconda +dependencies: + - conda-forge::python=3.7.3 + - conda-forge::markdown=3.1.1 + - conda-forge::pymdown-extensions=6.0 + - conda-forge::pygments=2.5.2 + - conda-forge::tqdm=4.43.0 + - conda-forge::clangxx_osx-64=11.1.0 + - conda-forge::s3fs=0.4.2 + - bioconda::sourmash=3.5.0 + - bioconda::samtools=1.10 + - bioconda::screed=1.0.4 + - bioconda::khmer=3.0.0a3 + - bioconda::pysam=0.16.0 + - anaconda::make=4.2.1 + - alabaster=0.7.12 + - fastp=0.20.0 + - fastqc=0.11.9 + - matplotlib=3.1.1 # don't upgrade, multiqc conflict + - multiqc=1.8 + - numpy=1.17.5 + - pathos=0.2.5 + - pip=20.0.2 + - pytest=5.3.4 + - seqtk=1.3 + - ska=1.0 + - sphinx=2.3.1 + - jupyter=1.0.0 + - ripgrep=12.1.1 + - conda-forge::rust=1.48.0 + - rsync=3.2.3 + - pip: + - bam2fasta==1.0.8 + - sencha==1.0.3 \ No newline at end of file From 5a56dde47d3c5d210a2fd340b44c1eaac2ef47a0 Mon Sep 17 00:00:00 2001 From: Olga Botvinnik Date: Wed, 10 Mar 2021 08:54:23 -0800 Subject: [PATCH 11/43] download refseq and filter fasta is working --- main.nf | 131 ++++++++++++++++++++++++++++++++++++++++++++------------ 1 file changed, 104 insertions(+), 27 deletions(-) diff --git a/main.nf b/main.nf index c0d1900c..165dc693 100644 --- a/main.nf +++ b/main.nf @@ -1342,31 +1342,32 @@ if (!params.skip_remove_housekeeping_genes) { if (need_refseq_download){ // No fastas provided for removing housekeeping genes process download_refseq { - tag "${refseq_taxonomy}" + tag "${refseq_taxonomy}--${refseq_moltype}" label "process_low" - publishDir "${params.outdir}/ncbi_refseq/", mode: 'copy' + publishDir "${params.outdir}/reference/ncbi_refseq/", mode: 'copy' input: val refseq_moltype from ch_refseq_moltypes_to_download output: - set val(refseq_moltype), file(output_fasta) into ch_refseq_fasta_to_filter + set val(refseq_moltype), file("${refseq_taxonomy}--*.${refseq_moltype}.fa.gz") into ch_refseq_fasta_to_filter script: output_fasta = "${refseq_taxonomy}--\$RELEASE_NUMBER--\$DATE.${refseq_moltype}.fa.gz" + include_fasta = params.test_mini_refseq_download ? "${refseq_taxonomy}.1.${refseq_moltype}.f*a.gz" : "*${refseq_moltype}.f*a.gz" """ rsync \\ --prune-empty-dirs \\ --archive \\ --verbose \\ --recursive \\ - --include '*${refseq_moltype}.faa.gz' \\ + --include '${include_fasta}' \\ --exclude '/*' \\ rsync://ftp.ncbi.nlm.nih.gov/refseq/release/${refseq_taxonomy}/ . wget https://ftp.ncbi.nlm.nih.gov/refseq/release/RELEASE_NUMBER DATE=\$(date +'%Y-%M-%d') RELEASE_NUMBER=\$(cat RELEASE_NUMBER) - zcat *.${refseq_moltype}.faa.gz | gzip -c - > ${output_fasta} + zcat ${refseq_taxonomy}.*.${refseq_moltype}*.gz | gzip -c - > ${output_fasta} """ } } @@ -1379,33 +1380,109 @@ if (!params.skip_remove_housekeeping_genes) { /////////////////////////////////////////////////////////////////////////////// /////////////////////////////////////////////////////////////////////////////// /* - * STEP 7 - Filter fastas from refseq + * STEP 7 - Get only housekeeping genes from */ - if (need_refseq_download){ - // No fastas provided for removing housekeeping genes - process filter_fasta_housekeeping { - tag "${refseq_taxonomy}" - label "process_low" - publishDir "${params.outdir}/ncbi_refseq/", mode: 'copy' + // Keep genes whose names match housekeeping gene regular expression pattern + process extract_fasta_housekeeping { + tag "${refseq_moltype}" + label "process_low" + publishDir "${params.outdir}/reference/housekeeping_genes/", mode: 'copy' - input: - set val(refseq_moltype), file(fasta) from ch_refseq_fasta_to_filter + input: + set val(refseq_moltype), file(fasta) from ch_refseq_fasta_to_filter - output: - set val(refseq_moltype), file(output_fasta_gz) into (ch_houskeeping_fasta) + output: + set val(refseq_moltype), file(output_fasta_gz) into (ch_houskeeping_fasta) - script: - output_fasta = "${fasta.basename}__only_housekeeping_genes.fa" - output_fasta_gz = "${fasta.basename}__only_housekeeping_genes.fa.gz" - """ - filter_fasta_regex.py \\ - --input-fasta ${fasta} \\ - --output-fasta ${output_fasta} \\ - --regex-pattern '${params.housekeeping_gene_regex}' - gzip -c ${output_fasta} > ${output_fasta_gz} - """ - } + script: + output_fasta = "${fasta.baseName}__only_housekeeping_genes.fa" + output_fasta_gz = "${fasta.baseName}__only_housekeeping_genes.fa.gz" + """ + filter_fasta_regex.py \\ + --input-fasta ${fasta} \\ + --output-fasta ${output_fasta} \\ + --regex-pattern '${params.housekeeping_gene_regex}' + gzip -c ${output_fasta} > ${output_fasta_gz} + """ + } + + /////////////////////////////////////////////////////////////////////////////// + /////////////////////////////////////////////////////////////////////////////// + /* -- -- */ + /* -- COMPUTE HOUSEKEEPING GENE K-MER SIGNATURE -- */ + /* -- -- */ + /////////////////////////////////////////////////////////////////////////////// + /////////////////////////////////////////////////////////////////////////////// + /* + * STEP 8 - Compute Housekeeping Gene K-mer Signature + */ + // No fastas provided for removing housekeeping genes + process compute_housekeeping_kmer_sig { + tag "${refseq_moltype}" + label "process_low" + publishDir "${params.outdir}/reference/housekeeping_genes/", mode: 'copy' + + input: + set val(refseq_moltype), file(fasta) from ch_houskeeping_fasta + + output: + set val(sourmash_moltype), file(sig) into ch_houskeeping_sig + + script: + sourmash_moltype = refseq_moltype == "protein" ? "protein,dayhoff" : 'dna' + sketch_id = make_sketch_id(sourmash_moltype, params.ksizes, sketch_value, track_abundance, sketch_style) + + moltype_flags = refseq_moltype == "protein" ? '--protein --dayhoff' : '--dna' + track_abundance_flag = track_abundance ? '--track-abundance' : '' + sig_id = "${ch_houskeeping_fasta.baseName}__${sketch_id}" + sig = "${sig_id}.sig" + csv = "${sig_id}.csv" + """ + sourmash compute \\ + ${sketch_value_flag} \\ + --ksizes ${params.ksizes} \\ + ${moltype_flags} \\ + ${track_abundance_flag} \\ + --output ${sig} \\ + --name '${sample_id}' \\ + ${fasta} + sourmash sig describe --csv ${csv} ${sig} + """ } + + + // /////////////////////////////////////////////////////////////////////////////// + // /////////////////////////////////////////////////////////////////////////////// + // /* -- -- */ + // /* -- REMOVE K-MERS FROM HOUSEKEEPING GENES -- */ + // /* -- -- */ + // /////////////////////////////////////////////////////////////////////////////// + // /////////////////////////////////////////////////////////////////////////////// + // /* + // * STEP 9 - Remove housekeeping gene k-mers from single cells + // */ + // process subtract_houskeeping_kmers { + // tag "${refseq_taxonomy}" + // label "process_low" + // publishDir "${params.outdir}/reference/housekeeping_genes/", mode: 'copy' + + // input: + // set val(refseq_moltype), file(fasta) from ch_refseq_fasta_to_filter + + // output: + // set val(refseq_moltype), file(output_fasta_gz) into (ch_houskeeping_fasta) + + // script: + // output_fasta = "${fasta.baseName}__only_housekeeping_genes.fa" + // output_fasta_gz = "${fasta.baseName}__only_housekeeping_genes.fa.gz" + // """ + // filter_fasta_regex.py \\ + // --input-fasta ${fasta} \\ + // --output-fasta ${output_fasta} \\ + // --regex-pattern '${params.housekeeping_gene_regex}' + // gzip -c ${output_fasta} > ${output_fasta_gz} + // """ + // } } From 4d49662495be7e746d929a8e58271b59dbcce1cd Mon Sep 17 00:00:00 2001 From: Olga Botvinnik Date: Wed, 10 Mar 2021 09:07:04 -0800 Subject: [PATCH 12/43] Add missing quote --- bin/validate_sketch_value.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bin/validate_sketch_value.py b/bin/validate_sketch_value.py index d96cff7f..497d92bd 100755 --- a/bin/validate_sketch_value.py +++ b/bin/validate_sketch_value.py @@ -20,7 +20,7 @@ def get_sketch_value(value, value_log2): if "," in value: logger.exception( f"Can only provide a single number to --sketch_num_hashes or" - f" --sketch_scaled. Provided '{value}" + f" --sketch_scaled. Provided '{value}'" ) sketch_value = int(value) else: From ed82230cf4d5aa303c5597578fe57a05b1f05923 Mon Sep 17 00:00:00 2001 From: Olga Botvinnik Date: Wed, 10 Mar 2021 09:07:22 -0800 Subject: [PATCH 13/43] Move merged sigs to view --- main.nf | 21 ++++++++++++++++++--- 1 file changed, 18 insertions(+), 3 deletions(-) diff --git a/main.nf b/main.nf index a59b9162..dfdddaf1 100644 --- a/main.nf +++ b/main.nf @@ -1428,6 +1428,10 @@ if ((params.bam || params.tenx_tgz) && !params.skip_compute && !params.skip_sig_ """ } + ch_sourmash_sketches_merged_to_view + .dump( tag: "ch_sourmash_sketches_to_view" ) + + } else if (!params.skip_compute) { sourmash_sketches_nucleotide .mix ( sourmash_sketches_peptide ) @@ -1544,6 +1548,9 @@ if (!params.skip_remove_housekeeping_genes) { publishDir "${params.outdir}/reference/housekeeping_genes/", mode: 'copy' input: + val track_abundance + val sketch_value_parsed + val sketch_style_parsed set val(refseq_moltype), file(fasta) from ch_houskeeping_fasta output: @@ -1551,7 +1558,15 @@ if (!params.skip_remove_housekeeping_genes) { script: sourmash_moltype = refseq_moltype == "protein" ? "protein,dayhoff" : 'dna' - sketch_id = make_sketch_id(sourmash_moltype, params.ksizes, sketch_value, track_abundance, sketch_style) + sketch_id = make_sketch_id( + peptide_molecules_comma_separated, + params.ksizes, + sketch_value_parsed[0], + track_abundance, + sketch_style_parsed[0] + ) + + sketch_value_flag = make_sketch_value_flag(sketch_style_parsed[0], sketch_value_parsed[0]) moltype_flags = refseq_moltype == "protein" ? '--protein --dayhoff' : '--dna' track_abundance_flag = track_abundance ? '--track-abundance' : '' @@ -1654,8 +1669,6 @@ if (!params.split_kmer && !params.skip_compare && !params.skip_compute) { // sourmash_sketches_peptide_for_compare // .mix ( sourmash_sketches_nucleotide_for_compare ) // .set { ch_sourmash_sketches_to_compare } - ch_sourmash_sketches_merged_to_view - .dump( tag: "ch_sourmash_sketches_to_view" ) ch_peptide_molecules_for_compare @@ -1667,6 +1680,8 @@ if (!params.split_kmer && !params.skip_compare && !params.skip_compute) { .mix ( ch_sourmash_compare_params_peptide ) .set { ch_sourmash_compare_params_both } + ch_sourmash_sketches_merged = Channel.empty() + ch_sourmash_sketches_merged // Drop first index (index 0) which is the cell id // Drop the second index (index 1) which is the sketch id From c5a0aae7c3878f8ed0cb8db10d54d7aa1b970fad Mon Sep 17 00:00:00 2001 From: Olga Botvinnik Date: Wed, 10 Mar 2021 09:20:14 -0800 Subject: [PATCH 14/43] Add test_download_refseq to ci.yml --- .github/workflows/ci.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 0a26f0ed..5e4b59a1 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -72,6 +72,7 @@ jobs: - "test_bam --write_barcodes_meta_csv false" - "test_bam --barcodes_file false --rename_10x_barcodes false" - "test_bam --rename_10x_barcodes false" + - "test_download_refseq" - "test_fastas" - "test_protein_fastas" - "test_remove_ribo" From 7606bd246946f2080caa5660751c05fcda764e89 Mon Sep 17 00:00:00 2001 From: Olga Botvinnik Date: Wed, 10 Mar 2021 09:20:22 -0800 Subject: [PATCH 15/43] Update scrape_software_versions --- bin/scrape_software_versions.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/bin/scrape_software_versions.py b/bin/scrape_software_versions.py index 551861e1..597a70f5 100755 --- a/bin/scrape_software_versions.py +++ b/bin/scrape_software_versions.py @@ -14,7 +14,8 @@ "SKA": ["v_ska.txt", r"SKA Version: (\S+)"], "htslib": ["v_samtools.txt", r"htslib (\S+)"], "Sourmash": ["v_sourmash.txt", r"sourmash (\S+)"], - "SortMeRNA": ["v_sortmerna.txt", r"SortMeRNA version (\S+),"], + "Rsync": ["v_rsync.txt", r"rsync version (\S+)"], + "Rsync (Protocol)": ["v_rsync.txt", r"protocol version (\S+)"], "orpheum": ["v_orpheum.txt", r"Version: (\S+)"], } results = OrderedDict() @@ -25,10 +26,11 @@ results["bam2fasta"] = 'N/A' results["fastp"] = 'N/A' results["htslib"] = 'N/A' +results["Rsync"] = 'N/A' +results["Rsync (Protocol)"] = 'N/A' results["Samtools"] = 'N/A' results["SKA"] = 'N/A' results["Sourmash"] = 'N/A' -results["SortMeRNA"] = 'N/A' results["orpheum"] = 'N/A' # Search each file using its regex From ba243e3f40b925bc764fd1bc2cc29f516dc58d12 Mon Sep 17 00:00:00 2001 From: Olga Botvinnik Date: Wed, 10 Mar 2021 09:20:34 -0800 Subject: [PATCH 16/43] Remove sortmerna from get_software_versions --- main.nf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/main.nf b/main.nf index 24d44dc4..2c367a28 100644 --- a/main.nf +++ b/main.nf @@ -680,8 +680,8 @@ process get_software_versions { bam2fasta info &> v_bam2fasta.txt fastp --version &> v_fastp.txt samtools --version &> v_samtools.txt + rsync --version &> v_rsync.txt ska version &> v_ska.txt - sortmerna --version &> v_sortmerna.txt sourmash -v &> v_sourmash.txt pip show orpheum &> v_orpheum.txt scrape_software_versions.py &> software_versions_mqc.yaml From dbf824f1a3cd7777c8edc6c517e9f283ea37d536 Mon Sep 17 00:00:00 2001 From: Olga Botvinnik Date: Wed, 10 Mar 2021 09:20:50 -0800 Subject: [PATCH 17/43] Fix sketch params in test_download_refseq --- conf/test_download_refseq.config | 11 ++--------- 1 file changed, 2 insertions(+), 9 deletions(-) diff --git a/conf/test_download_refseq.config b/conf/test_download_refseq.config index cebb0acf..2d7b5486 100644 --- a/conf/test_download_refseq.config +++ b/conf/test_download_refseq.config @@ -14,16 +14,9 @@ params { max_cpus = 2 max_memory = 6.GB max_time = 48.h + + sketch_scaled = 2 // Input data - // samples = 'testing/samples.csv' - // fastas = 'testing/fastas/*.fasta' - ksizes = '3,9' - sketch_num_hashes_log2 = '2,4' - molecules = 'dna,protein,dayhoff' - // read_pairs = 'testing/fastqs/*{1,2}.fastq.gz' - // sra = "SRP016501" - remove_ribo_rna = true - save_non_rrna_reads = true input_paths = [ ['SRR4050379', ['https://github.com/nf-core/test-datasets/raw/kmermaid/testdata/SRR4050379_pass_1.fastq.gz', 'https://github.com/nf-core/test-datasets/raw/kmermaid/testdata/SRR4050379_pass_2.fastq.gz']], From 7b5cc3d14ac30e774a50ab5dab3776935c991305 Mon Sep 17 00:00:00 2001 From: Olga Botvinnik Date: Wed, 10 Mar 2021 14:57:14 -0800 Subject: [PATCH 18/43] Got subtract to work!! --- main.nf | 164 +++++++++++++++++++++++++++++++++++++++----------------- 1 file changed, 116 insertions(+), 48 deletions(-) diff --git a/main.nf b/main.nf index 2c367a28..a6c91209 100644 --- a/main.nf +++ b/main.nf @@ -441,9 +441,10 @@ save_translate_json = params.save_translate_json // --- Parse the Sourmash parameters ---- ksizes = params.ksizes?.toString().tokenize(',') Channel.from(params.ksizes?.toString().tokenize(',')) - .into { ch_ksizes_for_compare_peptide; ch_ksizes_for_compare_nucleotide } + .into { ch_ksizes_for_nucleotide; ch_ksizes_for_peptide; ch_ksizes_for_compare_peptide; ch_ksizes_for_compare_nucleotide } molecules = params.molecules?.toString().tokenize(',') +nucleotide_molecules = molecules.findAll { it == "dna" } peptide_molecules = molecules.findAll { it != "dna" } peptide_molecules_comma_separated = peptide_molecules.join(",") peptide_molecule_flags = peptide_molecules.collect { it -> "--${it}" }.join ( " " ) @@ -451,8 +452,22 @@ peptide_molecule_flags = peptide_molecules.collect { it -> "--${it}" }.join ( " Channel.from( molecules ) .set { ch_molecules } +Channel.from( nucleotide_molecules ) + .into { ch_nucleotide_molecules; ch_nucleotide_molecules_for_subtract; ch_nucleotide_molecules_for_compare } + Channel.from( peptide_molecules ) - .into { ch_peptide_molecules; ch_peptide_molecules_for_compare } + .into { ch_peptide_molecules; ch_peptide_molecules_for_subtract; ch_peptide_molecules_for_compare } + + +ch_peptide_molecules + .combine( ch_ksizes_for_peptide ) + .set { ch_sourmash_params_peptide } + +ch_nucleotide_molecules + .combine( ch_ksizes_for_nucleotide ) + .mix ( ch_sourmash_params_peptide ) + .dump ( tag: 'ch_sourmash_params' ) + .into { ch_sourmash_params_for_compare ; ch_sourmash_params_for_subtract } // Parse sketch value and style parameters sketch_num_hashes = params.sketch_num_hashes @@ -684,6 +699,7 @@ process get_software_versions { ska version &> v_ska.txt sourmash -v &> v_sourmash.txt pip show orpheum &> v_orpheum.txt + python --version &> v_python.txt scrape_software_versions.py &> software_versions_mqc.yaml """ } @@ -1510,9 +1526,9 @@ if (!params.skip_remove_housekeeping_genes) { --exclude '/*' \\ rsync://ftp.ncbi.nlm.nih.gov/refseq/release/${refseq_taxonomy}/ . wget https://ftp.ncbi.nlm.nih.gov/refseq/release/RELEASE_NUMBER - DATE=\$(date +'%Y-%M-%d') + DATE=\$(date +'%Y-%m-%d') RELEASE_NUMBER=\$(cat RELEASE_NUMBER) - zcat ${refseq_taxonomy}.*.${refseq_moltype}*.gz | gzip -c - > ${output_fasta} + gzcat ${refseq_taxonomy}.*.${refseq_moltype}*.gz | gzip -c - > ${output_fasta} """ } } @@ -1529,7 +1545,7 @@ if (!params.skip_remove_housekeeping_genes) { */ // Keep genes whose names match housekeeping gene regular expression pattern process extract_fasta_housekeeping { - tag "${refseq_moltype}" + tag "${fasta.baseName}" label "process_low" publishDir "${params.outdir}/reference/housekeeping_genes/", mode: 'copy' @@ -1537,7 +1553,7 @@ if (!params.skip_remove_housekeeping_genes) { set val(refseq_moltype), file(fasta) from ch_refseq_fasta_to_filter output: - set val(refseq_moltype), file(output_fasta_gz) into (ch_houskeeping_fasta) + set val(refseq_moltype), file(output_fasta_gz) into ch_housekeeping_fasta, ch_housekeeping_fasta_to_view script: output_fasta = "${fasta.baseName}__only_housekeeping_genes.fa" @@ -1550,6 +1566,9 @@ if (!params.skip_remove_housekeeping_genes) { gzip -c ${output_fasta} > ${output_fasta_gz} """ } + + ch_housekeeping_fasta_to_view + .dump( tag: 'ch_housekeeping_fasta' ) /////////////////////////////////////////////////////////////////////////////// /////////////////////////////////////////////////////////////////////////////// @@ -1563,7 +1582,7 @@ if (!params.skip_remove_housekeeping_genes) { */ // No fastas provided for removing housekeeping genes process compute_housekeeping_kmer_sig { - tag "${refseq_moltype}" + tag "${fasta.baseName}" label "process_low" publishDir "${params.outdir}/reference/housekeeping_genes/", mode: 'copy' @@ -1571,26 +1590,30 @@ if (!params.skip_remove_housekeeping_genes) { val track_abundance val sketch_value_parsed val sketch_style_parsed - set val(refseq_moltype), file(fasta) from ch_houskeeping_fasta + set val(refseq_moltype), file(fasta) from ch_housekeeping_fasta output: - set val(sourmash_moltype), file(sig) into ch_houskeeping_sig + set val(sourmash_moltypes), file(sig) into ch_housekeeping_sig script: - sourmash_moltype = refseq_moltype == "protein" ? "protein,dayhoff" : 'dna' + is_protein = refseq_moltype == "protein" + sourmash_moltype = is_protein ? "protein,dayhoff" : 'dna' + sourmash_moltypes = tuple(sourmash_moltype.split(",")) sketch_id = make_sketch_id( - peptide_molecules_comma_separated, + sourmash_moltype, params.ksizes, sketch_value_parsed[0], track_abundance, sketch_style_parsed[0] ) - sketch_value_flag = make_sketch_value_flag(sketch_style_parsed[0], sketch_value_parsed[0]) - - moltype_flags = refseq_moltype == "protein" ? '--protein --dayhoff' : '--dna' + sketch_value_flag = make_sketch_value_flag( + sketch_style_parsed[0], + sketch_value_parsed[0] + ) + moltype_flags = is_protein ? '--protein --dayhoff --input-is-protein' : '--dna' track_abundance_flag = track_abundance ? '--track-abundance' : '' - sig_id = "${ch_houskeeping_fasta.baseName}__${sketch_id}" + sig_id = "${fasta.baseName}__${sketch_id}" sig = "${sig_id}.sig" csv = "${sig_id}.csv" """ @@ -1600,12 +1623,55 @@ if (!params.skip_remove_housekeeping_genes) { ${moltype_flags} \\ ${track_abundance_flag} \\ --output ${sig} \\ - --name '${sample_id}' \\ + --name '${fasta.baseName}' \\ ${fasta} sourmash sig describe --csv ${csv} ${sig} """ } - + + ch_sourmash_sketches_merged + // index 2: moltypes + // index 4: signature + .map { tuple( tuple(it[2].split(",")), it[4] ) } + .transpose() + .dump( tag: 'ch_sourmash_sketches_moltype_to_sig' ) + .groupTuple( by: 0 ) + .dump( tag: 'ch_sourmash_sketches_moltype_to_sig__groupTuple' ) + .set { ch_sourmash_sketches_moltype_to_sigs } + + ch_housekeeping_sig + .dump( tag: 'ch_housekeeping_sig' ) + .transpose() + .dump( tag: 'ch_housekeeping_sig__transposed' ) + .combine( ch_sourmash_params_for_subtract, by: 0) + .dump( tag: 'ch_housekeeping_sig__transposed__combined' ) + .combine ( ch_sourmash_sketches_moltype_to_sigs, by: 0 ) + .dump( tag: 'ch_housekeeping_sig__transposed__combined_joined' ) + .into { ch_subtract_params_with_sigs; ch_subtract_params_to_sigs_for_siglist } + + ch_subtract_params_to_sigs_for_siglist + .dump ( tag: 'ch_subtract_params_to_sigs_for_siglist' ) + .transpose() + .dump ( tag: 'ch_subtract_params_to_sigs_for_siglist__transpose') + .collectFile() { it -> + [ "${it[0]}__${it[2]}.txt", "${it[3].getFileName()}\n"] + } + .dump ( tag: 'ch_subtract_params_to_sigs_for_siglist__transpose__collectfile' ) + .map { [ tuple( it.baseName.split('__') ), it] } + .map { [ it[0][0], it[0][1], it[1] ] } + // .dump ( tag: 'ch_subtract_params_to_sigs_for_siglist__transpose__collectfile__map' ) + // .transpose() + .dump ( tag: 'ch_subtract_params_with_siglist' ) + .set { ch_subtract_params_with_siglist } + + ch_subtract_params_with_sigs + // Reorder so molecule (it[0]) and ksize (it[2]) are first + .map{ [it[0], it[2], it[1], it[3]] } + .dump ( tag: 'ch_subtract_params_with_sigs__map' ) + .combine( ch_subtract_params_with_siglist, by: [0, 1] ) + .dump( tag: 'ch_sigs_with_houskeeping_sig_to_subtract' ) + .set { ch_sigs_with_houskeeping_sig_to_subtract } + // /////////////////////////////////////////////////////////////////////////////// // /////////////////////////////////////////////////////////////////////////////// @@ -1617,28 +1683,38 @@ if (!params.skip_remove_housekeeping_genes) { // /* // * STEP 9 - Remove housekeeping gene k-mers from single cells // */ - // process subtract_houskeeping_kmers { - // tag "${refseq_taxonomy}" - // label "process_low" - // publishDir "${params.outdir}/reference/housekeeping_genes/", mode: 'copy' - - // input: - // set val(refseq_moltype), file(fasta) from ch_refseq_fasta_to_filter - - // output: - // set val(refseq_moltype), file(output_fasta_gz) into (ch_houskeeping_fasta) - - // script: - // output_fasta = "${fasta.baseName}__only_housekeeping_genes.fa" - // output_fasta_gz = "${fasta.baseName}__only_housekeeping_genes.fa.gz" - // """ - // filter_fasta_regex.py \\ - // --input-fasta ${fasta} \\ - // --output-fasta ${output_fasta} \\ - // --regex-pattern '${params.housekeeping_gene_regex}' - // gzip -c ${output_fasta} > ${output_fasta_gz} - // """ - // } + process subtract_houskeeping_kmers { + tag "${subtract_id}" + label "process_medium" + publishDir "${params.outdir}/sketches_subtract_housekeeping_kmers/${subtract_id}", mode: 'copy' + + input: + val sketch_value_parsed + val sketch_style_parsed + set val(molecule), val(ksize), file(housekeeping_sig), file(sigs), file(siglist) from ch_sigs_with_houskeeping_sig_to_subtract + + output: + set val(molecule), val(ksize), file("subtracted/*.sig") into ch_sigs_houskeeping_removed + + script: + subtract_id = "${molecule}__k-${ksize}" + sketch_value_flag = make_sketch_value_flag( + sketch_style_parsed[0], + sketch_value_parsed[0] + ) + track_abundance_flag = track_abundance ? '--track-abundance' : '' + + """ + subtract \\ + ${track_abundance_flag} \\ + ${sketch_value_flag} \\ + --ksize ${ksize} \\ + --encoding ${molecule} \\ + --output subtracted/ \\ + ${housekeeping_sig} \\ + ${siglist} + """ + } } @@ -1692,14 +1768,6 @@ if (!params.split_kmer && !params.skip_compare && !params.skip_compute) { // ch_sourmash_sketches_to_compare = Channel.empty() - ch_peptide_molecules_for_compare - .combine( ch_ksizes_for_compare_peptide ) - .set { ch_sourmash_compare_params_peptide } - - Channel.from("dna") - .combine( ch_ksizes_for_compare_nucleotide ) - .mix ( ch_sourmash_compare_params_peptide ) - .set { ch_sourmash_compare_params_both } ch_sourmash_sketches_merged = Channel.empty() @@ -1713,7 +1781,7 @@ if (!params.split_kmer && !params.skip_compare && !params.skip_compute) { .transpose() .dump(tag: 'ch_sourmash_sketches_merged__map_split__tranpose' ) // Perform cartesian product on the molecules with compare params - .combine( ch_sourmash_compare_params_both, by: 0) + .combine( ch_sourmash_params_for_compare, by: 0) .dump(tag: 'ch_sourmash_sketches_merged__map_split__combine' ) .groupTuple(by: [0, 2]) .dump(tag: 'ch_sourmash_sketches_to_compare' ) From 71695fc8f0aafc956a34f71cacf9a80df33f6691 Mon Sep 17 00:00:00 2001 From: Olga Botvinnik Date: Thu, 11 Mar 2021 14:03:00 -0800 Subject: [PATCH 19/43] Use mamba to install packages --- Dockerfile | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/Dockerfile b/Dockerfile index e516001e..880a58aa 100755 --- a/Dockerfile +++ b/Dockerfile @@ -4,7 +4,8 @@ LABEL authors="Olga Botvinnik" \ # Install the conda environment COPY environment.yml / -RUN conda env create --quiet -f /environment.yml && conda clean -a +RUN conda install -c conda-forge mamba +RUN mamba env create -f /environment.yml && mamba clean -a # Add conda installation dir to PATH (instead of doing 'conda activate') ENV PATH /opt/conda/envs/nf-core-kmermaid-0.1.0dev/bin:$PATH From ed266eb005e9716cbffe1c5a351d9fd240398969 Mon Sep 17 00:00:00 2001 From: Olga Botvinnik Date: Thu, 11 Mar 2021 14:03:21 -0800 Subject: [PATCH 20/43] Move Rust to conda-forge section --- environment.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/environment.yml b/environment.yml index 8650d4d2..d6956e37 100644 --- a/environment.yml +++ b/environment.yml @@ -14,6 +14,7 @@ dependencies: - conda-forge::tqdm=4.43.0 - conda-forge::gxx_linux-64=7.3.0 - conda-forge::s3fs=0.4.2 + - conda-forge::rust=1.48.0 - bioconda::sourmash=3.5.0 - bioconda::samtools=1.10 - bioconda::screed=1.0.4 @@ -34,7 +35,6 @@ dependencies: - sphinx=2.3.1 - jupyter=1.0.0 - ripgrep=12.1.1 - - conda-forge::rust=1.48.0 - rsync=3.2.3 - pip: - bam2fasta==1.0.8 From 87082ac67bce66501c80247a9cce498de9363678 Mon Sep 17 00:00:00 2001 From: Olga Botvinnik Date: Thu, 11 Mar 2021 14:03:30 -0800 Subject: [PATCH 21/43] Set sketch_scaled to 10 by default --- bin/scrape_software_versions.py | 4 ++- conf/base.config | 1 + conf/test_download_refseq.config | 2 +- ...test_housekeeping_from_filter_fasta.config | 33 +++++++++++++++++++ conf/test_housekeeping_from_make_sig.config | 33 +++++++++++++++++++ .../test_housekeeping_from_premade_sig.config | 33 +++++++++++++++++++ nextflow.config | 4 ++- siglist.txt | 4 +++ 8 files changed, 111 insertions(+), 3 deletions(-) create mode 100644 conf/test_housekeeping_from_filter_fasta.config create mode 100644 conf/test_housekeeping_from_make_sig.config create mode 100644 conf/test_housekeeping_from_premade_sig.config create mode 100644 siglist.txt diff --git a/bin/scrape_software_versions.py b/bin/scrape_software_versions.py index 597a70f5..80f39c09 100755 --- a/bin/scrape_software_versions.py +++ b/bin/scrape_software_versions.py @@ -17,6 +17,7 @@ "Rsync": ["v_rsync.txt", r"rsync version (\S+)"], "Rsync (Protocol)": ["v_rsync.txt", r"protocol version (\S+)"], "orpheum": ["v_orpheum.txt", r"Version: (\S+)"], + "Python": ["v_python.txt", r"Python (\S+)"], } results = OrderedDict() results["nf-core/kmermaid"] = 'N/A' @@ -26,12 +27,13 @@ results["bam2fasta"] = 'N/A' results["fastp"] = 'N/A' results["htslib"] = 'N/A' +results["orpheum"] = 'N/A' +results["Python"] = 'N/A' results["Rsync"] = 'N/A' results["Rsync (Protocol)"] = 'N/A' results["Samtools"] = 'N/A' results["SKA"] = 'N/A' results["Sourmash"] = 'N/A' -results["orpheum"] = 'N/A' # Search each file using its regex for k, v in regexes.items(): diff --git a/conf/base.config b/conf/base.config index 07a2aa3b..01e0ffa3 100644 --- a/conf/base.config +++ b/conf/base.config @@ -54,6 +54,7 @@ process { withName: 'multiqc|get_software_versions' { memory = { check_max( 2.GB * task.attempt, 'memory' ) } + errorStrategy = "ignore" cache = false } withName: 'sourmash_compute_sketch_fastx_nucleotide|sourmash_compute_sketch_fastx_peptide' { diff --git a/conf/test_download_refseq.config b/conf/test_download_refseq.config index 2d7b5486..3f624b84 100644 --- a/conf/test_download_refseq.config +++ b/conf/test_download_refseq.config @@ -15,7 +15,7 @@ params { max_memory = 6.GB max_time = 48.h - sketch_scaled = 2 + // sketch_scaled = 2 // Input data input_paths = [ ['SRR4050379', ['https://github.com/nf-core/test-datasets/raw/kmermaid/testdata/SRR4050379_pass_1.fastq.gz', diff --git a/conf/test_housekeeping_from_filter_fasta.config b/conf/test_housekeeping_from_filter_fasta.config new file mode 100644 index 00000000..2d7b5486 --- /dev/null +++ b/conf/test_housekeeping_from_filter_fasta.config @@ -0,0 +1,33 @@ +/* + * ------------------------------------------------- + * Nextflow config file for running tests + * ------------------------------------------------- + * Defines bundled input files and everything required + * to run a fast and simple test. Use as follows: + * nextflow run nf-core/kmermaid -profile test + */ + +params { + config_profile_name = 'Test profile' + config_profile_description = 'Minimal test dataset to check pipeline function' + // Limit resources so that this can run on Travis + max_cpus = 2 + max_memory = 6.GB + max_time = 48.h + + sketch_scaled = 2 + // Input data + input_paths = [ + ['SRR4050379', ['https://github.com/nf-core/test-datasets/raw/kmermaid/testdata/SRR4050379_pass_1.fastq.gz', + 'https://github.com/nf-core/test-datasets/raw/kmermaid/testdata/SRR4050379_pass_2.fastq.gz']], + ['SRR4050380', ['https://github.com/nf-core/test-datasets/raw/kmermaid/testdata/SRR4050380_pass_1.fastq.gz', + 'https://github.com/nf-core/test-datasets/raw/kmermaid/testdata/SRR4050380_pass_2.fastq.gz']], + ['SRR4238351', ['https://github.com/nf-core/test-datasets/raw/rnaseq/testdata/SRR4238351_subsamp.fastq.gz']], + ['SRR4238355', ['https://github.com/nf-core/test-datasets/raw/rnaseq/testdata/SRR4238355_subsamp.fastq.gz']], + ] + + // "Other" is the smallest refseq taxonomy subdirectory: ftp://ftp.ncbi.nlm.nih.gov/refseq/release/other/ + // Protein fasta is 453 B + refseq_taxonomy = 'vertebrate_mammalian' + test_mini_refseq_download = true +} diff --git a/conf/test_housekeeping_from_make_sig.config b/conf/test_housekeeping_from_make_sig.config new file mode 100644 index 00000000..2d7b5486 --- /dev/null +++ b/conf/test_housekeeping_from_make_sig.config @@ -0,0 +1,33 @@ +/* + * ------------------------------------------------- + * Nextflow config file for running tests + * ------------------------------------------------- + * Defines bundled input files and everything required + * to run a fast and simple test. Use as follows: + * nextflow run nf-core/kmermaid -profile test + */ + +params { + config_profile_name = 'Test profile' + config_profile_description = 'Minimal test dataset to check pipeline function' + // Limit resources so that this can run on Travis + max_cpus = 2 + max_memory = 6.GB + max_time = 48.h + + sketch_scaled = 2 + // Input data + input_paths = [ + ['SRR4050379', ['https://github.com/nf-core/test-datasets/raw/kmermaid/testdata/SRR4050379_pass_1.fastq.gz', + 'https://github.com/nf-core/test-datasets/raw/kmermaid/testdata/SRR4050379_pass_2.fastq.gz']], + ['SRR4050380', ['https://github.com/nf-core/test-datasets/raw/kmermaid/testdata/SRR4050380_pass_1.fastq.gz', + 'https://github.com/nf-core/test-datasets/raw/kmermaid/testdata/SRR4050380_pass_2.fastq.gz']], + ['SRR4238351', ['https://github.com/nf-core/test-datasets/raw/rnaseq/testdata/SRR4238351_subsamp.fastq.gz']], + ['SRR4238355', ['https://github.com/nf-core/test-datasets/raw/rnaseq/testdata/SRR4238355_subsamp.fastq.gz']], + ] + + // "Other" is the smallest refseq taxonomy subdirectory: ftp://ftp.ncbi.nlm.nih.gov/refseq/release/other/ + // Protein fasta is 453 B + refseq_taxonomy = 'vertebrate_mammalian' + test_mini_refseq_download = true +} diff --git a/conf/test_housekeeping_from_premade_sig.config b/conf/test_housekeeping_from_premade_sig.config new file mode 100644 index 00000000..2d7b5486 --- /dev/null +++ b/conf/test_housekeeping_from_premade_sig.config @@ -0,0 +1,33 @@ +/* + * ------------------------------------------------- + * Nextflow config file for running tests + * ------------------------------------------------- + * Defines bundled input files and everything required + * to run a fast and simple test. Use as follows: + * nextflow run nf-core/kmermaid -profile test + */ + +params { + config_profile_name = 'Test profile' + config_profile_description = 'Minimal test dataset to check pipeline function' + // Limit resources so that this can run on Travis + max_cpus = 2 + max_memory = 6.GB + max_time = 48.h + + sketch_scaled = 2 + // Input data + input_paths = [ + ['SRR4050379', ['https://github.com/nf-core/test-datasets/raw/kmermaid/testdata/SRR4050379_pass_1.fastq.gz', + 'https://github.com/nf-core/test-datasets/raw/kmermaid/testdata/SRR4050379_pass_2.fastq.gz']], + ['SRR4050380', ['https://github.com/nf-core/test-datasets/raw/kmermaid/testdata/SRR4050380_pass_1.fastq.gz', + 'https://github.com/nf-core/test-datasets/raw/kmermaid/testdata/SRR4050380_pass_2.fastq.gz']], + ['SRR4238351', ['https://github.com/nf-core/test-datasets/raw/rnaseq/testdata/SRR4238351_subsamp.fastq.gz']], + ['SRR4238355', ['https://github.com/nf-core/test-datasets/raw/rnaseq/testdata/SRR4238355_subsamp.fastq.gz']], + ] + + // "Other" is the smallest refseq taxonomy subdirectory: ftp://ftp.ncbi.nlm.nih.gov/refseq/release/other/ + // Protein fasta is 453 B + refseq_taxonomy = 'vertebrate_mammalian' + test_mini_refseq_download = true +} diff --git a/nextflow.config b/nextflow.config index 96496b34..41b72730 100644 --- a/nextflow.config +++ b/nextflow.config @@ -34,7 +34,7 @@ params { // Number of hashes from each sample sketch_num_hashes = false sketch_num_hashes_log2 = false - sketch_scaled = false + sketch_scaled = 10 sketch_scaled_log2 = false skip_sig_merge = false @@ -66,6 +66,8 @@ params { test_mini_refseq_download = false housekeeping_protein_fasta = false housekeeping_rna_fasta = false + housekeeping_protein_sig = false + housekeeping_rna_sig = false // ska options split_kmer = false diff --git a/siglist.txt b/siglist.txt new file mode 100644 index 00000000..43fc713a --- /dev/null +++ b/siglist.txt @@ -0,0 +1,4 @@ +SRR4238351__molecule-dna__ksize-21,30,51__scaled-2__track_abundance-true.sig +SRR4238355__molecule-dna__ksize-21,30,51__scaled-2__track_abundance-true.sig +SRR4050380__molecule-dna__ksize-21,30,51__scaled-2__track_abundance-true.sig +SRR4050379__molecule-dna__ksize-21,30,51__scaled-2__track_abundance-true.sig From 76c32b10f0ae6c8a9e40afccc1b430219bad814b Mon Sep 17 00:00:00 2001 From: Olga Botvinnik Date: Thu, 11 Mar 2021 14:14:05 -0800 Subject: [PATCH 22/43] reference_proteome_fasta --> translate_proteome_fasta --- main.nf | 26 +++++++++++++------------- nextflow.config | 2 +- 2 files changed, 14 insertions(+), 14 deletions(-) diff --git a/main.nf b/main.nf index a6c91209..147b5a3e 100644 --- a/main.nf +++ b/main.nf @@ -120,7 +120,7 @@ def helpMessage() { to new name, e.g. with channel or cell annotation label Translate RNA-seq reads into protein-coding sequences options: - --reference_proteome_fasta Path to a well-curated fasta file of protein sequences. Used to filter for coding reads + --translate_proteome_fasta Path to a well-curated fasta file of protein sequences. Used to filter for coding reads --translate_peptide_ksize K-mer size to use for translating RNA into protein. Default: 9, which is good for 'protein'. If using dayhoff, suggest 15 --translate_peptide_molecule Which molecular encoding to use for translating. Default: "protein" @@ -324,10 +324,10 @@ if (params.protein_fastas){ ch_protein_fastas = Channel.empty() } -if (params.reference_proteome_fasta) { -Channel.fromPath(params.reference_proteome_fasta, checkIfExists: true) - .ifEmpty { exit 1, "Reference proteome file not found: ${params.reference_proteome_fasta}" } - .set{ ch_reference_proteome_fasta } +if (params.translate_proteome_fasta) { +Channel.fromPath(params.translate_proteome_fasta, checkIfExists: true) + .ifEmpty { exit 1, "Reference proteome file not found: ${params.translate_proteome_fasta}" } + .set{ ch_translate_proteome_fasta } } //////////////////////////////////////////////////// @@ -620,10 +620,10 @@ if(params.tenx_tgz) summary["10x Cell pattern"] = params.tenx_cell_barcode_patte if(params.tenx_tgz) summary["10x UMI pattern"] = params.tenx_molecular_barcode_pattern if(params.tenx_tgz) summary['Min UMI/cell'] = params.tenx_min_umi_per_cell // Extract coding parameters -if(params.reference_proteome_fasta) summary["Peptide fasta"] = params.reference_proteome_fasta -if(params.reference_proteome_fasta) summary['Peptide ksize'] = params.translate_peptide_ksize -if(params.reference_proteome_fasta) summary['Peptide molecule'] = params.translate_peptide_molecule -if(params.reference_proteome_fasta) summary['Bloom filter table size'] = params.bloomfilter_tablesize +if(params.translate_proteome_fasta) summary["Peptide fasta"] = params.translate_proteome_fasta +if(params.translate_proteome_fasta) summary['Peptide ksize'] = params.translate_peptide_ksize +if(params.translate_proteome_fasta) summary['Peptide molecule'] = params.translate_peptide_molecule +if(params.translate_proteome_fasta) summary['Bloom filter table size'] = params.bloomfilter_tablesize // Resource information summary['Max Resources'] = "$params.max_memory memory, $params.max_cpus cpus, $params.max_time time per job" if(workflow.containerEngine) summary['Container'] = "$workflow.containerEngine - $workflow.container" @@ -770,7 +770,7 @@ if ( !params.split_kmer && have_sketch_value ) { -if (params.reference_proteome_fasta){ +if (params.translate_proteome_fasta){ process make_protein_index { tag "${peptides}__${bloom_id}" label "low_memory" @@ -778,7 +778,7 @@ if (params.reference_proteome_fasta){ publishDir "${params.outdir}/protein_index", mode: params.publish_dir_mode input: - file(peptides) from ch_reference_proteome_fasta + file(peptides) from ch_translate_proteome_fasta translate_peptide_ksize translate_peptide_molecule @@ -1115,7 +1115,7 @@ if (params.subsample) { } - if (params.reference_proteome_fasta){ + if (params.translate_proteome_fasta){ process translate { tag "${sample_id}" label "low_memory_long" @@ -1309,7 +1309,7 @@ if (!have_nucleotide_input) { } -if (!params.skip_compute && (protein_input || params.reference_proteome_fasta)){ +if (!params.skip_compute && (protein_input || params.translate_proteome_fasta)){ process sourmash_compute_sketch_fastx_peptide { tag "${sig_id}" diff --git a/nextflow.config b/nextflow.config index 41b72730..bdf0200b 100644 --- a/nextflow.config +++ b/nextflow.config @@ -51,7 +51,7 @@ params { translate_peptide_ksize = 8 translate_peptide_molecule = 'protein' translate_jaccard_threshold = 0.05 - reference_proteome_fasta = false + translate_proteome_fasta = false bloomfilter_tablesize = '1e8' // Saving the translate results for each dataset makes it take extra long // Recommended for debugging purposes only From e4154cfe2d00d0b5c6ba26f03b5872f71eee5869 Mon Sep 17 00:00:00 2001 From: Olga Botvinnik Date: Thu, 11 Mar 2021 14:31:24 -0800 Subject: [PATCH 23/43] Use my branch of the rust sourmash remove code --- Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Dockerfile b/Dockerfile index 880a58aa..2a407b08 100755 --- a/Dockerfile +++ b/Dockerfile @@ -14,7 +14,7 @@ ENV PATH /opt/conda/envs/nf-core-kmermaid-0.1.0dev/bin:$PATH RUN conda env export --name nf-core-kmermaid-0.1.0dev > nf-core-kmermaid-0.1.0dev.yml # Install super fast rust code to remove nuisance hashes (e.g. ribosomal) from signatures -RUN git clone https://github.com/luizirber/2021-01-27-olga-remove-protein/ +RUN git clone https://github.com/olgabot/2021-01-27-olga-remove-protein.git@olgabot/mut-warning RUN cd 2021-01-27-olga-remove-protein && cargo build --release # Add "subtract" command to path ENV PATH $HOME/2021-01-27-olga-remove-protein/target/release:$PATH From 94d5f2c9183ed2824f9dbe99ac0026d50dc96ded Mon Sep 17 00:00:00 2001 From: Olga Botvinnik Date: Thu, 11 Mar 2021 14:31:32 -0800 Subject: [PATCH 24/43] Add cmake to help with gcc building --- environment.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/environment.yml b/environment.yml index d6956e37..093d9644 100644 --- a/environment.yml +++ b/environment.yml @@ -7,6 +7,7 @@ channels: - defaults - anaconda dependencies: + - conda-forge::cmake=3.19.6 - conda-forge::python=3.7.3 - conda-forge::markdown=3.1.1 - conda-forge::pymdown-extensions=6.0 From c7d603da93f2359c3263964d0f799f40f5987195 Mon Sep 17 00:00:00 2001 From: Olga Botvinnik Date: Thu, 11 Mar 2021 15:00:12 -0800 Subject: [PATCH 25/43] Get housekeeping removal from sig, fasta to work --- main.nf | 246 +++++++++++++++++++++++++++--------------------- nextflow.config | 6 +- 2 files changed, 145 insertions(+), 107 deletions(-) diff --git a/main.nf b/main.nf index 147b5a3e..10fae059 100644 --- a/main.nf +++ b/main.nf @@ -544,18 +544,45 @@ else { housekeeping_protein_fasta = params.housekeeping_protein_fasta housekeeping_rna_fasta = params.housekeeping_rna_fasta -need_refseq_download = !housekeeping_protein_fasta && !housekeeping_rna_fasta - -ch_refseq_moltype_to_fasta = Channel.from(["protein", housekeeping_protein_fasta], ["rna", housekeeping_rna_fasta]) -ch_refseq_moltype_to_fasta - // filter if the second item, the fasta is false - .filter{ !it[1] } +housekeeping_protein_sig = params.housekeeping_protein_sig +housekeeping_rna_sig = params.housekeeping_rna_sig + +have_housekeeping_fastas = housekeeping_protein_fasta && housekeeping_rna_fasta +have_housekeeping_sigs = housekeeping_protein_sig && housekeeping_rna_sig +need_refseq_download = (!have_housekeeping_fastas) && (!have_housekeeping_sigs) + +if (have_housekeeping_fastas) { + Channel.from( + ["protein", file(housekeeping_protein_fasta)], + ["rna", file(housekeeping_rna_fasta)]) + .into { ch_housekeeping_fasta; ch_refseq_moltype_to_fasta } + + ch_refseq_moltype_to_fasta + // Check if protein molecules were even specified + .filter{ + it[0] == "protein" ? peptide_molecules.size() > 0 : nucleotide_molecules.size() > 0 + } // Take only the first item, the molecule type .map{ it[0] } .set{ ch_refseq_moltypes_to_download } +} + +if (have_housekeeping_sigs) { + // Use sourmash moltypes of "protein,dayhoff" instead of the original protein + // as used for the fastas as that's what matches the sourmash outputs + ch_housekeeping_sig = Channel.from( + ["protein,dayhoff", file(housekeeping_protein_sig)], + ["dna", file(housekeeping_rna_sig)] + ) +} + // Parse refseq taxonomy group to download -refseq_taxonomy = params.refseq_taxonomy +housekeeping_refseq_taxonomy = params.housekeeping_refseq_taxonomy +///////////////////////////////////////////////////////////// +/* -- END: Parse Housekeeping K-mer removal parameters -- */ +///////////////////////////////////////////////////////////// + // Has the run name been specified by the user? // this has the bonus effect of catching both -name and --name @@ -619,11 +646,17 @@ if(params.tenx_tgz) summary["10x SAM tags"] = params.tenx_tags if(params.tenx_tgz) summary["10x Cell pattern"] = params.tenx_cell_barcode_pattern if(params.tenx_tgz) summary["10x UMI pattern"] = params.tenx_molecular_barcode_pattern if(params.tenx_tgz) summary['Min UMI/cell'] = params.tenx_min_umi_per_cell -// Extract coding parameters -if(params.translate_proteome_fasta) summary["Peptide fasta"] = params.translate_proteome_fasta -if(params.translate_proteome_fasta) summary['Peptide ksize'] = params.translate_peptide_ksize -if(params.translate_proteome_fasta) summary['Peptide molecule'] = params.translate_peptide_molecule -if(params.translate_proteome_fasta) summary['Bloom filter table size'] = params.bloomfilter_tablesize +// Orpheum Translate parameters +if(params.translate_proteome_fasta) summary["Orpheum Translate Peptide fasta"] = params.translate_proteome_fasta +if(params.translate_proteome_fasta) summary['Orpheum Translate Peptide ksize'] = params.translate_peptide_ksize +if(params.translate_proteome_fasta) summary['Orpheum Translate Peptide molecule'] = params.translate_peptide_molecule +if(params.translate_proteome_fasta) summary['Oprheum Translate Bloom filter table size'] = params.bloomfilter_tablesize +// Housekeeping k-mer removal paramters +if(params.housekeeping_protein_fasta) summary["Housekeping Peptide fasta"] = params.housekeeping_protein_fasta +if(params.housekeeping_rna_fasta) summary["Housekeping RNA fasta"] = params.housekeeping_rna_fasta +if(params.housekeeping_protein_sig) summary["Housekeping Peptide K-mer Signature"] = params.housekeeping_protein_sig +if(params.housekeeping_rna_sig) summary["Housekeping RNA K-mer Signature"] = params.housekeeping_rna_sig +if(need_refseq_download) summary["Housekeeping Refseq Taxonomy"] = params.housekeeping_refseq_taxonomy // Resource information summary['Max Resources'] = "$params.max_memory memory, $params.max_cpus cpus, $params.max_time time per job" if(workflow.containerEngine) summary['Container'] = "$workflow.containerEngine - $workflow.container" @@ -1503,7 +1536,7 @@ if (!params.skip_remove_housekeeping_genes) { if (need_refseq_download){ // No fastas provided for removing housekeeping genes process download_refseq { - tag "${refseq_taxonomy}--${refseq_moltype}" + tag "${housekeeping_refseq_taxonomy}--${refseq_moltype}" label "process_low" publishDir "${params.outdir}/reference/ncbi_refseq/", mode: 'copy' @@ -1511,11 +1544,11 @@ if (!params.skip_remove_housekeeping_genes) { val refseq_moltype from ch_refseq_moltypes_to_download output: - set val(refseq_moltype), file("${refseq_taxonomy}--*.${refseq_moltype}.fa.gz") into ch_refseq_fasta_to_filter + set val(refseq_moltype), file("${housekeeping_refseq_taxonomy}--*.${refseq_moltype}.fa.gz") into ch_refseq_fasta_to_filter script: - output_fasta = "${refseq_taxonomy}--\$RELEASE_NUMBER--\$DATE.${refseq_moltype}.fa.gz" - include_fasta = params.test_mini_refseq_download ? "${refseq_taxonomy}.1.${refseq_moltype}.f*a.gz" : "*${refseq_moltype}.f*a.gz" + output_fasta = "${housekeeping_refseq_taxonomy}--\$RELEASE_NUMBER--\$DATE.${refseq_moltype}.fa.gz" + include_fasta = params.test_mini_refseq_download ? "${housekeeping_refseq_taxonomy}.1.${refseq_moltype}.f*a.gz" : "*${refseq_moltype}.f*a.gz" """ rsync \\ --prune-empty-dirs \\ @@ -1524,111 +1557,114 @@ if (!params.skip_remove_housekeeping_genes) { --recursive \\ --include '${include_fasta}' \\ --exclude '/*' \\ - rsync://ftp.ncbi.nlm.nih.gov/refseq/release/${refseq_taxonomy}/ . + rsync://ftp.ncbi.nlm.nih.gov/refseq/release/${housekeeping_refseq_taxonomy}/ . wget https://ftp.ncbi.nlm.nih.gov/refseq/release/RELEASE_NUMBER DATE=\$(date +'%Y-%m-%d') RELEASE_NUMBER=\$(cat RELEASE_NUMBER) - gzcat ${refseq_taxonomy}.*.${refseq_moltype}*.gz | gzip -c - > ${output_fasta} + gzcat ${housekeeping_refseq_taxonomy}.*.${refseq_moltype}*.gz | gzip -c - > ${output_fasta} """ } - } - /////////////////////////////////////////////////////////////////////////////// - /////////////////////////////////////////////////////////////////////////////// - /* -- -- */ - /* -- REMOVE K-MERS FROM HOUSEKEEPING GENES -- */ - /* -- -- */ - /////////////////////////////////////////////////////////////////////////////// - /////////////////////////////////////////////////////////////////////////////// - /* - * STEP 7 - Get only housekeeping genes from - */ - // Keep genes whose names match housekeeping gene regular expression pattern - process extract_fasta_housekeeping { - tag "${fasta.baseName}" - label "process_low" - publishDir "${params.outdir}/reference/housekeeping_genes/", mode: 'copy' + /////////////////////////////////////////////////////////////////////////////// + /////////////////////////////////////////////////////////////////////////////// + /* -- -- */ + /* -- REMOVE K-MERS FROM HOUSEKEEPING GENES -- */ + /* -- -- */ + /////////////////////////////////////////////////////////////////////////////// + /////////////////////////////////////////////////////////////////////////////// + /* + * STEP 7 - Get only housekeeping genes from + */ + // Keep genes whose names match housekeeping gene regular expression pattern + process extract_fasta_housekeeping { + tag "${fasta.baseName}" + label "process_low" + publishDir "${params.outdir}/reference/housekeeping_genes/", mode: 'copy' - input: - set val(refseq_moltype), file(fasta) from ch_refseq_fasta_to_filter + input: + set val(refseq_moltype), file(fasta) from ch_refseq_fasta_to_filter - output: - set val(refseq_moltype), file(output_fasta_gz) into ch_housekeeping_fasta, ch_housekeeping_fasta_to_view + output: + set val(refseq_moltype), file(output_fasta_gz) into ch_housekeeping_fasta, ch_housekeeping_fasta_to_view - script: - output_fasta = "${fasta.baseName}__only_housekeeping_genes.fa" - output_fasta_gz = "${fasta.baseName}__only_housekeeping_genes.fa.gz" - """ - filter_fasta_regex.py \\ - --input-fasta ${fasta} \\ - --output-fasta ${output_fasta} \\ - --regex-pattern '${params.housekeeping_gene_regex}' - gzip -c ${output_fasta} > ${output_fasta_gz} - """ + script: + output_fasta = "${fasta.baseName}__only_housekeeping_genes.fa" + output_fasta_gz = "${fasta.baseName}__only_housekeeping_genes.fa.gz" + """ + filter_fasta_regex.py \\ + --input-fasta ${fasta} \\ + --output-fasta ${output_fasta} \\ + --regex-pattern '${params.housekeeping_gene_regex}' + gzip -c ${output_fasta} > ${output_fasta_gz} + """ + } + + ch_housekeeping_fasta_to_view + .dump( tag: 'ch_housekeeping_fasta' ) } - - ch_housekeeping_fasta_to_view - .dump( tag: 'ch_housekeeping_fasta' ) - - /////////////////////////////////////////////////////////////////////////////// - /////////////////////////////////////////////////////////////////////////////// - /* -- -- */ - /* -- COMPUTE HOUSEKEEPING GENE K-MER SIGNATURE -- */ - /* -- -- */ - /////////////////////////////////////////////////////////////////////////////// - /////////////////////////////////////////////////////////////////////////////// - /* - * STEP 8 - Compute Housekeeping Gene K-mer Signature - */ - // No fastas provided for removing housekeeping genes - process compute_housekeeping_kmer_sig { - tag "${fasta.baseName}" - label "process_low" - publishDir "${params.outdir}/reference/housekeeping_genes/", mode: 'copy' - input: - val track_abundance - val sketch_value_parsed - val sketch_style_parsed - set val(refseq_moltype), file(fasta) from ch_housekeeping_fasta + if (!have_housekeeping_sigs) { + /////////////////////////////////////////////////////////////////////////////// + /////////////////////////////////////////////////////////////////////////////// + /* -- -- */ + /* -- COMPUTE HOUSEKEEPING GENE K-MER SIGNATURE -- */ + /* -- -- */ + /////////////////////////////////////////////////////////////////////////////// + /////////////////////////////////////////////////////////////////////////////// + /* + * STEP 8 - Compute Housekeeping Gene K-mer Signature + */ + // No fastas provided for removing housekeeping genes + process compute_housekeeping_kmer_sig { + tag "${fasta.baseName}" + label "process_low" + publishDir "${params.outdir}/reference/housekeeping_genes/", mode: 'copy' - output: - set val(sourmash_moltypes), file(sig) into ch_housekeeping_sig + input: + val track_abundance + val sketch_value_parsed + val sketch_style_parsed + set val(refseq_moltype), file(fasta) from ch_housekeeping_fasta - script: - is_protein = refseq_moltype == "protein" - sourmash_moltype = is_protein ? "protein,dayhoff" : 'dna' - sourmash_moltypes = tuple(sourmash_moltype.split(",")) - sketch_id = make_sketch_id( - sourmash_moltype, - params.ksizes, - sketch_value_parsed[0], - track_abundance, - sketch_style_parsed[0] - ) + output: + set val(sourmash_moltypes), file(sig) into ch_housekeeping_sig - sketch_value_flag = make_sketch_value_flag( - sketch_style_parsed[0], - sketch_value_parsed[0] - ) - moltype_flags = is_protein ? '--protein --dayhoff --input-is-protein' : '--dna' - track_abundance_flag = track_abundance ? '--track-abundance' : '' - sig_id = "${fasta.baseName}__${sketch_id}" - sig = "${sig_id}.sig" - csv = "${sig_id}.csv" - """ - sourmash compute \\ - ${sketch_value_flag} \\ - --ksizes ${params.ksizes} \\ - ${moltype_flags} \\ - ${track_abundance_flag} \\ - --output ${sig} \\ - --name '${fasta.baseName}' \\ - ${fasta} - sourmash sig describe --csv ${csv} ${sig} - """ + script: + is_protein = refseq_moltype == "protein" + sourmash_moltype = is_protein ? "protein,dayhoff" : 'dna' + sourmash_moltypes = tuple(sourmash_moltype.split(",")) + sketch_id = make_sketch_id( + sourmash_moltype, + params.ksizes, + sketch_value_parsed[0], + track_abundance, + sketch_style_parsed[0] + ) + + sketch_value_flag = make_sketch_value_flag( + sketch_style_parsed[0], + sketch_value_parsed[0] + ) + moltype_flags = is_protein ? '--protein --dayhoff --input-is-protein' : '--dna' + track_abundance_flag = track_abundance ? '--track-abundance' : '' + sig_id = "${fasta.baseName}__${sketch_id}" + sig = "${sig_id}.sig" + csv = "${sig_id}.csv" + """ + sourmash compute \\ + ${sketch_value_flag} \\ + --ksizes ${params.ksizes} \\ + ${moltype_flags} \\ + ${track_abundance_flag} \\ + --output ${sig} \\ + --name '${fasta.baseName}' \\ + ${fasta} + sourmash sig describe --csv ${csv} ${sig} + """ + } } + ch_sourmash_sketches_merged // index 2: moltypes // index 4: signature diff --git a/nextflow.config b/nextflow.config index bdf0200b..780e4b04 100644 --- a/nextflow.config +++ b/nextflow.config @@ -61,7 +61,7 @@ params { // Housekeeping gene k-mer removal skip_remove_housekeeping_genes = false housekeeping_gene_regex = "ribosom|mito|ubiqui|ferritin|cytochrome|eukaryotic translation|heat shock|NADH|NADPH" - refseq_taxonomy = 'vertebrate_mammalian' + housekeeping_refseq_taxonomy = 'vertebrate_mammalian' // For testing purposes --> use a small refseq dataset test_mini_refseq_download = false housekeeping_protein_fasta = false @@ -152,11 +152,13 @@ profiles { podman.enabled = true } test { includeConfig 'conf/test.config' } - test_download_refseq { includeConfig 'conf/test_download_refseq.config' } test_full { includeConfig 'conf/test_full.config' } test_ska { includeConfig 'conf/test_ska.config' } test_bam { includeConfig 'conf/test_bam.config' } test_fastas { includeConfig 'conf/test_fastas.config' } + test_housekeeping_from_download_refseq { includeConfig 'conf/test_housekeeping_from_download_refseq.config' } + test_housekeeping_from_fasta { includeConfig 'conf/test_housekeeping_from_fasta.config' } + test_housekeeping_from_sig { includeConfig 'conf/test_housekeeping_from_sig.config' } test_protein_fastas { includeConfig 'conf/test_protein_fastas.config' } test_remove_ribo { includeConfig 'conf/test_remove_ribo.config' } test_sig_merge { includeConfig 'conf/test_sig_merge.config' } From b65ebcdce7c3328f736b4147776ad95485ed9682 Mon Sep 17 00:00:00 2001 From: Olga Botvinnik Date: Thu, 11 Mar 2021 15:03:20 -0800 Subject: [PATCH 26/43] Update vital gene tests --- .github/workflows/ci.yml | 4 ++- ..._housekeeping_from_download_refseq.config} | 1 - ...ig => test_housekeeping_from_fasta.config} | 9 +++-- .../test_housekeeping_from_premade_sig.config | 33 ------------------- ...nfig => test_housekeeping_from_sig.config} | 8 ++--- 5 files changed, 9 insertions(+), 46 deletions(-) rename conf/{test_housekeeping_from_filter_fasta.config => test_housekeeping_from_download_refseq.config} (98%) rename conf/{test_housekeeping_from_make_sig.config => test_housekeeping_from_fasta.config} (68%) delete mode 100644 conf/test_housekeeping_from_premade_sig.config rename conf/{test_download_refseq.config => test_housekeeping_from_sig.config} (70%) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 5e4b59a1..76b2cbeb 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -72,8 +72,10 @@ jobs: - "test_bam --write_barcodes_meta_csv false" - "test_bam --barcodes_file false --rename_10x_barcodes false" - "test_bam --rename_10x_barcodes false" - - "test_download_refseq" - "test_fastas" + - "test_housekeeping_from_download_refseq" + - "test_housekeeping_from_fasta" + - "test_housekeeping_from_sig" - "test_protein_fastas" - "test_remove_ribo" - "test_sig_merge" diff --git a/conf/test_housekeeping_from_filter_fasta.config b/conf/test_housekeeping_from_download_refseq.config similarity index 98% rename from conf/test_housekeeping_from_filter_fasta.config rename to conf/test_housekeeping_from_download_refseq.config index 2d7b5486..886a8424 100644 --- a/conf/test_housekeeping_from_filter_fasta.config +++ b/conf/test_housekeeping_from_download_refseq.config @@ -15,7 +15,6 @@ params { max_memory = 6.GB max_time = 48.h - sketch_scaled = 2 // Input data input_paths = [ ['SRR4050379', ['https://github.com/nf-core/test-datasets/raw/kmermaid/testdata/SRR4050379_pass_1.fastq.gz', diff --git a/conf/test_housekeeping_from_make_sig.config b/conf/test_housekeeping_from_fasta.config similarity index 68% rename from conf/test_housekeeping_from_make_sig.config rename to conf/test_housekeeping_from_fasta.config index 2d7b5486..2bf6ba6d 100644 --- a/conf/test_housekeeping_from_make_sig.config +++ b/conf/test_housekeeping_from_fasta.config @@ -15,7 +15,6 @@ params { max_memory = 6.GB max_time = 48.h - sketch_scaled = 2 // Input data input_paths = [ ['SRR4050379', ['https://github.com/nf-core/test-datasets/raw/kmermaid/testdata/SRR4050379_pass_1.fastq.gz', @@ -25,9 +24,9 @@ params { ['SRR4238351', ['https://github.com/nf-core/test-datasets/raw/rnaseq/testdata/SRR4238351_subsamp.fastq.gz']], ['SRR4238355', ['https://github.com/nf-core/test-datasets/raw/rnaseq/testdata/SRR4238355_subsamp.fastq.gz']], ] + housekeeping_protein_fasta = "https://github.com/nf-core/test-datasets/raw/55ee053a1ef69fd440f8e39ac3aebae0aa6a6e69/reference/vertebrate_mammalian--205--2021-03-10.protein.fa__only_housekeeping_genes.fa.gz" + housekeeping_rna_fasta = "https://github.com/nf-core/test-datasets/raw/55ee053a1ef69fd440f8e39ac3aebae0aa6a6e69/reference/vertebrate_mammalian--205--2021-03-10.rna.fa__only_housekeeping_genes.fa.gz" - // "Other" is the smallest refseq taxonomy subdirectory: ftp://ftp.ncbi.nlm.nih.gov/refseq/release/other/ - // Protein fasta is 453 B - refseq_taxonomy = 'vertebrate_mammalian' - test_mini_refseq_download = true + reference_proteome_fasta = 'https://github.com/nf-core/test-datasets/raw/kmermaid/reference/gencode.v32.pc_translations.subsample5.randomseed0.fa' + bloomfilter_tablesize = '1e6' } diff --git a/conf/test_housekeeping_from_premade_sig.config b/conf/test_housekeeping_from_premade_sig.config deleted file mode 100644 index 2d7b5486..00000000 --- a/conf/test_housekeeping_from_premade_sig.config +++ /dev/null @@ -1,33 +0,0 @@ -/* - * ------------------------------------------------- - * Nextflow config file for running tests - * ------------------------------------------------- - * Defines bundled input files and everything required - * to run a fast and simple test. Use as follows: - * nextflow run nf-core/kmermaid -profile test - */ - -params { - config_profile_name = 'Test profile' - config_profile_description = 'Minimal test dataset to check pipeline function' - // Limit resources so that this can run on Travis - max_cpus = 2 - max_memory = 6.GB - max_time = 48.h - - sketch_scaled = 2 - // Input data - input_paths = [ - ['SRR4050379', ['https://github.com/nf-core/test-datasets/raw/kmermaid/testdata/SRR4050379_pass_1.fastq.gz', - 'https://github.com/nf-core/test-datasets/raw/kmermaid/testdata/SRR4050379_pass_2.fastq.gz']], - ['SRR4050380', ['https://github.com/nf-core/test-datasets/raw/kmermaid/testdata/SRR4050380_pass_1.fastq.gz', - 'https://github.com/nf-core/test-datasets/raw/kmermaid/testdata/SRR4050380_pass_2.fastq.gz']], - ['SRR4238351', ['https://github.com/nf-core/test-datasets/raw/rnaseq/testdata/SRR4238351_subsamp.fastq.gz']], - ['SRR4238355', ['https://github.com/nf-core/test-datasets/raw/rnaseq/testdata/SRR4238355_subsamp.fastq.gz']], - ] - - // "Other" is the smallest refseq taxonomy subdirectory: ftp://ftp.ncbi.nlm.nih.gov/refseq/release/other/ - // Protein fasta is 453 B - refseq_taxonomy = 'vertebrate_mammalian' - test_mini_refseq_download = true -} diff --git a/conf/test_download_refseq.config b/conf/test_housekeeping_from_sig.config similarity index 70% rename from conf/test_download_refseq.config rename to conf/test_housekeeping_from_sig.config index 3f624b84..21ed7073 100644 --- a/conf/test_download_refseq.config +++ b/conf/test_housekeeping_from_sig.config @@ -15,7 +15,6 @@ params { max_memory = 6.GB max_time = 48.h - // sketch_scaled = 2 // Input data input_paths = [ ['SRR4050379', ['https://github.com/nf-core/test-datasets/raw/kmermaid/testdata/SRR4050379_pass_1.fastq.gz', @@ -25,9 +24,6 @@ params { ['SRR4238351', ['https://github.com/nf-core/test-datasets/raw/rnaseq/testdata/SRR4238351_subsamp.fastq.gz']], ['SRR4238355', ['https://github.com/nf-core/test-datasets/raw/rnaseq/testdata/SRR4238355_subsamp.fastq.gz']], ] - - // "Other" is the smallest refseq taxonomy subdirectory: ftp://ftp.ncbi.nlm.nih.gov/refseq/release/other/ - // Protein fasta is 453 B - refseq_taxonomy = 'vertebrate_mammalian' - test_mini_refseq_download = true + housekeeping_protein_sig = "https://github.com/nf-core/test-datasets/raw/55ee053a1ef69fd440f8e39ac3aebae0aa6a6e69/reference/vertebrate_mammalian--205--2021-03-10.protein.fa__only_housekeeping_genes.fa__molecule-protein,dayhoff__ksize-21,30,51__scaled-2__track_abundance-true.sig" + housekeeping_rna_sig = "https://github.com/nf-core/test-datasets/raw/55ee053a1ef69fd440f8e39ac3aebae0aa6a6e69/reference/vertebrate_mammalian--205--2021-03-10.rna.fa__only_housekeeping_genes.fa__molecule-dna__ksize-21,30,51__scaled-2__track_abundance-true.sig" } From c22d7ee31e7c9b6fde2c32929caef359b535211d Mon Sep 17 00:00:00 2001 From: Olga Botvinnik Date: Thu, 11 Mar 2021 15:26:36 -0800 Subject: [PATCH 27/43] Soft link conda C libraries --- Dockerfile | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/Dockerfile b/Dockerfile index 2a407b08..4958bb1e 100755 --- a/Dockerfile +++ b/Dockerfile @@ -14,8 +14,11 @@ ENV PATH /opt/conda/envs/nf-core-kmermaid-0.1.0dev/bin:$PATH RUN conda env export --name nf-core-kmermaid-0.1.0dev > nf-core-kmermaid-0.1.0dev.yml # Install super fast rust code to remove nuisance hashes (e.g. ribosomal) from signatures -RUN git clone https://github.com/olgabot/2021-01-27-olga-remove-protein.git@olgabot/mut-warning -RUN cd 2021-01-27-olga-remove-protein && cargo build --release +RUN git clone -b olgabot/mut-warning https://github.com/olgabot/2021-01-27-olga-remove-protein.git +# Soft link all conda C-related libraries to their non-prefixed name +# for rust to be able to build the C libraries +RUN for f in $(ls $CONDA_PREFIX/bin/x86_64-conda_cos6-linux-gnu*); do g=$(echo $f | sed 's:x86_64-conda_cos6-linux-gnu-::') ; echo $g ; done +RUN cd 2021-01-27-olga-remove-protein && cargo build --release # Add "subtract" command to path ENV PATH $HOME/2021-01-27-olga-remove-protein/target/release:$PATH From 253a04d8c2803828d1c37fe59722c851cb1a59d8 Mon Sep 17 00:00:00 2001 From: Olga Botvinnik Date: Thu, 11 Mar 2021 16:45:01 -0800 Subject: [PATCH 28/43] housekeeping --> constitutive --- Dockerfile | 2 +- ..._constitutive_from_download_refseq.config} | 0 ...ig => test_constitutive_from_fasta.config} | 0 ...nfig => test_constitutive_from_sig.config} | 4 +- main.nf | 124 +++++++++--------- nextflow.config | 22 ++-- 6 files changed, 76 insertions(+), 76 deletions(-) rename conf/{test_housekeeping_from_download_refseq.config => test_constitutive_from_download_refseq.config} (100%) rename conf/{test_housekeeping_from_fasta.config => test_constitutive_from_fasta.config} (100%) rename conf/{test_housekeeping_from_sig.config => test_constitutive_from_sig.config} (70%) diff --git a/Dockerfile b/Dockerfile index 4958bb1e..911e2f04 100755 --- a/Dockerfile +++ b/Dockerfile @@ -17,7 +17,7 @@ RUN conda env export --name nf-core-kmermaid-0.1.0dev > nf-core-kmermaid-0.1.0de RUN git clone -b olgabot/mut-warning https://github.com/olgabot/2021-01-27-olga-remove-protein.git # Soft link all conda C-related libraries to their non-prefixed name # for rust to be able to build the C libraries -RUN for f in $(ls $CONDA_PREFIX/bin/x86_64-conda_cos6-linux-gnu*); do g=$(echo $f | sed 's:x86_64-conda_cos6-linux-gnu-::') ; echo $g ; done +RUN for f in $(ls $CONDA_PREFIX/bin/x86_64-conda_cos6-linux-gnu*); do g=$(echo $f | sed 's:x86_64-conda_cos6-linux-gnu-::') ; echo $g ; done RUN cd 2021-01-27-olga-remove-protein && cargo build --release # Add "subtract" command to path ENV PATH $HOME/2021-01-27-olga-remove-protein/target/release:$PATH diff --git a/conf/test_housekeeping_from_download_refseq.config b/conf/test_constitutive_from_download_refseq.config similarity index 100% rename from conf/test_housekeeping_from_download_refseq.config rename to conf/test_constitutive_from_download_refseq.config diff --git a/conf/test_housekeeping_from_fasta.config b/conf/test_constitutive_from_fasta.config similarity index 100% rename from conf/test_housekeeping_from_fasta.config rename to conf/test_constitutive_from_fasta.config diff --git a/conf/test_housekeeping_from_sig.config b/conf/test_constitutive_from_sig.config similarity index 70% rename from conf/test_housekeeping_from_sig.config rename to conf/test_constitutive_from_sig.config index 21ed7073..12a88fdb 100644 --- a/conf/test_housekeeping_from_sig.config +++ b/conf/test_constitutive_from_sig.config @@ -24,6 +24,6 @@ params { ['SRR4238351', ['https://github.com/nf-core/test-datasets/raw/rnaseq/testdata/SRR4238351_subsamp.fastq.gz']], ['SRR4238355', ['https://github.com/nf-core/test-datasets/raw/rnaseq/testdata/SRR4238355_subsamp.fastq.gz']], ] - housekeeping_protein_sig = "https://github.com/nf-core/test-datasets/raw/55ee053a1ef69fd440f8e39ac3aebae0aa6a6e69/reference/vertebrate_mammalian--205--2021-03-10.protein.fa__only_housekeeping_genes.fa__molecule-protein,dayhoff__ksize-21,30,51__scaled-2__track_abundance-true.sig" - housekeeping_rna_sig = "https://github.com/nf-core/test-datasets/raw/55ee053a1ef69fd440f8e39ac3aebae0aa6a6e69/reference/vertebrate_mammalian--205--2021-03-10.rna.fa__only_housekeeping_genes.fa__molecule-dna__ksize-21,30,51__scaled-2__track_abundance-true.sig" + constitutive_protein_sig = "https://github.com/czbiohub/test-datasets/raw/olgabot/kmermaid--constitutive-fasta/reference/vertebrate_mammalian--205--2021-03-10.protein.fa__only_constitutive_genes.fa__molecule-protein,dayhoff__ksize-21,30,51__scaled-2__track_abundance-true.sig" + constitutive_rna_sig = "https://github.com/czbiohub/test-datasets/raw/olgabot/kmermaid--constitutive-fasta/reference/vertebrate_mammalian--205--2021-03-10.rna.fa__only_constitutive_genes.fa__molecule-dna__ksize-21,30,51__scaled-2__track_abundance-true.sig" } diff --git a/main.nf b/main.nf index 10fae059..992f95d8 100644 --- a/main.nf +++ b/main.nf @@ -539,23 +539,23 @@ else { ////////////////////////////////////////////////////////// -/* -- Parse Housekeeping K-mer removal parameters -- */ +/* -- Parse constitutive K-mer removal parameters -- */ ///////////////////////////////////////////////////////// -housekeeping_protein_fasta = params.housekeeping_protein_fasta -housekeeping_rna_fasta = params.housekeeping_rna_fasta +constitutive_protein_fasta = params.constitutive_protein_fasta +constitutive_rna_fasta = params.constitutive_rna_fasta -housekeeping_protein_sig = params.housekeeping_protein_sig -housekeeping_rna_sig = params.housekeeping_rna_sig +constitutive_protein_sig = params.constitutive_protein_sig +constitutive_rna_sig = params.constitutive_rna_sig -have_housekeeping_fastas = housekeeping_protein_fasta && housekeeping_rna_fasta -have_housekeeping_sigs = housekeeping_protein_sig && housekeeping_rna_sig -need_refseq_download = (!have_housekeeping_fastas) && (!have_housekeeping_sigs) +have_constitutive_fastas = constitutive_protein_fasta && constitutive_rna_fasta +have_constitutive_sigs = constitutive_protein_sig && constitutive_rna_sig +need_refseq_download = (!have_constitutive_fastas) && (!have_constitutive_sigs) -if (have_housekeeping_fastas) { +if (have_constitutive_fastas) { Channel.from( - ["protein", file(housekeeping_protein_fasta)], - ["rna", file(housekeeping_rna_fasta)]) - .into { ch_housekeeping_fasta; ch_refseq_moltype_to_fasta } + ["protein", file(constitutive_protein_fasta)], + ["rna", file(constitutive_rna_fasta)]) + .into { ch_constitutive_fasta; ch_refseq_moltype_to_fasta } ch_refseq_moltype_to_fasta // Check if protein molecules were even specified @@ -567,20 +567,20 @@ if (have_housekeeping_fastas) { .set{ ch_refseq_moltypes_to_download } } -if (have_housekeeping_sigs) { +if (have_constitutive_sigs) { // Use sourmash moltypes of "protein,dayhoff" instead of the original protein // as used for the fastas as that's what matches the sourmash outputs - ch_housekeeping_sig = Channel.from( - ["protein,dayhoff", file(housekeeping_protein_sig)], - ["dna", file(housekeeping_rna_sig)] + ch_constitutive_sig = Channel.from( + ["protein,dayhoff", file(constitutive_protein_sig)], + ["dna", file(constitutive_rna_sig)] ) } // Parse refseq taxonomy group to download -housekeeping_refseq_taxonomy = params.housekeeping_refseq_taxonomy +constitutive_refseq_taxonomy = params.constitutive_refseq_taxonomy ///////////////////////////////////////////////////////////// -/* -- END: Parse Housekeeping K-mer removal parameters -- */ +/* -- END: Parse constitutive K-mer removal parameters -- */ ///////////////////////////////////////////////////////////// @@ -651,12 +651,12 @@ if(params.translate_proteome_fasta) summary["Orpheum Translate Peptide fasta"] = if(params.translate_proteome_fasta) summary['Orpheum Translate Peptide ksize'] = params.translate_peptide_ksize if(params.translate_proteome_fasta) summary['Orpheum Translate Peptide molecule'] = params.translate_peptide_molecule if(params.translate_proteome_fasta) summary['Oprheum Translate Bloom filter table size'] = params.bloomfilter_tablesize -// Housekeeping k-mer removal paramters -if(params.housekeeping_protein_fasta) summary["Housekeping Peptide fasta"] = params.housekeeping_protein_fasta -if(params.housekeeping_rna_fasta) summary["Housekeping RNA fasta"] = params.housekeeping_rna_fasta -if(params.housekeeping_protein_sig) summary["Housekeping Peptide K-mer Signature"] = params.housekeeping_protein_sig -if(params.housekeeping_rna_sig) summary["Housekeping RNA K-mer Signature"] = params.housekeeping_rna_sig -if(need_refseq_download) summary["Housekeeping Refseq Taxonomy"] = params.housekeeping_refseq_taxonomy +// constitutive k-mer removal paramters +if(params.constitutive_protein_fasta) summary["Constitutive Peptide fasta"] = params.constitutive_protein_fasta +if(params.constitutive_rna_fasta) summary["Constitutive RNA fasta"] = params.constitutive_rna_fasta +if(params.constitutive_protein_sig) summary["Constitutive Peptide K-mer Signature"] = params.constitutive_protein_sig +if(params.constitutive_rna_sig) summary["Constitutive RNA K-mer Signature"] = params.constitutive_rna_sig +if(need_refseq_download) summary["Constitutive GBenes' Refseq Taxonomy"] = params.constitutive_refseq_taxonomy // Resource information summary['Max Resources'] = "$params.max_memory memory, $params.max_cpus cpus, $params.max_time time per job" if(workflow.containerEngine) summary['Container'] = "$workflow.containerEngine - $workflow.container" @@ -1512,11 +1512,11 @@ if ((params.bam || params.tenx_tgz) && !params.skip_compute && !params.skip_sig_ } -if (!params.skip_remove_housekeeping_genes) { +if (!params.skip_remove_constitutive_genes) { /////////////////////////////////////////////////////////////////////////////// /////////////////////////////////////////////////////////////////////////////// /* -- -- */ - /* -- REMOVE K-MERS FROM HOUSEKEEPING GENES -- */ + /* -- REMOVE K-MERS FROM constitutive GENES -- */ /* -- -- */ /////////////////////////////////////////////////////////////////////////////// /////////////////////////////////////////////////////////////////////////////// @@ -1534,9 +1534,9 @@ if (!params.skip_remove_housekeeping_genes) { * STEP 6 - rsync to download refeseq */ if (need_refseq_download){ - // No fastas provided for removing housekeeping genes + // No fastas provided for removing constitutive genes process download_refseq { - tag "${housekeeping_refseq_taxonomy}--${refseq_moltype}" + tag "${constitutive_refseq_taxonomy}--${refseq_moltype}" label "process_low" publishDir "${params.outdir}/reference/ncbi_refseq/", mode: 'copy' @@ -1544,11 +1544,11 @@ if (!params.skip_remove_housekeeping_genes) { val refseq_moltype from ch_refseq_moltypes_to_download output: - set val(refseq_moltype), file("${housekeeping_refseq_taxonomy}--*.${refseq_moltype}.fa.gz") into ch_refseq_fasta_to_filter + set val(refseq_moltype), file("${constitutive_refseq_taxonomy}--*.${refseq_moltype}.fa.gz") into ch_refseq_fasta_to_filter script: - output_fasta = "${housekeeping_refseq_taxonomy}--\$RELEASE_NUMBER--\$DATE.${refseq_moltype}.fa.gz" - include_fasta = params.test_mini_refseq_download ? "${housekeeping_refseq_taxonomy}.1.${refseq_moltype}.f*a.gz" : "*${refseq_moltype}.f*a.gz" + output_fasta = "${constitutive_refseq_taxonomy}--\$RELEASE_NUMBER--\$DATE.${refseq_moltype}.fa.gz" + include_fasta = params.test_mini_refseq_download ? "${constitutive_refseq_taxonomy}.1.${refseq_moltype}.f*a.gz" : "*${refseq_moltype}.f*a.gz" """ rsync \\ --prune-empty-dirs \\ @@ -1557,77 +1557,77 @@ if (!params.skip_remove_housekeeping_genes) { --recursive \\ --include '${include_fasta}' \\ --exclude '/*' \\ - rsync://ftp.ncbi.nlm.nih.gov/refseq/release/${housekeeping_refseq_taxonomy}/ . + rsync://ftp.ncbi.nlm.nih.gov/refseq/release/${constitutive_refseq_taxonomy}/ . wget https://ftp.ncbi.nlm.nih.gov/refseq/release/RELEASE_NUMBER DATE=\$(date +'%Y-%m-%d') RELEASE_NUMBER=\$(cat RELEASE_NUMBER) - gzcat ${housekeeping_refseq_taxonomy}.*.${refseq_moltype}*.gz | gzip -c - > ${output_fasta} + gzcat ${constitutive_refseq_taxonomy}.*.${refseq_moltype}*.gz | gzip -c - > ${output_fasta} """ } /////////////////////////////////////////////////////////////////////////////// /////////////////////////////////////////////////////////////////////////////// /* -- -- */ - /* -- REMOVE K-MERS FROM HOUSEKEEPING GENES -- */ + /* -- REMOVE K-MERS FROM constitutive GENES -- */ /* -- -- */ /////////////////////////////////////////////////////////////////////////////// /////////////////////////////////////////////////////////////////////////////// /* - * STEP 7 - Get only housekeeping genes from + * STEP 7 - Get only constitutive genes from */ - // Keep genes whose names match housekeeping gene regular expression pattern - process extract_fasta_housekeeping { + // Keep genes whose names match constitutive gene regular expression pattern + process extract_fasta_constitutive { tag "${fasta.baseName}" label "process_low" - publishDir "${params.outdir}/reference/housekeeping_genes/", mode: 'copy' + publishDir "${params.outdir}/reference/constitutive_genes/", mode: 'copy' input: set val(refseq_moltype), file(fasta) from ch_refseq_fasta_to_filter output: - set val(refseq_moltype), file(output_fasta_gz) into ch_housekeeping_fasta, ch_housekeeping_fasta_to_view + set val(refseq_moltype), file(output_fasta_gz) into ch_constitutive_fasta, ch_constitutive_fasta_to_view script: - output_fasta = "${fasta.baseName}__only_housekeeping_genes.fa" - output_fasta_gz = "${fasta.baseName}__only_housekeeping_genes.fa.gz" + output_fasta = "${fasta.baseName}__only_constitutive_genes.fa" + output_fasta_gz = "${fasta.baseName}__only_constitutive_genes.fa.gz" """ filter_fasta_regex.py \\ --input-fasta ${fasta} \\ --output-fasta ${output_fasta} \\ - --regex-pattern '${params.housekeeping_gene_regex}' + --regex-pattern '${params.constitutive_gene_regex}' gzip -c ${output_fasta} > ${output_fasta_gz} """ } - ch_housekeeping_fasta_to_view - .dump( tag: 'ch_housekeeping_fasta' ) + ch_constitutive_fasta_to_view + .dump( tag: 'ch_constitutive_fasta' ) } - if (!have_housekeeping_sigs) { + if (!have_constitutive_sigs) { /////////////////////////////////////////////////////////////////////////////// /////////////////////////////////////////////////////////////////////////////// /* -- -- */ - /* -- COMPUTE HOUSEKEEPING GENE K-MER SIGNATURE -- */ + /* -- COMPUTE constitutive GENE K-MER SIGNATURE -- */ /* -- -- */ /////////////////////////////////////////////////////////////////////////////// /////////////////////////////////////////////////////////////////////////////// /* - * STEP 8 - Compute Housekeeping Gene K-mer Signature + * STEP 8 - Compute constitutive Gene K-mer Signature */ - // No fastas provided for removing housekeeping genes - process compute_housekeeping_kmer_sig { + // No fastas provided for removing constitutive genes + process compute_constitutive_kmer_sig { tag "${fasta.baseName}" label "process_low" - publishDir "${params.outdir}/reference/housekeeping_genes/", mode: 'copy' + publishDir "${params.outdir}/reference/constitutive_genes/", mode: 'copy' input: val track_abundance val sketch_value_parsed val sketch_style_parsed - set val(refseq_moltype), file(fasta) from ch_housekeeping_fasta + set val(refseq_moltype), file(fasta) from ch_constitutive_fasta output: - set val(sourmash_moltypes), file(sig) into ch_housekeeping_sig + set val(sourmash_moltypes), file(sig) into ch_constitutive_sig script: is_protein = refseq_moltype == "protein" @@ -1675,14 +1675,14 @@ if (!params.skip_remove_housekeeping_genes) { .dump( tag: 'ch_sourmash_sketches_moltype_to_sig__groupTuple' ) .set { ch_sourmash_sketches_moltype_to_sigs } - ch_housekeeping_sig - .dump( tag: 'ch_housekeeping_sig' ) + ch_constitutive_sig + .dump( tag: 'ch_constitutive_sig' ) .transpose() - .dump( tag: 'ch_housekeeping_sig__transposed' ) + .dump( tag: 'ch_constitutive_sig__transposed' ) .combine( ch_sourmash_params_for_subtract, by: 0) - .dump( tag: 'ch_housekeeping_sig__transposed__combined' ) + .dump( tag: 'ch_constitutive_sig__transposed__combined' ) .combine ( ch_sourmash_sketches_moltype_to_sigs, by: 0 ) - .dump( tag: 'ch_housekeeping_sig__transposed__combined_joined' ) + .dump( tag: 'ch_constitutive_sig__transposed__combined_joined' ) .into { ch_subtract_params_with_sigs; ch_subtract_params_to_sigs_for_siglist } ch_subtract_params_to_sigs_for_siglist @@ -1712,22 +1712,22 @@ if (!params.skip_remove_housekeeping_genes) { // /////////////////////////////////////////////////////////////////////////////// // /////////////////////////////////////////////////////////////////////////////// // /* -- -- */ - // /* -- REMOVE K-MERS FROM HOUSEKEEPING GENES -- */ + // /* -- REMOVE K-MERS FROM constitutive GENES -- */ // /* -- -- */ // /////////////////////////////////////////////////////////////////////////////// // /////////////////////////////////////////////////////////////////////////////// // /* - // * STEP 9 - Remove housekeeping gene k-mers from single cells + // * STEP 9 - Remove constitutive gene k-mers from single cells // */ process subtract_houskeeping_kmers { tag "${subtract_id}" label "process_medium" - publishDir "${params.outdir}/sketches_subtract_housekeeping_kmers/${subtract_id}", mode: 'copy' + publishDir "${params.outdir}/sketches_subtract_constitutive_kmers/${subtract_id}", mode: 'copy' input: val sketch_value_parsed val sketch_style_parsed - set val(molecule), val(ksize), file(housekeeping_sig), file(sigs), file(siglist) from ch_sigs_with_houskeeping_sig_to_subtract + set val(molecule), val(ksize), file(constitutive_sig), file(sigs), file(siglist) from ch_sigs_with_houskeeping_sig_to_subtract output: set val(molecule), val(ksize), file("subtracted/*.sig") into ch_sigs_houskeeping_removed @@ -1747,7 +1747,7 @@ if (!params.skip_remove_housekeeping_genes) { --ksize ${ksize} \\ --encoding ${molecule} \\ --output subtracted/ \\ - ${housekeeping_sig} \\ + ${constitutive_sig} \\ ${siglist} """ } diff --git a/nextflow.config b/nextflow.config index 780e4b04..ee47e75f 100644 --- a/nextflow.config +++ b/nextflow.config @@ -58,16 +58,16 @@ params { save_translate_csv = false save_translate_json = false - // Housekeeping gene k-mer removal - skip_remove_housekeeping_genes = false - housekeeping_gene_regex = "ribosom|mito|ubiqui|ferritin|cytochrome|eukaryotic translation|heat shock|NADH|NADPH" - housekeeping_refseq_taxonomy = 'vertebrate_mammalian' + // constitutive gene k-mer removal + skip_remove_constitutive_genes = false + constitutive_gene_regex = "ribosom|mito|ubiqui|ferritin|cytochrome|eukaryotic translation|heat shock|NADH|NADPH" + constitutive_refseq_taxonomy = 'vertebrate_mammalian' // For testing purposes --> use a small refseq dataset test_mini_refseq_download = false - housekeeping_protein_fasta = false - housekeeping_rna_fasta = false - housekeeping_protein_sig = false - housekeeping_rna_sig = false + constitutive_protein_fasta = false + constitutive_rna_fasta = false + constitutive_protein_sig = false + constitutive_rna_sig = false // ska options split_kmer = false @@ -156,9 +156,9 @@ profiles { test_ska { includeConfig 'conf/test_ska.config' } test_bam { includeConfig 'conf/test_bam.config' } test_fastas { includeConfig 'conf/test_fastas.config' } - test_housekeeping_from_download_refseq { includeConfig 'conf/test_housekeeping_from_download_refseq.config' } - test_housekeeping_from_fasta { includeConfig 'conf/test_housekeeping_from_fasta.config' } - test_housekeeping_from_sig { includeConfig 'conf/test_housekeeping_from_sig.config' } + test_constitutive_from_download_refseq { includeConfig 'conf/test_constitutive_from_download_refseq.config' } + test_constitutive_from_fasta { includeConfig 'conf/test_constitutive_from_fasta.config' } + test_constitutive_from_sig { includeConfig 'conf/test_constitutive_from_sig.config' } test_protein_fastas { includeConfig 'conf/test_protein_fastas.config' } test_remove_ribo { includeConfig 'conf/test_remove_ribo.config' } test_sig_merge { includeConfig 'conf/test_sig_merge.config' } From d9cbf42ddd7f7c723328fdb468e643f8578435cc Mon Sep 17 00:00:00 2001 From: Olga Botvinnik Date: Mon, 15 Mar 2021 12:58:53 -0700 Subject: [PATCH 29/43] Add explicit path for conda bin --- Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Dockerfile b/Dockerfile index 911e2f04..263e0713 100755 --- a/Dockerfile +++ b/Dockerfile @@ -17,7 +17,7 @@ RUN conda env export --name nf-core-kmermaid-0.1.0dev > nf-core-kmermaid-0.1.0de RUN git clone -b olgabot/mut-warning https://github.com/olgabot/2021-01-27-olga-remove-protein.git # Soft link all conda C-related libraries to their non-prefixed name # for rust to be able to build the C libraries -RUN for f in $(ls $CONDA_PREFIX/bin/x86_64-conda_cos6-linux-gnu*); do g=$(echo $f | sed 's:x86_64-conda_cos6-linux-gnu-::') ; echo $g ; done +RUN for f in $(ls /opt/conda/envs/nf-core-kmermaid-0.1.0dev/bin/x86_64-conda_cos6-linux-gnu*); do g=$(echo $f | sed 's:x86_64-conda_cos6-linux-gnu-::') ; echo $g ; done RUN cd 2021-01-27-olga-remove-protein && cargo build --release # Add "subtract" command to path ENV PATH $HOME/2021-01-27-olga-remove-protein/target/release:$PATH From ea742f3efe01cc483c9813899c06cf378d6be7cd Mon Sep 17 00:00:00 2001 From: Olga Botvinnik Date: Mon, 15 Mar 2021 13:33:16 -0700 Subject: [PATCH 30/43] Actually do soft links --- Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Dockerfile b/Dockerfile index 263e0713..4e914ee7 100755 --- a/Dockerfile +++ b/Dockerfile @@ -17,7 +17,7 @@ RUN conda env export --name nf-core-kmermaid-0.1.0dev > nf-core-kmermaid-0.1.0de RUN git clone -b olgabot/mut-warning https://github.com/olgabot/2021-01-27-olga-remove-protein.git # Soft link all conda C-related libraries to their non-prefixed name # for rust to be able to build the C libraries -RUN for f in $(ls /opt/conda/envs/nf-core-kmermaid-0.1.0dev/bin/x86_64-conda_cos6-linux-gnu*); do g=$(echo $f | sed 's:x86_64-conda_cos6-linux-gnu-::') ; echo $g ; done +RUN for f in $(ls /opt/conda/envs/nf-core-kmermaid-0.1.0dev/bin/x86_64-conda_cos6-linux-gnu*); do g=$(echo $f | sed 's:x86_64-conda_cos6-linux-gnu-::') ; echo $g; ln -s $f $g ; done RUN cd 2021-01-27-olga-remove-protein && cargo build --release # Add "subtract" command to path ENV PATH $HOME/2021-01-27-olga-remove-protein/target/release:$PATH From 8d73e3078398f87cdf64d5b25e1c64460bcf7dfe Mon Sep 17 00:00:00 2001 From: Olga Botvinnik Date: Mon, 15 Mar 2021 13:40:37 -0700 Subject: [PATCH 31/43] Update whitespace to make dockerfile more readable --- Dockerfile | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/Dockerfile b/Dockerfile index 4e914ee7..a0cb55b8 100755 --- a/Dockerfile +++ b/Dockerfile @@ -17,7 +17,8 @@ RUN conda env export --name nf-core-kmermaid-0.1.0dev > nf-core-kmermaid-0.1.0de RUN git clone -b olgabot/mut-warning https://github.com/olgabot/2021-01-27-olga-remove-protein.git # Soft link all conda C-related libraries to their non-prefixed name # for rust to be able to build the C libraries -RUN for f in $(ls /opt/conda/envs/nf-core-kmermaid-0.1.0dev/bin/x86_64-conda_cos6-linux-gnu*); do g=$(echo $f | sed 's:x86_64-conda_cos6-linux-gnu-::') ; echo $g; ln -s $f $g ; done +RUN for f in $(ls /opt/conda/envs/nf-core-kmermaid-0.1.0dev/bin/x86_64-conda_cos6-linux-gnu*); \ + do g=$(echo $f | sed 's:x86_64-conda_cos6-linux-gnu-::') ; echo $g; ln -s $f $g ; done RUN cd 2021-01-27-olga-remove-protein && cargo build --release # Add "subtract" command to path ENV PATH $HOME/2021-01-27-olga-remove-protein/target/release:$PATH From b4e91b819837804a15314fd3999da79aef41d503 Mon Sep 17 00:00:00 2001 From: Olga Botvinnik Date: Mon, 15 Mar 2021 15:03:45 -0700 Subject: [PATCH 32/43] Add separate creation of ch_refseq_moltypes_to_download --- main.nf | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/main.nf b/main.nf index 992f95d8..32ddf86b 100644 --- a/main.nf +++ b/main.nf @@ -565,6 +565,13 @@ if (have_constitutive_fastas) { // Take only the first item, the molecule type .map{ it[0] } .set{ ch_refseq_moltypes_to_download } +} else { + // Don't look at the fastas, only check the parsed molecule types + Channel.from(['protein', 'rna']) + .filter{ + it == "protein" ? peptide_molecules.size() > 0 : nucleotide_molecules.size() > 0 + } + .set{ ch_refseq_moltypes_to_download } } if (have_constitutive_sigs) { From 403ab49d4d4152060c508115b486b66b10056a2d Mon Sep 17 00:00:00 2001 From: Olga Botvinnik Date: Mon, 15 Mar 2021 15:04:00 -0700 Subject: [PATCH 33/43] Update tests to all use mini refseq data --- conf/test.config | 1 + conf/test_bam.config | 2 ++ conf/test_fastas.config | 2 ++ conf/test_full.config | 1 + conf/test_protein_fastas.config | 2 +- conf/test_remove_ribo.config | 1 + conf/test_sig_merge.config | 1 + conf/test_tenx_tgz.config | 2 +- conf/test_translate.config | 1 + conf/test_translate_bam.config | 1 + 10 files changed, 12 insertions(+), 2 deletions(-) diff --git a/conf/test.config b/conf/test.config index 4d4dced7..58f777a6 100644 --- a/conf/test.config +++ b/conf/test.config @@ -29,4 +29,5 @@ params { ['SRR4238351', ['https://github.com/nf-core/test-datasets/raw/rnaseq/testdata/SRR4238351_subsamp.fastq.gz']], ['SRR4238355', ['https://github.com/nf-core/test-datasets/raw/rnaseq/testdata/SRR4238355_subsamp.fastq.gz']], ] + test_mini_refseq_download = true } diff --git a/conf/test_bam.config b/conf/test_bam.config index 8bcdb775..7de684af 100644 --- a/conf/test_bam.config +++ b/conf/test_bam.config @@ -28,4 +28,6 @@ params { // For bam, each fasta record represents each barcode and each should have a signature // they should not be merged, For computation on bam file using sourmash, please set true for the below flag tenx_min_umi_per_cell = 2 + test_mini_refseq_download = true + } diff --git a/conf/test_fastas.config b/conf/test_fastas.config index a6509d4e..16cdcfb6 100644 --- a/conf/test_fastas.config +++ b/conf/test_fastas.config @@ -26,4 +26,6 @@ params { ['SRR4050380', ['https://github.com/nf-core/test-datasets/raw/olgabot/kmermaid--bam-unique-names/testdata/SRR4050380_pass_concatenated.fasta']], ] + test_mini_refseq_download = true + } diff --git a/conf/test_full.config b/conf/test_full.config index 5dfaeafb..d6db2819 100644 --- a/conf/test_full.config +++ b/conf/test_full.config @@ -18,4 +18,5 @@ params { ['GM12878', ['ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR319/007/SRR3192657/SRR3192657_1.fastq.gz,ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR319/007/SRR3192657/SRR3192657_2.fastq.gz','ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR319/008/SRR3192658/SRR3192658_1.fastq.gz,ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR319/008/SRR3192658/SRR3192658_2.fastq.gz']], ['K562', ['ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR319/008/SRR3192408/SRR3192408_1.fastq.gz,ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR319/008/SRR3192408/SRR3192408_2.fastq.gz', 'ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR319/009/SRR3192409/SRR3192409_1.fastq.gz,ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR319/009/SRR3192409/SRR3192409_2.fastq.gz']] ] + test_mini_refseq_download = true } diff --git a/conf/test_protein_fastas.config b/conf/test_protein_fastas.config index ea22bcb6..0d91f942 100644 --- a/conf/test_protein_fastas.config +++ b/conf/test_protein_fastas.config @@ -29,5 +29,5 @@ params { sketch_scaled = 2 molecules = 'protein,dayhoff,hp' read_pairs = false - + test_mini_refseq_download = true } diff --git a/conf/test_remove_ribo.config b/conf/test_remove_ribo.config index 8aa689ac..884b654a 100644 --- a/conf/test_remove_ribo.config +++ b/conf/test_remove_ribo.config @@ -31,4 +31,5 @@ params { ['SRR4238351', ['https://github.com/nf-core/test-datasets/raw/rnaseq/testdata/SRR4238351_subsamp.fastq.gz']], ['SRR4238355', ['https://github.com/nf-core/test-datasets/raw/rnaseq/testdata/SRR4238355_subsamp.fastq.gz']], ] + test_mini_refseq_download = true } diff --git a/conf/test_sig_merge.config b/conf/test_sig_merge.config index 21a27939..5c761cf0 100644 --- a/conf/test_sig_merge.config +++ b/conf/test_sig_merge.config @@ -29,4 +29,5 @@ params { reference_proteome_fasta = 'https://github.com/nf-core/test-datasets/raw/kmermaid/reference/ptprc_bam_translations.fa' bloomfilter_tablesize = '1e6' + test_mini_refseq_download = true } diff --git a/conf/test_tenx_tgz.config b/conf/test_tenx_tgz.config index 39b9b2f0..62f6f3fa 100644 --- a/conf/test_tenx_tgz.config +++ b/conf/test_tenx_tgz.config @@ -29,5 +29,5 @@ params { // For bam, each fasta record represents each barcode and each should have a signature // they should not be merged, For computation on bam file using sourmash, please set true for the below flag tenx_min_umi_per_cell = 10 - shard_size = 350 + test_mini_refseq_download = true } diff --git a/conf/test_translate.config b/conf/test_translate.config index c6e488a5..5208d60d 100644 --- a/conf/test_translate.config +++ b/conf/test_translate.config @@ -25,4 +25,5 @@ params { bloomfilter_tablesize = '1e8' translate_peptide_ksize = '11' translate_peptide_molecule = 'dayhoff' + test_mini_refseq_download = true } diff --git a/conf/test_translate_bam.config b/conf/test_translate_bam.config index 15365382..27194329 100644 --- a/conf/test_translate_bam.config +++ b/conf/test_translate_bam.config @@ -31,4 +31,5 @@ params { bloomfilter_tablesize = '1e6' translate_peptide_ksize = '11' translate_peptide_molecule = 'dayhoff' + test_mini_refseq_download = true } From 9a1af8ad14256c5fd6b8a64aa82b1025019ad6ee Mon Sep 17 00:00:00 2001 From: Olga Botvinnik Date: Mon, 15 Mar 2021 15:14:32 -0700 Subject: [PATCH 34/43] Pipeline is running! --- conf/test.config | 3 +++ conf/test_bam.config | 3 ++- conf/test_fastas.config | 4 ++-- conf/test_full.config | 1 - conf/test_protein_fastas.config | 4 +++- conf/test_remove_ribo.config | 4 +++- conf/test_sig_merge.config | 4 +++- conf/test_tenx_tgz.config | 3 ++- conf/test_translate.config | 6 +++++- conf/test_translate_bam.config | 4 +++- main.nf | 11 ++++++++++- 11 files changed, 36 insertions(+), 11 deletions(-) diff --git a/conf/test.config b/conf/test.config index 58f777a6..92549b30 100644 --- a/conf/test.config +++ b/conf/test.config @@ -29,5 +29,8 @@ params { ['SRR4238351', ['https://github.com/nf-core/test-datasets/raw/rnaseq/testdata/SRR4238351_subsamp.fastq.gz']], ['SRR4238355', ['https://github.com/nf-core/test-datasets/raw/rnaseq/testdata/SRR4238355_subsamp.fastq.gz']], ] + // Remove constitutively expressed genes test_mini_refseq_download = true + constitutive_protein_sig = "https://github.com/czbiohub/test-datasets/raw/olgabot/kmermaid--constitutive-fasta/reference/vertebrate_mammalian--205--2021-03-10.protein.fa__only_constitutive_genes.fa__molecule-protein,dayhoff__ksize-21,30,51__scaled-2__track_abundance-true.sig" + // constitutive_rna_sig = "https://github.com/czbiohub/test-datasets/raw/olgabot/kmermaid--constitutive-fasta/reference/vertebrate_mammalian--205--2021-03-10.rna.fa__only_constitutive_genes.fa__molecule-dna__ksize-21,30,51__scaled-2__track_abundance-true.sig" } diff --git a/conf/test_bam.config b/conf/test_bam.config index 7de684af..1f6dff7a 100644 --- a/conf/test_bam.config +++ b/conf/test_bam.config @@ -28,6 +28,7 @@ params { // For bam, each fasta record represents each barcode and each should have a signature // they should not be merged, For computation on bam file using sourmash, please set true for the below flag tenx_min_umi_per_cell = 2 - test_mini_refseq_download = true + constitutive_protein_sig = "https://github.com/czbiohub/test-datasets/raw/olgabot/kmermaid--constitutive-fasta/reference/vertebrate_mammalian--205--2021-03-10.protein.fa__only_constitutive_genes.fa__molecule-protein,dayhoff__ksize-21,30,51__scaled-2__track_abundance-true.sig" + constitutive_rna_sig = "https://github.com/czbiohub/test-datasets/raw/olgabot/kmermaid--constitutive-fasta/reference/vertebrate_mammalian--205--2021-03-10.rna.fa__only_constitutive_genes.fa__molecule-dna__ksize-21,30,51__scaled-2__track_abundance-true.sig" } diff --git a/conf/test_fastas.config b/conf/test_fastas.config index 16cdcfb6..34ea3dbc 100644 --- a/conf/test_fastas.config +++ b/conf/test_fastas.config @@ -26,6 +26,6 @@ params { ['SRR4050380', ['https://github.com/nf-core/test-datasets/raw/olgabot/kmermaid--bam-unique-names/testdata/SRR4050380_pass_concatenated.fasta']], ] - test_mini_refseq_download = true - + constitutive_protein_sig = "https://github.com/czbiohub/test-datasets/raw/olgabot/kmermaid--constitutive-fasta/reference/vertebrate_mammalian--205--2021-03-10.protein.fa__only_constitutive_genes.fa__molecule-protein,dayhoff__ksize-21,30,51__scaled-2__track_abundance-true.sig" + constitutive_rna_sig = "https://github.com/czbiohub/test-datasets/raw/olgabot/kmermaid--constitutive-fasta/reference/vertebrate_mammalian--205--2021-03-10.rna.fa__only_constitutive_genes.fa__molecule-dna__ksize-21,30,51__scaled-2__track_abundance-true.sig" } diff --git a/conf/test_full.config b/conf/test_full.config index d6db2819..5dfaeafb 100644 --- a/conf/test_full.config +++ b/conf/test_full.config @@ -18,5 +18,4 @@ params { ['GM12878', ['ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR319/007/SRR3192657/SRR3192657_1.fastq.gz,ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR319/007/SRR3192657/SRR3192657_2.fastq.gz','ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR319/008/SRR3192658/SRR3192658_1.fastq.gz,ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR319/008/SRR3192658/SRR3192658_2.fastq.gz']], ['K562', ['ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR319/008/SRR3192408/SRR3192408_1.fastq.gz,ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR319/008/SRR3192408/SRR3192408_2.fastq.gz', 'ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR319/009/SRR3192409/SRR3192409_1.fastq.gz,ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR319/009/SRR3192409/SRR3192409_2.fastq.gz']] ] - test_mini_refseq_download = true } diff --git a/conf/test_protein_fastas.config b/conf/test_protein_fastas.config index 0d91f942..bd3d28ea 100644 --- a/conf/test_protein_fastas.config +++ b/conf/test_protein_fastas.config @@ -29,5 +29,7 @@ params { sketch_scaled = 2 molecules = 'protein,dayhoff,hp' read_pairs = false - test_mini_refseq_download = true + constitutive_protein_sig = "https://github.com/czbiohub/test-datasets/raw/olgabot/kmermaid--constitutive-fasta/reference/vertebrate_mammalian--205--2021-03-10.protein.fa__only_constitutive_genes.fa__molecule-protein,dayhoff__ksize-21,30,51__scaled-2__track_abundance-true.sig" + constitutive_rna_sig = "https://github.com/czbiohub/test-datasets/raw/olgabot/kmermaid--constitutive-fasta/reference/vertebrate_mammalian--205--2021-03-10.rna.fa__only_constitutive_genes.fa__molecule-dna__ksize-21,30,51__scaled-2__track_abundance-true.sig" + } diff --git a/conf/test_remove_ribo.config b/conf/test_remove_ribo.config index 884b654a..40b320be 100644 --- a/conf/test_remove_ribo.config +++ b/conf/test_remove_ribo.config @@ -31,5 +31,7 @@ params { ['SRR4238351', ['https://github.com/nf-core/test-datasets/raw/rnaseq/testdata/SRR4238351_subsamp.fastq.gz']], ['SRR4238355', ['https://github.com/nf-core/test-datasets/raw/rnaseq/testdata/SRR4238355_subsamp.fastq.gz']], ] - test_mini_refseq_download = true + constitutive_protein_sig = "https://github.com/czbiohub/test-datasets/raw/olgabot/kmermaid--constitutive-fasta/reference/vertebrate_mammalian--205--2021-03-10.protein.fa__only_constitutive_genes.fa__molecule-protein,dayhoff__ksize-21,30,51__scaled-2__track_abundance-true.sig" + constitutive_rna_sig = "https://github.com/czbiohub/test-datasets/raw/olgabot/kmermaid--constitutive-fasta/reference/vertebrate_mammalian--205--2021-03-10.rna.fa__only_constitutive_genes.fa__molecule-dna__ksize-21,30,51__scaled-2__track_abundance-true.sig" + } diff --git a/conf/test_sig_merge.config b/conf/test_sig_merge.config index 5c761cf0..42dbd539 100644 --- a/conf/test_sig_merge.config +++ b/conf/test_sig_merge.config @@ -29,5 +29,7 @@ params { reference_proteome_fasta = 'https://github.com/nf-core/test-datasets/raw/kmermaid/reference/ptprc_bam_translations.fa' bloomfilter_tablesize = '1e6' - test_mini_refseq_download = true + constitutive_protein_sig = "https://github.com/czbiohub/test-datasets/raw/olgabot/kmermaid--constitutive-fasta/reference/vertebrate_mammalian--205--2021-03-10.protein.fa__only_constitutive_genes.fa__molecule-protein,dayhoff__ksize-21,30,51__scaled-2__track_abundance-true.sig" + constitutive_rna_sig = "https://github.com/czbiohub/test-datasets/raw/olgabot/kmermaid--constitutive-fasta/reference/vertebrate_mammalian--205--2021-03-10.rna.fa__only_constitutive_genes.fa__molecule-dna__ksize-21,30,51__scaled-2__track_abundance-true.sig" + } diff --git a/conf/test_tenx_tgz.config b/conf/test_tenx_tgz.config index 62f6f3fa..648c5f30 100644 --- a/conf/test_tenx_tgz.config +++ b/conf/test_tenx_tgz.config @@ -29,5 +29,6 @@ params { // For bam, each fasta record represents each barcode and each should have a signature // they should not be merged, For computation on bam file using sourmash, please set true for the below flag tenx_min_umi_per_cell = 10 - test_mini_refseq_download = true + constitutive_protein_sig = "https://github.com/czbiohub/test-datasets/raw/olgabot/kmermaid--constitutive-fasta/reference/vertebrate_mammalian--205--2021-03-10.protein.fa__only_constitutive_genes.fa__molecule-protein,dayhoff__ksize-21,30,51__scaled-2__track_abundance-true.sig" + constitutive_rna_sig = "https://github.com/czbiohub/test-datasets/raw/olgabot/kmermaid--constitutive-fasta/reference/vertebrate_mammalian--205--2021-03-10.rna.fa__only_constitutive_genes.fa__molecule-dna__ksize-21,30,51__scaled-2__track_abundance-true.sig" } diff --git a/conf/test_translate.config b/conf/test_translate.config index 5208d60d..ba3b3c09 100644 --- a/conf/test_translate.config +++ b/conf/test_translate.config @@ -25,5 +25,9 @@ params { bloomfilter_tablesize = '1e8' translate_peptide_ksize = '11' translate_peptide_molecule = 'dayhoff' - test_mini_refseq_download = true + + // Remove constitutively expressed genes + constitutive_protein_sig = "https://github.com/czbiohub/test-datasets/raw/olgabot/kmermaid--constitutive-fasta/reference/vertebrate_mammalian--205--2021-03-10.protein.fa__only_constitutive_genes.fa__molecule-protein,dayhoff__ksize-21,30,51__scaled-2__track_abundance-true.sig" + constitutive_rna_sig = "https://github.com/czbiohub/test-datasets/raw/olgabot/kmermaid--constitutive-fasta/reference/vertebrate_mammalian--205--2021-03-10.rna.fa__only_constitutive_genes.fa__molecule-dna__ksize-21,30,51__scaled-2__track_abundance-true.sig" + } diff --git a/conf/test_translate_bam.config b/conf/test_translate_bam.config index 27194329..ae9d5c8b 100644 --- a/conf/test_translate_bam.config +++ b/conf/test_translate_bam.config @@ -31,5 +31,7 @@ params { bloomfilter_tablesize = '1e6' translate_peptide_ksize = '11' translate_peptide_molecule = 'dayhoff' - test_mini_refseq_download = true + constitutive_protein_sig = "https://github.com/czbiohub/test-datasets/raw/olgabot/kmermaid--constitutive-fasta/reference/vertebrate_mammalian--205--2021-03-10.protein.fa__only_constitutive_genes.fa__molecule-protein,dayhoff__ksize-21,30,51__scaled-2__track_abundance-true.sig" + constitutive_rna_sig = "https://github.com/czbiohub/test-datasets/raw/olgabot/kmermaid--constitutive-fasta/reference/vertebrate_mammalian--205--2021-03-10.rna.fa__only_constitutive_genes.fa__molecule-dna__ksize-21,30,51__scaled-2__track_abundance-true.sig" + } diff --git a/main.nf b/main.nf index 32ddf86b..ceed6dc9 100644 --- a/main.nf +++ b/main.nf @@ -581,6 +581,15 @@ if (have_constitutive_sigs) { ["protein,dayhoff", file(constitutive_protein_sig)], ["dna", file(constitutive_rna_sig)] ) + + ch_refseq_moltype_to_fasta + // Check if protein molecules were even specified + .filter{ + it[0] == "protein" ? peptide_molecules.size() > 0 : nucleotide_molecules.size() > 0 + } + // Take only the first item, the molecule type + .map{ it[0] } + .set{ ch_refseq_moltypes_to_download } } @@ -1568,7 +1577,7 @@ if (!params.skip_remove_constitutive_genes) { wget https://ftp.ncbi.nlm.nih.gov/refseq/release/RELEASE_NUMBER DATE=\$(date +'%Y-%m-%d') RELEASE_NUMBER=\$(cat RELEASE_NUMBER) - gzcat ${constitutive_refseq_taxonomy}.*.${refseq_moltype}*.gz | gzip -c - > ${output_fasta} + zcat ${constitutive_refseq_taxonomy}.*.${refseq_moltype}*.gz | gzip -c - > ${output_fasta} """ } From a565bd36462a90d095ccc01a9f6a77c0d92d6401 Mon Sep 17 00:00:00 2001 From: Olga Botvinnik Date: Mon, 15 Mar 2021 15:51:20 -0700 Subject: [PATCH 35/43] Get "sourmash compare" to run --- main.nf | 101 +++++++++++++++++++++++++++----------------------------- 1 file changed, 49 insertions(+), 52 deletions(-) diff --git a/main.nf b/main.nf index ceed6dc9..065020d3 100644 --- a/main.nf +++ b/main.nf @@ -446,6 +446,8 @@ Channel.from(params.ksizes?.toString().tokenize(',')) molecules = params.molecules?.toString().tokenize(',') nucleotide_molecules = molecules.findAll { it == "dna" } peptide_molecules = molecules.findAll { it != "dna" } +// have_protein_input = params.translate_proteome_fasta || params.protein_fastas || protein_input +// peptide_molecules = peptide_molecules_comma_separated = peptide_molecules.join(",") peptide_molecule_flags = peptide_molecules.collect { it -> "--${it}" }.join ( " " ) @@ -1721,8 +1723,8 @@ if (!params.skip_remove_constitutive_genes) { .map{ [it[0], it[2], it[1], it[3]] } .dump ( tag: 'ch_subtract_params_with_sigs__map' ) .combine( ch_subtract_params_with_siglist, by: [0, 1] ) - .dump( tag: 'ch_sigs_with_houskeeping_sig_to_subtract' ) - .set { ch_sigs_with_houskeeping_sig_to_subtract } + .dump( tag: 'ch_sigs_with_constitutive_sig_to_subtract' ) + .set { ch_sigs_with_constitutive_sig_to_subtract } // /////////////////////////////////////////////////////////////////////////////// @@ -1735,7 +1737,7 @@ if (!params.skip_remove_constitutive_genes) { // /* // * STEP 9 - Remove constitutive gene k-mers from single cells // */ - process subtract_houskeeping_kmers { + process subtract_constitutive_kmers { tag "${subtract_id}" label "process_medium" publishDir "${params.outdir}/sketches_subtract_constitutive_kmers/${subtract_id}", mode: 'copy' @@ -1743,10 +1745,10 @@ if (!params.skip_remove_constitutive_genes) { input: val sketch_value_parsed val sketch_style_parsed - set val(molecule), val(ksize), file(constitutive_sig), file(sigs), file(siglist) from ch_sigs_with_houskeeping_sig_to_subtract + set val(molecule), val(ksize), file(constitutive_sig), file(sigs), file(siglist) from ch_sigs_with_constitutive_sig_to_subtract output: - set val(molecule), val(ksize), file("subtracted/*.sig") into ch_sigs_houskeeping_removed + set val(molecule), val(ksize), file("subtracted/*.sig") into ch_sigs_constitutive_removed script: subtract_id = "${molecule}__k-${ksize}" @@ -1767,6 +1769,29 @@ if (!params.skip_remove_constitutive_genes) { ${siglist} """ } + + ch_sigs_constitutive_removed + // .groupTuple( by: [0, 1] ) + .transpose( by: 2 ) + .set{ ch_sourmash_sketches_to_compare } + +} else { + ch_sourmash_sketches_merged + .map { [tuple(it[2].split(",")), it[4]] } + .dump(tag: 'ch_sourmash_sketches_merged__map_split' ) + .transpose() + .dump(tag: 'ch_sourmash_sketches_merged__map_split__tranpose' ) + // Perform cartesian product on the molecules with compare params + .combine( ch_sourmash_params_for_compare, by: 0) + .dump(tag: 'ch_sourmash_sketches_merged__map_split__combine' ) + // .groupTuple(by: [0, 2]) + .dump(tag: 'ch_sourmash_sketches_to_compare' ) + // Reorder so signature files are last + // moltype, ksize, signature file + .map { [it[0], it[2], it[1]] } + .set { ch_sourmash_sketches_to_compare } + + ch_sourmash_sig_describe_merged = Channel.empty() } @@ -1793,51 +1818,23 @@ if (params.split_kmer){ } // If skip_compute is true, skip compare must be specified as true as well if (!params.split_kmer && !params.skip_compare && !params.skip_compute) { - // // Combine peptide and nucleotide sketches - // sourmash_sketches_nucleotide - // .collect() - // // Set as a list so that combine does cartesian product of all signatures - // .map { it -> [it] } - // .combine( ch_ksizes_for_compare_nucleotide ) - // .dump( tag: 'sourmash_sketches_nucleotide__ksizes' ) - // .map { x -> [x[0], x[1], 'dna'] } - // .dump( tag: 'sourmash_sketches_nucleotide__ksizes__molecules' ) - // .set { sourmash_sketches_nucleotide_for_compare } - - // sourmash_sketches_peptide - // .collect() - // // Set as a list so that combine does cartesian product of all signatures - // .map { it -> [it] } - // .combine( ch_ksizes_for_compare_petide ) - // .dump( tag: 'sourmash_sketches_peptide__ksizes' ) - // .combine( ch_peptide_molecules ) - // .dump( tag: 'sourmash_sketches_peptide__ksizes__molecules' ) - // .set { sourmash_sketches_peptide_for_compare } - - // sourmash_sketches_peptide_for_compare - // .mix ( sourmash_sketches_nucleotide_for_compare ) - // .set { ch_sourmash_sketches_to_compare } - - // ch_sourmash_sketches_to_compare = Channel.empty() - - - ch_sourmash_sketches_merged = Channel.empty() + ch_sourmash_compare_sketch_params_to_sketches = Channel.create() - ch_sourmash_sketches_merged - // Drop first index (index 0) which is the cell id - // Drop the second index (index 1) which is the sketch id - // Keep only moltype - // Drop ksize - .map { [tuple(it[2].split(",")), it[4]] } - .dump(tag: 'ch_sourmash_sketches_merged__map_split' ) - .transpose() - .dump(tag: 'ch_sourmash_sketches_merged__map_split__tranpose' ) - // Perform cartesian product on the molecules with compare params - .combine( ch_sourmash_params_for_compare, by: 0) - .dump(tag: 'ch_sourmash_sketches_merged__map_split__combine' ) - .groupTuple(by: [0, 2]) - .dump(tag: 'ch_sourmash_sketches_to_compare' ) - .set { ch_sourmash_sketches_to_compare } + ch_sourmash_sketches_to_compare + .tap ( ch_sourmash_compare_sketch_params_to_sketches ) + .dump( tag: 'ch_compare_params_to_sigs_for_siglist__transpose' ) + .collectFile() { it -> + [ "${it[0]}__${it[1]}.txt", "${it[2].getFileName()}\n"] + } + .dump ( tag: 'ch_compare_params_to_sigs_for_siglist__transpose__collectfile' ) + .map { [ tuple( it.baseName.split('__') ), it] } + .map { [ it[0][0], it[0][1], it[1] ] } + .dump ( tag: 'ch_compare_params_with_siglist' ) + .combine( ch_sourmash_compare_sketch_params_to_sketches, by: [0, 1] ) + .dump( tag: 'ch_compare_params_with_siglist__add_sketches' ) + .groupTuple( by: [0, 1, 2] ) + .dump ( tag: 'ch_compare_params_with_siglist__add_sketches__groupTuple' ) + .set { ch_sourmash_params_to_siglist_sketches } process sourmash_compare_sketches { // Combine peptide and nucleotide sketches @@ -1845,8 +1842,8 @@ if (!params.split_kmer && !params.skip_compare && !params.skip_compute) { publishDir "${params.outdir}/compare_sketches", mode: 'copy' input: - // Weird order but that's how it shakes out with the groupTuple - set val(molecule), file("*.sig"), val(ksize) from ch_sourmash_sketches_to_compare + // file(sigs) is necessary to stage all the signature files present in file(siglist) + set val(molecule), val(ksize), file(siglist), file(sigs) from ch_sourmash_params_to_siglist_sketches output: file(csv) @@ -1861,7 +1858,7 @@ if (!params.split_kmer && !params.skip_compare && !params.skip_compute) { --${molecule} \\ --csv ${csv} \\ ${processes} \\ - --traverse-directory . + --from-file ${siglist} # Use --traverse-directory instead of all the files explicitly to avoid # "too many arguments" error for bash when there are lots of samples """ From f0337b87601ffe48272ea7adf76ae4f09e281a23 Mon Sep 17 00:00:00 2001 From: Olga Botvinnik Date: Mon, 15 Mar 2021 15:56:51 -0700 Subject: [PATCH 36/43] Update constitutive rna sig for all configs --- conf/test.config | 5 ++--- conf/test_bam.config | 6 ++---- conf/test_fastas.config | 5 ++--- conf/test_full.config | 1 - conf/test_protein_fastas.config | 6 ++---- conf/test_remove_ribo.config | 6 ++---- conf/test_sig_merge.config | 6 ++---- conf/test_tenx_tgz.config | 5 ++--- conf/test_translate.config | 6 ++---- conf/test_translate_bam.config | 6 ++---- 10 files changed, 18 insertions(+), 34 deletions(-) diff --git a/conf/test.config b/conf/test.config index 92549b30..f6b25ac9 100644 --- a/conf/test.config +++ b/conf/test.config @@ -17,7 +17,6 @@ params { // Input data // samples = 'testing/samples.csv' // fastas = 'testing/fastas/*.fasta' - sketch_scaled = 2 molecules = 'dna,protein,dayhoff' // read_pairs = 'testing/fastqs/*{1,2}.fastq.gz' // sra = "SRP016501" @@ -31,6 +30,6 @@ params { ] // Remove constitutively expressed genes test_mini_refseq_download = true - constitutive_protein_sig = "https://github.com/czbiohub/test-datasets/raw/olgabot/kmermaid--constitutive-fasta/reference/vertebrate_mammalian--205--2021-03-10.protein.fa__only_constitutive_genes.fa__molecule-protein,dayhoff__ksize-21,30,51__scaled-2__track_abundance-true.sig" - // constitutive_rna_sig = "https://github.com/czbiohub/test-datasets/raw/olgabot/kmermaid--constitutive-fasta/reference/vertebrate_mammalian--205--2021-03-10.rna.fa__only_constitutive_genes.fa__molecule-dna__ksize-21,30,51__scaled-2__track_abundance-true.sig" + constitutive_protein_sig = "https://github.com/czbiohub/test-datasets/raw/olgabot/kmermaid--constitutive-fasta/reference/vertebrate_mammalian--205--2021-03-10.protein.fa__only_constitutive_genes.fa__molecule-protein,dayhoff__ksize-21,30,51__scaled-10__track_abundance-true.sig" + constitutive_rna_sig = "https://github.com/czbiohub/test-datasets/raw/olgabot/kmermaid--constitutive-fasta/reference/vertebrate_mammalian--205--2021-03-15.rna.fa__only_constitutive_genes.fa__molecule-dna__ksize-21,30,51__scaled-10__track_abundance-true.sig" } diff --git a/conf/test_bam.config b/conf/test_bam.config index 1f6dff7a..12106f56 100644 --- a/conf/test_bam.config +++ b/conf/test_bam.config @@ -19,7 +19,6 @@ params { 'https://github.com/nf-core/test-datasets/raw/olgabot/kmermaid--bam-unique-names/testdata/mouse_lung.bam', 'https://github.com/nf-core/test-datasets/raw/olgabot/kmermaid--bam-unique-names/testdata/mouse_brown_fat_ptprc_plus_unaligned.bam'] // Sketch Parameters - sketch_scaled = 2 molecules = 'dna,protein,dayhoff' read_pairs = false save_fastas = "fastas" @@ -28,7 +27,6 @@ params { // For bam, each fasta record represents each barcode and each should have a signature // they should not be merged, For computation on bam file using sourmash, please set true for the below flag tenx_min_umi_per_cell = 2 - constitutive_protein_sig = "https://github.com/czbiohub/test-datasets/raw/olgabot/kmermaid--constitutive-fasta/reference/vertebrate_mammalian--205--2021-03-10.protein.fa__only_constitutive_genes.fa__molecule-protein,dayhoff__ksize-21,30,51__scaled-2__track_abundance-true.sig" - constitutive_rna_sig = "https://github.com/czbiohub/test-datasets/raw/olgabot/kmermaid--constitutive-fasta/reference/vertebrate_mammalian--205--2021-03-10.rna.fa__only_constitutive_genes.fa__molecule-dna__ksize-21,30,51__scaled-2__track_abundance-true.sig" - + constitutive_protein_sig = "https://github.com/czbiohub/test-datasets/raw/olgabot/kmermaid--constitutive-fasta/reference/vertebrate_mammalian--205--2021-03-10.protein.fa__only_constitutive_genes.fa__molecule-protein,dayhoff__ksize-21,30,51__scaled-10__track_abundance-true.sig" + constitutive_rna_sig = "https://github.com/czbiohub/test-datasets/raw/olgabot/kmermaid--constitutive-fasta/reference/vertebrate_mammalian--205--2021-03-15.rna.fa__only_constitutive_genes.fa__molecule-dna__ksize-21,30,51__scaled-10__track_abundance-true.sig" } diff --git a/conf/test_fastas.config b/conf/test_fastas.config index 34ea3dbc..3e893067 100644 --- a/conf/test_fastas.config +++ b/conf/test_fastas.config @@ -17,7 +17,6 @@ params { // Input data // samples = 'testing/samples.csv' // fastas = 'testing/fastas/*.fasta' - sketch_scaled = 2 molecules = 'dna,protein,dayhoff' // read_pairs = 'testing/fastqs/*{1,2}.fastq.gz' // sra = "SRP016501" @@ -26,6 +25,6 @@ params { ['SRR4050380', ['https://github.com/nf-core/test-datasets/raw/olgabot/kmermaid--bam-unique-names/testdata/SRR4050380_pass_concatenated.fasta']], ] - constitutive_protein_sig = "https://github.com/czbiohub/test-datasets/raw/olgabot/kmermaid--constitutive-fasta/reference/vertebrate_mammalian--205--2021-03-10.protein.fa__only_constitutive_genes.fa__molecule-protein,dayhoff__ksize-21,30,51__scaled-2__track_abundance-true.sig" - constitutive_rna_sig = "https://github.com/czbiohub/test-datasets/raw/olgabot/kmermaid--constitutive-fasta/reference/vertebrate_mammalian--205--2021-03-10.rna.fa__only_constitutive_genes.fa__molecule-dna__ksize-21,30,51__scaled-2__track_abundance-true.sig" + constitutive_protein_sig = "https://github.com/czbiohub/test-datasets/raw/olgabot/kmermaid--constitutive-fasta/reference/vertebrate_mammalian--205--2021-03-10.protein.fa__only_constitutive_genes.fa__molecule-protein,dayhoff__ksize-21,30,51__scaled-10__track_abundance-true.sig" + constitutive_rna_sig = "https://github.com/czbiohub/test-datasets/raw/olgabot/kmermaid--constitutive-fasta/reference/vertebrate_mammalian--205--2021-03-15.rna.fa__only_constitutive_genes.fa__molecule-dna__ksize-21,30,51__scaled-10__track_abundance-true.sig" } diff --git a/conf/test_full.config b/conf/test_full.config index 5dfaeafb..7c0d46dc 100644 --- a/conf/test_full.config +++ b/conf/test_full.config @@ -12,7 +12,6 @@ params { config_profile_description = 'Full test dataset to check pipeline function' // Input data for full size test - sketch_scaled = 2 molecules = 'dna,protein,dayhoff' input_paths = [ ['GM12878', ['ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR319/007/SRR3192657/SRR3192657_1.fastq.gz,ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR319/007/SRR3192657/SRR3192657_2.fastq.gz','ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR319/008/SRR3192658/SRR3192658_1.fastq.gz,ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR319/008/SRR3192658/SRR3192658_2.fastq.gz']], diff --git a/conf/test_protein_fastas.config b/conf/test_protein_fastas.config index bd3d28ea..35d88c36 100644 --- a/conf/test_protein_fastas.config +++ b/conf/test_protein_fastas.config @@ -26,10 +26,8 @@ params { ['https://github.com/czbiohub/test-datasets/raw/predictorthologs/testdata/bonobo_liver_ptprc__molecule-dayhoff__coding_reads_peptides.fasta']]] // Sketch Parameters - sketch_scaled = 2 molecules = 'protein,dayhoff,hp' read_pairs = false - constitutive_protein_sig = "https://github.com/czbiohub/test-datasets/raw/olgabot/kmermaid--constitutive-fasta/reference/vertebrate_mammalian--205--2021-03-10.protein.fa__only_constitutive_genes.fa__molecule-protein,dayhoff__ksize-21,30,51__scaled-2__track_abundance-true.sig" - constitutive_rna_sig = "https://github.com/czbiohub/test-datasets/raw/olgabot/kmermaid--constitutive-fasta/reference/vertebrate_mammalian--205--2021-03-10.rna.fa__only_constitutive_genes.fa__molecule-dna__ksize-21,30,51__scaled-2__track_abundance-true.sig" - + constitutive_protein_sig = "https://github.com/czbiohub/test-datasets/raw/olgabot/kmermaid--constitutive-fasta/reference/vertebrate_mammalian--205--2021-03-10.protein.fa__only_constitutive_genes.fa__molecule-protein,dayhoff__ksize-21,30,51__scaled-10__track_abundance-true.sig" + constitutive_rna_sig = "https://github.com/czbiohub/test-datasets/raw/olgabot/kmermaid--constitutive-fasta/reference/vertebrate_mammalian--205--2021-03-15.rna.fa__only_constitutive_genes.fa__molecule-dna__ksize-21,30,51__scaled-10__track_abundance-true.sig" } diff --git a/conf/test_remove_ribo.config b/conf/test_remove_ribo.config index 40b320be..722c42a7 100644 --- a/conf/test_remove_ribo.config +++ b/conf/test_remove_ribo.config @@ -17,7 +17,6 @@ params { // Input data // samples = 'testing/samples.csv' // fastas = 'testing/fastas/*.fasta' - sketch_scaled = 2 molecules = 'dna,protein,dayhoff' // read_pairs = 'testing/fastqs/*{1,2}.fastq.gz' // sra = "SRP016501" @@ -31,7 +30,6 @@ params { ['SRR4238351', ['https://github.com/nf-core/test-datasets/raw/rnaseq/testdata/SRR4238351_subsamp.fastq.gz']], ['SRR4238355', ['https://github.com/nf-core/test-datasets/raw/rnaseq/testdata/SRR4238355_subsamp.fastq.gz']], ] - constitutive_protein_sig = "https://github.com/czbiohub/test-datasets/raw/olgabot/kmermaid--constitutive-fasta/reference/vertebrate_mammalian--205--2021-03-10.protein.fa__only_constitutive_genes.fa__molecule-protein,dayhoff__ksize-21,30,51__scaled-2__track_abundance-true.sig" - constitutive_rna_sig = "https://github.com/czbiohub/test-datasets/raw/olgabot/kmermaid--constitutive-fasta/reference/vertebrate_mammalian--205--2021-03-10.rna.fa__only_constitutive_genes.fa__molecule-dna__ksize-21,30,51__scaled-2__track_abundance-true.sig" - + constitutive_protein_sig = "https://github.com/czbiohub/test-datasets/raw/olgabot/kmermaid--constitutive-fasta/reference/vertebrate_mammalian--205--2021-03-10.protein.fa__only_constitutive_genes.fa__molecule-protein,dayhoff__ksize-21,30,51__scaled-10__track_abundance-true.sig" + constitutive_rna_sig = "https://github.com/czbiohub/test-datasets/raw/olgabot/kmermaid--constitutive-fasta/reference/vertebrate_mammalian--205--2021-03-15.rna.fa__only_constitutive_genes.fa__molecule-dna__ksize-21,30,51__scaled-10__track_abundance-true.sig" } diff --git a/conf/test_sig_merge.config b/conf/test_sig_merge.config index 42dbd539..54d28fe8 100644 --- a/conf/test_sig_merge.config +++ b/conf/test_sig_merge.config @@ -18,7 +18,6 @@ params { bam = ['https://github.com/nf-core/test-datasets/raw/olgabot/kmermaid--bam-unique-names/testdata/mouse_lung.bam', 'https://github.com/nf-core/test-datasets/raw/olgabot/kmermaid--bam-unique-names/testdata/mouse_brown_fat_ptprc_plus_unaligned.bam'] // Sketch Parameters - sketch_scaled = 2 molecules = 'dna,protein,dayhoff' read_pairs = false save_fastas = "fastas" @@ -29,7 +28,6 @@ params { reference_proteome_fasta = 'https://github.com/nf-core/test-datasets/raw/kmermaid/reference/ptprc_bam_translations.fa' bloomfilter_tablesize = '1e6' - constitutive_protein_sig = "https://github.com/czbiohub/test-datasets/raw/olgabot/kmermaid--constitutive-fasta/reference/vertebrate_mammalian--205--2021-03-10.protein.fa__only_constitutive_genes.fa__molecule-protein,dayhoff__ksize-21,30,51__scaled-2__track_abundance-true.sig" - constitutive_rna_sig = "https://github.com/czbiohub/test-datasets/raw/olgabot/kmermaid--constitutive-fasta/reference/vertebrate_mammalian--205--2021-03-10.rna.fa__only_constitutive_genes.fa__molecule-dna__ksize-21,30,51__scaled-2__track_abundance-true.sig" - + constitutive_protein_sig = "https://github.com/czbiohub/test-datasets/raw/olgabot/kmermaid--constitutive-fasta/reference/vertebrate_mammalian--205--2021-03-10.protein.fa__only_constitutive_genes.fa__molecule-protein,dayhoff__ksize-21,30,51__scaled-10__track_abundance-true.sig" + constitutive_rna_sig = "https://github.com/czbiohub/test-datasets/raw/olgabot/kmermaid--constitutive-fasta/reference/vertebrate_mammalian--205--2021-03-15.rna.fa__only_constitutive_genes.fa__molecule-dna__ksize-21,30,51__scaled-10__track_abundance-true.sig" } diff --git a/conf/test_tenx_tgz.config b/conf/test_tenx_tgz.config index 648c5f30..292bfbf9 100644 --- a/conf/test_tenx_tgz.config +++ b/conf/test_tenx_tgz.config @@ -20,7 +20,6 @@ params { 'https://github.com/nf-core/test-datasets/raw/olgabot/kmermaid-unaligned-tgz-v3/testdata/mouse_brown_fat_ptprc_plus_unaligned.tgz' ] // Sketch Parameters - sketch_scaled = 2 molecules = 'dna,protein,dayhoff' read_pairs = false save_fastas = "fastas" @@ -29,6 +28,6 @@ params { // For bam, each fasta record represents each barcode and each should have a signature // they should not be merged, For computation on bam file using sourmash, please set true for the below flag tenx_min_umi_per_cell = 10 - constitutive_protein_sig = "https://github.com/czbiohub/test-datasets/raw/olgabot/kmermaid--constitutive-fasta/reference/vertebrate_mammalian--205--2021-03-10.protein.fa__only_constitutive_genes.fa__molecule-protein,dayhoff__ksize-21,30,51__scaled-2__track_abundance-true.sig" - constitutive_rna_sig = "https://github.com/czbiohub/test-datasets/raw/olgabot/kmermaid--constitutive-fasta/reference/vertebrate_mammalian--205--2021-03-10.rna.fa__only_constitutive_genes.fa__molecule-dna__ksize-21,30,51__scaled-2__track_abundance-true.sig" + constitutive_protein_sig = "https://github.com/czbiohub/test-datasets/raw/olgabot/kmermaid--constitutive-fasta/reference/vertebrate_mammalian--205--2021-03-10.protein.fa__only_constitutive_genes.fa__molecule-protein,dayhoff__ksize-21,30,51__scaled-10__track_abundance-true.sig" + constitutive_rna_sig = "https://github.com/czbiohub/test-datasets/raw/olgabot/kmermaid--constitutive-fasta/reference/vertebrate_mammalian--205--2021-03-15.rna.fa__only_constitutive_genes.fa__molecule-dna__ksize-21,30,51__scaled-10__track_abundance-true.sig" } diff --git a/conf/test_translate.config b/conf/test_translate.config index ba3b3c09..fa69416c 100644 --- a/conf/test_translate.config +++ b/conf/test_translate.config @@ -17,7 +17,6 @@ params { // Input data fastas = "https://github.com/nf-core/test-datasets/raw/kmermaid/reference/gencode.v32.pc_transcripts.subsample5.fa" // Sketch Parameters - sketch_scaled = 2 molecules = 'dna,protein,dayhoff' read_pairs = false @@ -27,7 +26,6 @@ params { translate_peptide_molecule = 'dayhoff' // Remove constitutively expressed genes - constitutive_protein_sig = "https://github.com/czbiohub/test-datasets/raw/olgabot/kmermaid--constitutive-fasta/reference/vertebrate_mammalian--205--2021-03-10.protein.fa__only_constitutive_genes.fa__molecule-protein,dayhoff__ksize-21,30,51__scaled-2__track_abundance-true.sig" - constitutive_rna_sig = "https://github.com/czbiohub/test-datasets/raw/olgabot/kmermaid--constitutive-fasta/reference/vertebrate_mammalian--205--2021-03-10.rna.fa__only_constitutive_genes.fa__molecule-dna__ksize-21,30,51__scaled-2__track_abundance-true.sig" - + constitutive_protein_sig = "https://github.com/czbiohub/test-datasets/raw/olgabot/kmermaid--constitutive-fasta/reference/vertebrate_mammalian--205--2021-03-10.protein.fa__only_constitutive_genes.fa__molecule-protein,dayhoff__ksize-21,30,51__scaled-10__track_abundance-true.sig" + constitutive_rna_sig = "https://github.com/czbiohub/test-datasets/raw/olgabot/kmermaid--constitutive-fasta/reference/vertebrate_mammalian--205--2021-03-15.rna.fa__only_constitutive_genes.fa__molecule-dna__ksize-21,30,51__scaled-10__track_abundance-true.sig" } diff --git a/conf/test_translate_bam.config b/conf/test_translate_bam.config index ae9d5c8b..db9a7164 100644 --- a/conf/test_translate_bam.config +++ b/conf/test_translate_bam.config @@ -18,7 +18,6 @@ params { bam = ['https://github.com/nf-core/test-datasets/raw/olgabot/kmermaid--bam-unique-names/testdata/mouse_lung.bam', 'https://github.com/nf-core/test-datasets/raw/olgabot/kmermaid--bam-unique-names/testdata/mouse_brown_fat_ptprc_plus_unaligned.bam'] // Sketch Parameters - sketch_scaled = 2 molecules = 'dna,protein,dayhoff' read_pairs = false save_fastas = "fastas" @@ -31,7 +30,6 @@ params { bloomfilter_tablesize = '1e6' translate_peptide_ksize = '11' translate_peptide_molecule = 'dayhoff' - constitutive_protein_sig = "https://github.com/czbiohub/test-datasets/raw/olgabot/kmermaid--constitutive-fasta/reference/vertebrate_mammalian--205--2021-03-10.protein.fa__only_constitutive_genes.fa__molecule-protein,dayhoff__ksize-21,30,51__scaled-2__track_abundance-true.sig" - constitutive_rna_sig = "https://github.com/czbiohub/test-datasets/raw/olgabot/kmermaid--constitutive-fasta/reference/vertebrate_mammalian--205--2021-03-10.rna.fa__only_constitutive_genes.fa__molecule-dna__ksize-21,30,51__scaled-2__track_abundance-true.sig" - + constitutive_protein_sig = "https://github.com/czbiohub/test-datasets/raw/olgabot/kmermaid--constitutive-fasta/reference/vertebrate_mammalian--205--2021-03-10.protein.fa__only_constitutive_genes.fa__molecule-protein,dayhoff__ksize-21,30,51__scaled-10__track_abundance-true.sig" + constitutive_rna_sig = "https://github.com/czbiohub/test-datasets/raw/olgabot/kmermaid--constitutive-fasta/reference/vertebrate_mammalian--205--2021-03-15.rna.fa__only_constitutive_genes.fa__molecule-dna__ksize-21,30,51__scaled-10__track_abundance-true.sig" } From dbf97f9bad463429598aa6d7797d6cb22a4c34c8 Mon Sep 17 00:00:00 2001 From: Olga Botvinnik Date: Tue, 16 Mar 2021 10:18:29 -0700 Subject: [PATCH 37/43] Add test_bam alone --- .github/workflows/ci.yml | 1 + main.nf | 16 +++++++++++----- 2 files changed, 12 insertions(+), 5 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 76b2cbeb..14376386 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -66,6 +66,7 @@ jobs: - "test --sketch_scaled false --sketch_scaled_log2 2" - "test --sketch_scaled false --sketch_num_hashes 20" - "test --sketch_scaled false --sketch_num_hashes_log2 20" + - "test_bam" - "test_bam --barcodes_file false --rename_10x_barcodes false --save_fastas false --write_barcodes_meta_csv false" - "test_bam --rename_10x_barcodes false --write_barcodes_meta_csv false" - "test_bam --skip_sig_merge" diff --git a/main.nf b/main.nf index 065020d3..ca704c05 100644 --- a/main.nf +++ b/main.nf @@ -579,15 +579,21 @@ if (have_constitutive_fastas) { if (have_constitutive_sigs) { // Use sourmash moltypes of "protein,dayhoff" instead of the original protein // as used for the fastas as that's what matches the sourmash outputs - ch_constitutive_sig = Channel.from( + Channel.from( ["protein,dayhoff", file(constitutive_protein_sig)], - ["dna", file(constitutive_rna_sig)] - ) + ["dna", file(constitutive_rna_sig)]) + .set { ch_constitutive_sig } - ch_refseq_moltype_to_fasta + // Refseq molecule types are "protein" and "rna" + Channel.from( + ["protein", file(constitutive_protein_sig)], + ["rna", file(constitutive_rna_sig)]) + .into { ch_refseq_moltype_to_sig } + + ch_refseq_moltype_to_sig // Check if protein molecules were even specified .filter{ - it[0] == "protein" ? peptide_molecules.size() > 0 : nucleotide_molecules.size() > 0 + it[0]== "protein" ? peptide_molecules.size() > 0 : nucleotide_molecules.size() > 0 } // Take only the first item, the molecule type .map{ it[0] } From 2ed90f8fa608bf5de28a41287d92784a90e75618 Mon Sep 17 00:00:00 2001 From: Olga Botvinnik Date: Tue, 16 Mar 2021 10:18:42 -0700 Subject: [PATCH 38/43] Update constitutive signatures --- conf/test.config | 4 ++-- conf/test_bam.config | 4 ++-- conf/test_constitutive_from_fasta.config | 4 ++-- conf/test_constitutive_from_sig.config | 4 ++-- conf/test_fastas.config | 4 ++-- conf/test_full.config | 3 +++ conf/test_protein_fastas.config | 4 ++-- conf/test_remove_ribo.config | 4 ++-- conf/test_sig_merge.config | 4 ++-- conf/test_tenx_tgz.config | 4 ++-- conf/test_translate.config | 4 ++-- conf/test_translate_bam.config | 4 ++-- 12 files changed, 25 insertions(+), 22 deletions(-) diff --git a/conf/test.config b/conf/test.config index f6b25ac9..cf19689a 100644 --- a/conf/test.config +++ b/conf/test.config @@ -30,6 +30,6 @@ params { ] // Remove constitutively expressed genes test_mini_refseq_download = true - constitutive_protein_sig = "https://github.com/czbiohub/test-datasets/raw/olgabot/kmermaid--constitutive-fasta/reference/vertebrate_mammalian--205--2021-03-10.protein.fa__only_constitutive_genes.fa__molecule-protein,dayhoff__ksize-21,30,51__scaled-10__track_abundance-true.sig" - constitutive_rna_sig = "https://github.com/czbiohub/test-datasets/raw/olgabot/kmermaid--constitutive-fasta/reference/vertebrate_mammalian--205--2021-03-15.rna.fa__only_constitutive_genes.fa__molecule-dna__ksize-21,30,51__scaled-10__track_abundance-true.sig" + constitutive_protein_sig = "https://github.com/czbiohub/test-datasets/raw/olgabot/kmermaid--housekeeping-fasta/reference/vertebrate_mammalian--205--2021-03-15.protein.fa__only_constitutive_genes.fa__molecule-protein,dayhoff__ksize-21,30,51__scaled-10__track_abundance-true.sig" + constitutive_rna_sig = "https://github.com/czbiohub/test-datasets/raw/olgabot/kmermaid--housekeeping-fasta/reference/vertebrate_mammalian--205--2021-03-15.rna.fa__only_constitutive_genes.fa__molecule-dna__ksize-21,30,51__scaled-10__track_abundance-true.sig" } diff --git a/conf/test_bam.config b/conf/test_bam.config index 12106f56..07579d50 100644 --- a/conf/test_bam.config +++ b/conf/test_bam.config @@ -27,6 +27,6 @@ params { // For bam, each fasta record represents each barcode and each should have a signature // they should not be merged, For computation on bam file using sourmash, please set true for the below flag tenx_min_umi_per_cell = 2 - constitutive_protein_sig = "https://github.com/czbiohub/test-datasets/raw/olgabot/kmermaid--constitutive-fasta/reference/vertebrate_mammalian--205--2021-03-10.protein.fa__only_constitutive_genes.fa__molecule-protein,dayhoff__ksize-21,30,51__scaled-10__track_abundance-true.sig" - constitutive_rna_sig = "https://github.com/czbiohub/test-datasets/raw/olgabot/kmermaid--constitutive-fasta/reference/vertebrate_mammalian--205--2021-03-15.rna.fa__only_constitutive_genes.fa__molecule-dna__ksize-21,30,51__scaled-10__track_abundance-true.sig" + constitutive_protein_sig = "https://github.com/czbiohub/test-datasets/raw/olgabot/kmermaid--housekeeping-fasta/reference/vertebrate_mammalian--205--2021-03-15.protein.fa__only_constitutive_genes.fa__molecule-protein,dayhoff__ksize-21,30,51__scaled-10__track_abundance-true.sig" + constitutive_rna_sig = "https://github.com/czbiohub/test-datasets/raw/olgabot/kmermaid--housekeeping-fasta/reference/vertebrate_mammalian--205--2021-03-15.rna.fa__only_constitutive_genes.fa__molecule-dna__ksize-21,30,51__scaled-10__track_abundance-true.sig" } diff --git a/conf/test_constitutive_from_fasta.config b/conf/test_constitutive_from_fasta.config index 2bf6ba6d..8be0ba00 100644 --- a/conf/test_constitutive_from_fasta.config +++ b/conf/test_constitutive_from_fasta.config @@ -24,8 +24,8 @@ params { ['SRR4238351', ['https://github.com/nf-core/test-datasets/raw/rnaseq/testdata/SRR4238351_subsamp.fastq.gz']], ['SRR4238355', ['https://github.com/nf-core/test-datasets/raw/rnaseq/testdata/SRR4238355_subsamp.fastq.gz']], ] - housekeeping_protein_fasta = "https://github.com/nf-core/test-datasets/raw/55ee053a1ef69fd440f8e39ac3aebae0aa6a6e69/reference/vertebrate_mammalian--205--2021-03-10.protein.fa__only_housekeeping_genes.fa.gz" - housekeeping_rna_fasta = "https://github.com/nf-core/test-datasets/raw/55ee053a1ef69fd440f8e39ac3aebae0aa6a6e69/reference/vertebrate_mammalian--205--2021-03-10.rna.fa__only_housekeeping_genes.fa.gz" + constitutive_protein_fasta = "https://github.com/czbiohub/test-datasets/raw/olgabot/kmermaid--housekeeping-fasta/reference/vertebrate_mammalian--205--2021-03-15.protein.fa__only_constitutive_genes.fa.gz" + constitutive_rna_fasta = "https://github.com/czbiohub/test-datasets/raw/olgabot/kmermaid--housekeeping-fasta/reference/vertebrate_mammalian--205--2021-03-15.rna.fa__only_constitutive_genes.fa.gz" reference_proteome_fasta = 'https://github.com/nf-core/test-datasets/raw/kmermaid/reference/gencode.v32.pc_translations.subsample5.randomseed0.fa' bloomfilter_tablesize = '1e6' diff --git a/conf/test_constitutive_from_sig.config b/conf/test_constitutive_from_sig.config index 12a88fdb..0e2bad4d 100644 --- a/conf/test_constitutive_from_sig.config +++ b/conf/test_constitutive_from_sig.config @@ -24,6 +24,6 @@ params { ['SRR4238351', ['https://github.com/nf-core/test-datasets/raw/rnaseq/testdata/SRR4238351_subsamp.fastq.gz']], ['SRR4238355', ['https://github.com/nf-core/test-datasets/raw/rnaseq/testdata/SRR4238355_subsamp.fastq.gz']], ] - constitutive_protein_sig = "https://github.com/czbiohub/test-datasets/raw/olgabot/kmermaid--constitutive-fasta/reference/vertebrate_mammalian--205--2021-03-10.protein.fa__only_constitutive_genes.fa__molecule-protein,dayhoff__ksize-21,30,51__scaled-2__track_abundance-true.sig" - constitutive_rna_sig = "https://github.com/czbiohub/test-datasets/raw/olgabot/kmermaid--constitutive-fasta/reference/vertebrate_mammalian--205--2021-03-10.rna.fa__only_constitutive_genes.fa__molecule-dna__ksize-21,30,51__scaled-2__track_abundance-true.sig" + constitutive_protein_sig = "https://github.com/czbiohub/test-datasets/raw/olgabot/kmermaid--housekeeping-fasta/reference/vertebrate_mammalian--205--2021-03-15.protein.fa__only_constitutive_genes.fa__molecule-protein,dayhoff__ksize-21,30,51__scaled-10__track_abundance-true.sig" + constitutive_rna_sig = "https://github.com/czbiohub/test-datasets/raw/olgabot/kmermaid--housekeeping-fasta/reference/vertebrate_mammalian--205--2021-03-15.rna.fa__only_constitutive_genes.fa__molecule-dna__ksize-21,30,51__scaled-10__track_abundance-true.sig" } diff --git a/conf/test_fastas.config b/conf/test_fastas.config index 3e893067..b439e03c 100644 --- a/conf/test_fastas.config +++ b/conf/test_fastas.config @@ -25,6 +25,6 @@ params { ['SRR4050380', ['https://github.com/nf-core/test-datasets/raw/olgabot/kmermaid--bam-unique-names/testdata/SRR4050380_pass_concatenated.fasta']], ] - constitutive_protein_sig = "https://github.com/czbiohub/test-datasets/raw/olgabot/kmermaid--constitutive-fasta/reference/vertebrate_mammalian--205--2021-03-10.protein.fa__only_constitutive_genes.fa__molecule-protein,dayhoff__ksize-21,30,51__scaled-10__track_abundance-true.sig" - constitutive_rna_sig = "https://github.com/czbiohub/test-datasets/raw/olgabot/kmermaid--constitutive-fasta/reference/vertebrate_mammalian--205--2021-03-15.rna.fa__only_constitutive_genes.fa__molecule-dna__ksize-21,30,51__scaled-10__track_abundance-true.sig" + constitutive_protein_sig = "https://github.com/czbiohub/test-datasets/raw/olgabot/kmermaid--housekeeping-fasta/reference/vertebrate_mammalian--205--2021-03-15.protein.fa__only_constitutive_genes.fa__molecule-protein,dayhoff__ksize-21,30,51__scaled-10__track_abundance-true.sig" + constitutive_rna_sig = "https://github.com/czbiohub/test-datasets/raw/olgabot/kmermaid--housekeeping-fasta/reference/vertebrate_mammalian--205--2021-03-15.rna.fa__only_constitutive_genes.fa__molecule-dna__ksize-21,30,51__scaled-10__track_abundance-true.sig" } diff --git a/conf/test_full.config b/conf/test_full.config index 7c0d46dc..ac6e6677 100644 --- a/conf/test_full.config +++ b/conf/test_full.config @@ -17,4 +17,7 @@ params { ['GM12878', ['ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR319/007/SRR3192657/SRR3192657_1.fastq.gz,ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR319/007/SRR3192657/SRR3192657_2.fastq.gz','ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR319/008/SRR3192658/SRR3192658_1.fastq.gz,ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR319/008/SRR3192658/SRR3192658_2.fastq.gz']], ['K562', ['ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR319/008/SRR3192408/SRR3192408_1.fastq.gz,ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR319/008/SRR3192408/SRR3192408_2.fastq.gz', 'ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR319/009/SRR3192409/SRR3192409_1.fastq.gz,ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR319/009/SRR3192409/SRR3192409_2.fastq.gz']] ] + constitutive_protein_sig = "https://github.com/czbiohub/test-datasets/raw/olgabot/kmermaid--housekeeping-fasta/reference/vertebrate_mammalian--205--2021-03-15.protein.fa__only_constitutive_genes.fa__molecule-protein,dayhoff__ksize-21,30,51__scaled-10__track_abundance-true.sig" + constitutive_rna_sig = "https://github.com/czbiohub/test-datasets/raw/olgabot/kmermaid--housekeeping-fasta/reference/vertebrate_mammalian--205--2021-03-15.rna.fa__only_constitutive_genes.fa__molecule-dna__ksize-21,30,51__scaled-10__track_abundance-true.sig" + } diff --git a/conf/test_protein_fastas.config b/conf/test_protein_fastas.config index 35d88c36..91a2325d 100644 --- a/conf/test_protein_fastas.config +++ b/conf/test_protein_fastas.config @@ -28,6 +28,6 @@ params { // Sketch Parameters molecules = 'protein,dayhoff,hp' read_pairs = false - constitutive_protein_sig = "https://github.com/czbiohub/test-datasets/raw/olgabot/kmermaid--constitutive-fasta/reference/vertebrate_mammalian--205--2021-03-10.protein.fa__only_constitutive_genes.fa__molecule-protein,dayhoff__ksize-21,30,51__scaled-10__track_abundance-true.sig" - constitutive_rna_sig = "https://github.com/czbiohub/test-datasets/raw/olgabot/kmermaid--constitutive-fasta/reference/vertebrate_mammalian--205--2021-03-15.rna.fa__only_constitutive_genes.fa__molecule-dna__ksize-21,30,51__scaled-10__track_abundance-true.sig" + constitutive_protein_sig = "https://github.com/czbiohub/test-datasets/raw/olgabot/kmermaid--housekeeping-fasta/reference/vertebrate_mammalian--205--2021-03-15.protein.fa__only_constitutive_genes.fa__molecule-protein,dayhoff__ksize-21,30,51__scaled-10__track_abundance-true.sig" + constitutive_rna_sig = "https://github.com/czbiohub/test-datasets/raw/olgabot/kmermaid--housekeeping-fasta/reference/vertebrate_mammalian--205--2021-03-15.rna.fa__only_constitutive_genes.fa__molecule-dna__ksize-21,30,51__scaled-10__track_abundance-true.sig" } diff --git a/conf/test_remove_ribo.config b/conf/test_remove_ribo.config index 722c42a7..72c2710b 100644 --- a/conf/test_remove_ribo.config +++ b/conf/test_remove_ribo.config @@ -30,6 +30,6 @@ params { ['SRR4238351', ['https://github.com/nf-core/test-datasets/raw/rnaseq/testdata/SRR4238351_subsamp.fastq.gz']], ['SRR4238355', ['https://github.com/nf-core/test-datasets/raw/rnaseq/testdata/SRR4238355_subsamp.fastq.gz']], ] - constitutive_protein_sig = "https://github.com/czbiohub/test-datasets/raw/olgabot/kmermaid--constitutive-fasta/reference/vertebrate_mammalian--205--2021-03-10.protein.fa__only_constitutive_genes.fa__molecule-protein,dayhoff__ksize-21,30,51__scaled-10__track_abundance-true.sig" - constitutive_rna_sig = "https://github.com/czbiohub/test-datasets/raw/olgabot/kmermaid--constitutive-fasta/reference/vertebrate_mammalian--205--2021-03-15.rna.fa__only_constitutive_genes.fa__molecule-dna__ksize-21,30,51__scaled-10__track_abundance-true.sig" + constitutive_protein_sig = "https://github.com/czbiohub/test-datasets/raw/olgabot/kmermaid--housekeeping-fasta/reference/vertebrate_mammalian--205--2021-03-15.protein.fa__only_constitutive_genes.fa__molecule-protein,dayhoff__ksize-21,30,51__scaled-10__track_abundance-true.sig" + constitutive_rna_sig = "https://github.com/czbiohub/test-datasets/raw/olgabot/kmermaid--housekeeping-fasta/reference/vertebrate_mammalian--205--2021-03-15.rna.fa__only_constitutive_genes.fa__molecule-dna__ksize-21,30,51__scaled-10__track_abundance-true.sig" } diff --git a/conf/test_sig_merge.config b/conf/test_sig_merge.config index 54d28fe8..ad821450 100644 --- a/conf/test_sig_merge.config +++ b/conf/test_sig_merge.config @@ -28,6 +28,6 @@ params { reference_proteome_fasta = 'https://github.com/nf-core/test-datasets/raw/kmermaid/reference/ptprc_bam_translations.fa' bloomfilter_tablesize = '1e6' - constitutive_protein_sig = "https://github.com/czbiohub/test-datasets/raw/olgabot/kmermaid--constitutive-fasta/reference/vertebrate_mammalian--205--2021-03-10.protein.fa__only_constitutive_genes.fa__molecule-protein,dayhoff__ksize-21,30,51__scaled-10__track_abundance-true.sig" - constitutive_rna_sig = "https://github.com/czbiohub/test-datasets/raw/olgabot/kmermaid--constitutive-fasta/reference/vertebrate_mammalian--205--2021-03-15.rna.fa__only_constitutive_genes.fa__molecule-dna__ksize-21,30,51__scaled-10__track_abundance-true.sig" + constitutive_protein_sig = "https://github.com/czbiohub/test-datasets/raw/olgabot/kmermaid--housekeeping-fasta/reference/vertebrate_mammalian--205--2021-03-15.protein.fa__only_constitutive_genes.fa__molecule-protein,dayhoff__ksize-21,30,51__scaled-10__track_abundance-true.sig" + constitutive_rna_sig = "https://github.com/czbiohub/test-datasets/raw/olgabot/kmermaid--housekeeping-fasta/reference/vertebrate_mammalian--205--2021-03-15.rna.fa__only_constitutive_genes.fa__molecule-dna__ksize-21,30,51__scaled-10__track_abundance-true.sig" } diff --git a/conf/test_tenx_tgz.config b/conf/test_tenx_tgz.config index 292bfbf9..eaf23f7a 100644 --- a/conf/test_tenx_tgz.config +++ b/conf/test_tenx_tgz.config @@ -28,6 +28,6 @@ params { // For bam, each fasta record represents each barcode and each should have a signature // they should not be merged, For computation on bam file using sourmash, please set true for the below flag tenx_min_umi_per_cell = 10 - constitutive_protein_sig = "https://github.com/czbiohub/test-datasets/raw/olgabot/kmermaid--constitutive-fasta/reference/vertebrate_mammalian--205--2021-03-10.protein.fa__only_constitutive_genes.fa__molecule-protein,dayhoff__ksize-21,30,51__scaled-10__track_abundance-true.sig" - constitutive_rna_sig = "https://github.com/czbiohub/test-datasets/raw/olgabot/kmermaid--constitutive-fasta/reference/vertebrate_mammalian--205--2021-03-15.rna.fa__only_constitutive_genes.fa__molecule-dna__ksize-21,30,51__scaled-10__track_abundance-true.sig" + constitutive_protein_sig = "https://github.com/czbiohub/test-datasets/raw/olgabot/kmermaid--housekeeping-fasta/reference/vertebrate_mammalian--205--2021-03-15.protein.fa__only_constitutive_genes.fa__molecule-protein,dayhoff__ksize-21,30,51__scaled-10__track_abundance-true.sig" + constitutive_rna_sig = "https://github.com/czbiohub/test-datasets/raw/olgabot/kmermaid--housekeeping-fasta/reference/vertebrate_mammalian--205--2021-03-15.rna.fa__only_constitutive_genes.fa__molecule-dna__ksize-21,30,51__scaled-10__track_abundance-true.sig" } diff --git a/conf/test_translate.config b/conf/test_translate.config index fa69416c..be799ae9 100644 --- a/conf/test_translate.config +++ b/conf/test_translate.config @@ -26,6 +26,6 @@ params { translate_peptide_molecule = 'dayhoff' // Remove constitutively expressed genes - constitutive_protein_sig = "https://github.com/czbiohub/test-datasets/raw/olgabot/kmermaid--constitutive-fasta/reference/vertebrate_mammalian--205--2021-03-10.protein.fa__only_constitutive_genes.fa__molecule-protein,dayhoff__ksize-21,30,51__scaled-10__track_abundance-true.sig" - constitutive_rna_sig = "https://github.com/czbiohub/test-datasets/raw/olgabot/kmermaid--constitutive-fasta/reference/vertebrate_mammalian--205--2021-03-15.rna.fa__only_constitutive_genes.fa__molecule-dna__ksize-21,30,51__scaled-10__track_abundance-true.sig" + constitutive_protein_sig = "https://github.com/czbiohub/test-datasets/raw/olgabot/kmermaid--housekeeping-fasta/reference/vertebrate_mammalian--205--2021-03-15.protein.fa__only_constitutive_genes.fa__molecule-protein,dayhoff__ksize-21,30,51__scaled-10__track_abundance-true.sig" + constitutive_rna_sig = "https://github.com/czbiohub/test-datasets/raw/olgabot/kmermaid--housekeeping-fasta/reference/vertebrate_mammalian--205--2021-03-15.rna.fa__only_constitutive_genes.fa__molecule-dna__ksize-21,30,51__scaled-10__track_abundance-true.sig" } diff --git a/conf/test_translate_bam.config b/conf/test_translate_bam.config index db9a7164..cc6f1cae 100644 --- a/conf/test_translate_bam.config +++ b/conf/test_translate_bam.config @@ -30,6 +30,6 @@ params { bloomfilter_tablesize = '1e6' translate_peptide_ksize = '11' translate_peptide_molecule = 'dayhoff' - constitutive_protein_sig = "https://github.com/czbiohub/test-datasets/raw/olgabot/kmermaid--constitutive-fasta/reference/vertebrate_mammalian--205--2021-03-10.protein.fa__only_constitutive_genes.fa__molecule-protein,dayhoff__ksize-21,30,51__scaled-10__track_abundance-true.sig" - constitutive_rna_sig = "https://github.com/czbiohub/test-datasets/raw/olgabot/kmermaid--constitutive-fasta/reference/vertebrate_mammalian--205--2021-03-15.rna.fa__only_constitutive_genes.fa__molecule-dna__ksize-21,30,51__scaled-10__track_abundance-true.sig" + constitutive_protein_sig = "https://github.com/czbiohub/test-datasets/raw/olgabot/kmermaid--housekeeping-fasta/reference/vertebrate_mammalian--205--2021-03-15.protein.fa__only_constitutive_genes.fa__molecule-protein,dayhoff__ksize-21,30,51__scaled-10__track_abundance-true.sig" + constitutive_rna_sig = "https://github.com/czbiohub/test-datasets/raw/olgabot/kmermaid--housekeeping-fasta/reference/vertebrate_mammalian--205--2021-03-15.rna.fa__only_constitutive_genes.fa__molecule-dna__ksize-21,30,51__scaled-10__track_abundance-true.sig" } From 6a694f0264c2dd52d180d3890c8def19730eb36e Mon Sep 17 00:00:00 2001 From: Olga Botvinnik Date: Tue, 16 Mar 2021 10:34:35 -0700 Subject: [PATCH 39/43] housekeeping --> constitutive --- .github/workflows/ci.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 14376386..7c95ece8 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -74,9 +74,9 @@ jobs: - "test_bam --barcodes_file false --rename_10x_barcodes false" - "test_bam --rename_10x_barcodes false" - "test_fastas" - - "test_housekeeping_from_download_refseq" - - "test_housekeeping_from_fasta" - - "test_housekeeping_from_sig" + - "test_constitutive_from_download_refseq" + - "test_constitutive_from_fasta" + - "test_constitutive_from_sig" - "test_protein_fastas" - "test_remove_ribo" - "test_sig_merge" From ddaed1cad89074f8b47b7affe87ac111eac2d3e6 Mon Sep 17 00:00:00 2001 From: Olga Botvinnik Date: Tue, 16 Mar 2021 14:50:13 -0700 Subject: [PATCH 40/43] Reference proteome fasta --> translate_proteome_fasta --- conf/test_translate.config | 2 +- conf/test_translate_bam.config | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/conf/test_translate.config b/conf/test_translate.config index be799ae9..c4a7bccd 100644 --- a/conf/test_translate.config +++ b/conf/test_translate.config @@ -20,7 +20,7 @@ params { molecules = 'dna,protein,dayhoff' read_pairs = false - reference_proteome_fasta = 'https://github.com/nf-core/test-datasets/raw/kmermaid/reference/gencode.v32.pc_translations.subsample5.randomseed0.fa' + translate_proteome_fasta = 'https://github.com/nf-core/test-datasets/raw/kmermaid/reference/gencode.v32.pc_translations.subsample5.randomseed0.fa' bloomfilter_tablesize = '1e8' translate_peptide_ksize = '11' translate_peptide_molecule = 'dayhoff' diff --git a/conf/test_translate_bam.config b/conf/test_translate_bam.config index cc6f1cae..4f8a5487 100644 --- a/conf/test_translate_bam.config +++ b/conf/test_translate_bam.config @@ -24,9 +24,9 @@ params { write_barcode_meta_csv = "metadata.csv" // For bam, each fasta record represents each barcode and each should have a signature // they should not be merged, For computation on bam file using sourmash, please set true for the below flag - tenx_min_umi_per_cell = 5 + tenx_min_umi_per_cell = 2 - reference_proteome_fasta = 'https://github.com/nf-core/test-datasets/raw/kmermaid/reference/ptprc_bam_translations.fa' + translate_proteome_fasta = 'https://github.com/nf-core/test-datasets/raw/kmermaid/reference/ptprc_bam_translations.fa' bloomfilter_tablesize = '1e6' translate_peptide_ksize = '11' translate_peptide_molecule = 'dayhoff' From e7ff62f2a48d6e6ecb1569e85fa72f6b030090e2 Mon Sep 17 00:00:00 2001 From: Olga Botvinnik Date: Tue, 16 Mar 2021 14:50:25 -0700 Subject: [PATCH 41/43] Move bam to input section --- conf/test_tenx_tgz.config | 1 - main.nf | 12 ++++++------ nextflow.config | 4 ++-- 3 files changed, 8 insertions(+), 9 deletions(-) diff --git a/conf/test_tenx_tgz.config b/conf/test_tenx_tgz.config index eaf23f7a..10ae33ab 100644 --- a/conf/test_tenx_tgz.config +++ b/conf/test_tenx_tgz.config @@ -26,7 +26,6 @@ params { save_intermediate_files = "/tmp/" write_barcode_meta_csv = "metadata.csv" // For bam, each fasta record represents each barcode and each should have a signature - // they should not be merged, For computation on bam file using sourmash, please set true for the below flag tenx_min_umi_per_cell = 10 constitutive_protein_sig = "https://github.com/czbiohub/test-datasets/raw/olgabot/kmermaid--housekeeping-fasta/reference/vertebrate_mammalian--205--2021-03-15.protein.fa__only_constitutive_genes.fa__molecule-protein,dayhoff__ksize-21,30,51__scaled-10__track_abundance-true.sig" constitutive_rna_sig = "https://github.com/czbiohub/test-datasets/raw/olgabot/kmermaid--housekeeping-fasta/reference/vertebrate_mammalian--205--2021-03-15.rna.fa__only_constitutive_genes.fa__molecule-dna__ksize-21,30,51__scaled-10__track_abundance-true.sig" diff --git a/main.nf b/main.nf index ca704c05..0d4890a9 100644 --- a/main.nf +++ b/main.nf @@ -588,7 +588,7 @@ if (have_constitutive_sigs) { Channel.from( ["protein", file(constitutive_protein_sig)], ["rna", file(constitutive_rna_sig)]) - .into { ch_refseq_moltype_to_sig } + .set { ch_refseq_moltype_to_sig } ch_refseq_moltype_to_sig // Check if protein molecules were even specified @@ -665,11 +665,11 @@ if (params.sketch_num_hashes_log2) summary['Sketch Sizes (log2)'] = params. if (params.sketch_scaled) summary['Sketch scaled'] = params.sketch_scaled if (params.sketch_scaled_log2) summary['Sketch scaled (log2)'] = params.sketch_scaled_log2 // 10x parameters -if(params.tenx_tgz) summary["10x .tgz"] = params.tenx_tgz -if(params.tenx_tgz) summary["10x SAM tags"] = params.tenx_tags -if(params.tenx_tgz) summary["10x Cell pattern"] = params.tenx_cell_barcode_pattern -if(params.tenx_tgz) summary["10x UMI pattern"] = params.tenx_molecular_barcode_pattern -if(params.tenx_tgz) summary['Min UMI/cell'] = params.tenx_min_umi_per_cell +if(params.tenx_tgz || params.bam) summary["10x .tgz"] = params.tenx_tgz +if(params.tenx_tgz || params.bam) summary["10x SAM tags"] = params.tenx_tags +if(params.tenx_tgz || params.bam) summary["10x Cell pattern"] = params.tenx_cell_barcode_pattern +if(params.tenx_tgz || params.bam) summary["10x UMI pattern"] = params.tenx_molecular_barcode_pattern +if(params.tenx_tgz || params.bam) summary['Min UMI/cell'] = params.tenx_min_umi_per_cell // Orpheum Translate parameters if(params.translate_proteome_fasta) summary["Orpheum Translate Peptide fasta"] = params.translate_proteome_fasta if(params.translate_proteome_fasta) summary['Orpheum Translate Peptide ksize'] = params.translate_peptide_ksize diff --git a/nextflow.config b/nextflow.config index ee47e75f..7bdc2902 100644 --- a/nextflow.config +++ b/nextflow.config @@ -16,7 +16,7 @@ params { fastas = false protein_fastas = false sra = false - + bam = false input = false // Parsing 10x bam files @@ -77,7 +77,7 @@ params { save_fastas = "fastas" tenx_min_umi_per_cell = '0' write_barcode_meta_csv = false - bam = false + // 10x optional input parameters set using the below pattern // https://github.com/nextflow-io/patterns/blob/master/docs/optional-input.adoc From f188f77cf5ebc5c043813e53d4d4540c116e8f4e Mon Sep 17 00:00:00 2001 From: Olga Botvinnik Date: Tue, 16 Mar 2021 14:50:52 -0700 Subject: [PATCH 42/43] reference proteome fasta to translate_proteome_fasta in test_constitutive_from_fasta --- conf/test_constitutive_from_fasta.config | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/conf/test_constitutive_from_fasta.config b/conf/test_constitutive_from_fasta.config index 8be0ba00..ea757073 100644 --- a/conf/test_constitutive_from_fasta.config +++ b/conf/test_constitutive_from_fasta.config @@ -27,6 +27,6 @@ params { constitutive_protein_fasta = "https://github.com/czbiohub/test-datasets/raw/olgabot/kmermaid--housekeeping-fasta/reference/vertebrate_mammalian--205--2021-03-15.protein.fa__only_constitutive_genes.fa.gz" constitutive_rna_fasta = "https://github.com/czbiohub/test-datasets/raw/olgabot/kmermaid--housekeeping-fasta/reference/vertebrate_mammalian--205--2021-03-15.rna.fa__only_constitutive_genes.fa.gz" - reference_proteome_fasta = 'https://github.com/nf-core/test-datasets/raw/kmermaid/reference/gencode.v32.pc_translations.subsample5.randomseed0.fa' + translate_proteome_fasta = 'https://github.com/nf-core/test-datasets/raw/kmermaid/reference/gencode.v32.pc_translations.subsample5.randomseed0.fa' bloomfilter_tablesize = '1e6' } From 793e9f199f1de956c9e7c2b1e753c789a605c103 Mon Sep 17 00:00:00 2001 From: Olga Botvinnik Date: Tue, 16 Mar 2021 14:53:19 -0700 Subject: [PATCH 43/43] Don't fail fast for all tests to see which individual ones are failing --- .github/workflows/ci.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 7c95ece8..cdf23457 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -61,6 +61,7 @@ jobs: NXF_VER: '20.07.1' NXF_ANSI_LOG: false strategy: + fail-fast: false matrix: profile_flags: - "test --sketch_scaled false --sketch_scaled_log2 2"