From 968c0c28ea41608631856384b115499c7a517054 Mon Sep 17 00:00:00 2001 From: tdayris Date: Fri, 29 Mar 2024 16:35:52 +0100 Subject: [PATCH] 3.4.2 --- CHANGELOG.md | 7 + README.md | 42 ++--- workflow/reports/material_methods.rst | 16 +- workflow/rules/agat.smk | 40 ++--- workflow/rules/bcftools_filter_dbsnp.smk | 26 +-- workflow/rules/bedtools_merge_blacklist.smk | 3 +- workflow/rules/blacklist.smk | 16 +- workflow/rules/common.smk | 171 +++++++++++++++++++- workflow/rules/gffread.smk | 39 +---- workflow/rules/picard_dict.smk | 3 +- workflow/rules/pyfaidx.smk | 8 +- workflow/rules/pyroe.smk | 10 +- workflow/rules/samtools_faidx.smk | 3 +- workflow/rules/tabix.smk | 4 +- workflow/rules/transcript_to_gene.smk | 15 +- workflow/rules/ucsc.smk | 10 +- 16 files changed, 253 insertions(+), 160 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index dd438b0..bf9c641 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,10 @@ +# 3.4.2 + +## Features: + +* Use human readable functions to replace raw lookups +* snakemake-wrappers update to 3.7.0 + # 3.4.1 ## Features: diff --git a/README.md b/README.md index 02bad79..3c6d142 100644 --- a/README.md +++ b/README.md @@ -25,10 +25,10 @@ The tools used in this pipeline are described [here](https://github.com/tdayris/ | Step | Commands | | -------------------------------- | ---------------------------------------------------------------------------------------------------------------- | -| Download DNA Fasta from Ensembl | [ensembl-sequence](https://snakemake-wrappers.readthedocs.io/en/v3.5.2/wrappers/reference/ensembl-sequence.html) | +| Download DNA Fasta from Ensembl | [ensembl-sequence](https://snakemake-wrappers.readthedocs.io/en/v3.7.0/wrappers/reference/ensembl-sequence.html) | | Remove non-canonical chromosomes | [pyfaidx](https://github.com/mdshw5/pyfaidx) | -| Index DNA sequence | [samtools](https://snakemake-wrappers.readthedocs.io/en/v3.5.2/wrappers/samtools/faidx.html) | -| Creatse sequence Dictionary | 
[picard](https://snakemake-wrappers.readthedocs.io/en/v3.5.2/wrappers/picard/createsequencedictionary.html) | +| Index DNA sequence | [samtools](https://snakemake-wrappers.readthedocs.io/en/v3.7.0/wrappers/samtools/faidx.html) | +| Creatse sequence Dictionary | [picard](https://snakemake-wrappers.readthedocs.io/en/v3.7.0/wrappers/picard/createsequencedictionary.html) | ``` ┌────────────────────────────────────────┐ @@ -51,11 +51,11 @@ The tools used in this pipeline are described [here](https://github.com/tdayris/ | Step | Commands | | ---------------------------------------------------------- | -------------------------------------------------------------------------------------------------------------------- | -| Download GTF annotation | [ensembl-annotation](https://snakemake-wrappers.readthedocs.io/en/v3.5.2/wrappers/reference/ensembl-annotation.html) | -| Fix format errors | [Agat](https://agat.readthedocs.io/en/v3.5.2/tools/agat_convert_sp_gff2gtf.html) | -| Remove non-canonical chromosomes, based on above DNA Fasta | [Agat](https://agat.readthedocs.io/en/v3.5.2/tools/agat_sq_filter_feature_from_fasta.html) | -| Remove `` Transcript support levels | [Agat](https://agat.readthedocs.io/en/v3.5.2/tools/agat_sp_filter_feature_by_attribute_value.html) | -| Convert GTF to GenePred format | [gtf2genepred](https://snakemake-wrappers.readthedocs.io/en/v3.5.2/wrappers/ucsc/gtftogenepred.html) | +| Download GTF annotation | [ensembl-annotation](https://snakemake-wrappers.readthedocs.io/en/v3.7.0/wrappers/reference/ensembl-annotation.html) | +| Fix format errors | [Agat](https://agat.readthedocs.io/en/v3.7.0/tools/agat_convert_sp_gff2gtf.html) | +| Remove non-canonical chromosomes, based on above DNA Fasta | [Agat](https://agat.readthedocs.io/en/v3.7.0/tools/agat_sq_filter_feature_from_fasta.html) | +| Remove `` Transcript support levels | [Agat](https://agat.readthedocs.io/en/v3.7.0/tools/agat_sp_filter_feature_by_attribute_value.html) | +| Convert GTF to GenePred 
format | [gtf2genepred](https://snakemake-wrappers.readthedocs.io/en/v3.7.0/wrappers/ucsc/gtftogenepred.html) | ``` @@ -89,9 +89,9 @@ The tools used in this pipeline are described [here](https://github.com/tdayris/ | Step | Commands | | --------------------------------------------------------- | ----------------------------------------------------------------------------------------------------------- | -| Extract transcript sequences from above DNA Fasta and GTF | [gffread](https://snakemake-wrappers.readthedocs.io/en/v3.5.2/wrappers/gffread.html) | -| Index DNA sequence | [samtools](https://snakemake-wrappers.readthedocs.io/en/v3.5.2/wrappers/samtools/faidx.html) | -| Creatse sequence Dictionary | [picard](https://snakemake-wrappers.readthedocs.io/en/v3.5.2/wrappers/picard/createsequencedictionary.html) | +| Extract transcript sequences from above DNA Fasta and GTF | [gffread](https://snakemake-wrappers.readthedocs.io/en/v3.7.0/wrappers/gffread.html) | +| Index DNA sequence | [samtools](https://snakemake-wrappers.readthedocs.io/en/v3.7.0/wrappers/samtools/faidx.html) | +| Creatse sequence Dictionary | [picard](https://snakemake-wrappers.readthedocs.io/en/v3.7.0/wrappers/picard/createsequencedictionary.html) | ``` @@ -115,10 +115,10 @@ The tools used in this pipeline are described [here](https://github.com/tdayris/ | Step | Commands | | ----------------------------------------------------- | ----------------------------------------------------------------------------------------------------------- | -| Extract coding transcripts from above GTF | [Agat](https://agat.readthedocs.io/en/v3.5.2/tools/agat_sp_filter_feature_by_attribute_value.html) | -| Extract coding sequences from above DNA Fasta and GTF | [gffread](https://snakemake-wrappers.readthedocs.io/en/v3.5.2/wrappers/gffread.html) | -| Index DNA sequence | [samtools](https://snakemake-wrappers.readthedocs.io/en/v3.5.2/wrappers/samtools/faidx.html) | -| Creatse sequence Dictionary | 
[picard](https://snakemake-wrappers.readthedocs.io/en/v3.5.2/wrappers/picard/createsequencedictionary.html) | +| Extract coding transcripts from above GTF | [Agat](https://agat.readthedocs.io/en/v3.7.0/tools/agat_sp_filter_feature_by_attribute_value.html) | +| Extract coding sequences from above DNA Fasta and GTF | [gffread](https://snakemake-wrappers.readthedocs.io/en/v3.7.0/wrappers/gffread.html) | +| Index DNA sequence | [samtools](https://snakemake-wrappers.readthedocs.io/en/v3.7.0/wrappers/samtools/faidx.html) | +| Creatse sequence Dictionary | [picard](https://snakemake-wrappers.readthedocs.io/en/v3.7.0/wrappers/picard/createsequencedictionary.html) | ``` @@ -142,9 +142,9 @@ The tools used in this pipeline are described [here](https://github.com/tdayris/ | Step | Commands | | -------------------------------- | -------------------------------------------------------------------------------------------------------------------------------------------- | -| Download dbSNP variants | [ensembl-variation](https://snakemake-wrappers.readthedocs.io/en/v3.5.2/wrappers/reference/ensembl-variation.html) | -| Filter non-canonical chromosomes | [pyfaidx](https://github.com/mdshw5/pyfaidx) + [BCFTools](https://snakemake-wrappers.readthedocs.io/en/v3.5.2/wrappers/bcftools/filter.html) | -| Index variants | [tabix](https://snakemake-wrappers.readthedocs.io/en/v3.5.2/wrappers/tabix/index.html) | +| Download dbSNP variants | [ensembl-variation](https://snakemake-wrappers.readthedocs.io/en/v3.7.0/wrappers/reference/ensembl-variation.html) | +| Filter non-canonical chromosomes | [pyfaidx](https://github.com/mdshw5/pyfaidx) + [BCFTools](https://snakemake-wrappers.readthedocs.io/en/v3.7.0/wrappers/bcftools/filter.html) | +| Index variants | [tabix](https://snakemake-wrappers.readthedocs.io/en/v3.7.0/wrappers/tabix/index.html) | ``` @@ -168,8 +168,8 @@ The tools used in this pipeline are described [here](https://github.com/tdayris/ | Step | Commands | | 
----------------------------------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| Extract gene_id <-> gene_name correspondancy | [pyroe](https://snakemake-wrappers.readthedocs.io/en/v3.5.2/wrappers/pyroe/idtoname.html) | -| Extract transcript_id <-> gene_id <-> gene_name | [Agat](https://agat.readthedocs.io/en/v3.5.2/tools/agat_convert_sp_gff2tsv.html) + [XSV](https://snakemake-wrappers.readthedocs.io/en/v3.5.2/wrappers/xsv.html) | +| Extract gene_id <-> gene_name correspondancy | [pyroe](https://snakemake-wrappers.readthedocs.io/en/v3.7.0/wrappers/pyroe/idtoname.html) | +| Extract transcript_id <-> gene_id <-> gene_name | [Agat](https://agat.readthedocs.io/en/v3.7.0/tools/agat_convert_sp_gff2tsv.html) + [XSV](https://snakemake-wrappers.readthedocs.io/en/v3.7.0/wrappers/xsv.html) | ``` ┌────────────────────────────────┐ @@ -193,7 +193,7 @@ The tools used in this pipeline are described [here](https://github.com/tdayris/ | Step | Commands | | ---------------------------- | -------------------------------------------------------------------------------------------- | | Download blacklisted regions | [Github source](https://github.com/Boyle-Lab/Blacklist/tree/master/lists) | -| Merge overlapping intervals | [bedtools](https://snakemake-wrappers.readthedocs.io/en/v3.5.2/wrappers/bedtools/merge.html) | +| Merge overlapping intervals | [bedtools](https://snakemake-wrappers.readthedocs.io/en/v3.7.0/wrappers/bedtools/merge.html) | ``` diff --git a/workflow/reports/material_methods.rst b/workflow/reports/material_methods.rst index 0b399b4..2b05dc4 100644 --- a/workflow/reports/material_methods.rst +++ b/workflow/reports/material_methods.rst @@ -46,17 +46,17 @@ usage, and resutls can be found on the `Snakemake workflow`_ page. .. _Snakemake: https://snakemake.readthedocs.io .. _Github: https://github.com/tdayris/fair_genome_indexer .. 
_`Snakemake workflow`: https://snakemake.github.io/snakemake-workflow-catalog?usage=tdayris/fair_genome_indexer -.. _Picard: https://snakemake-wrappers.readthedocs.io/en/v3.5.2/wrappers/picard/createsequencedictionary.html -.. _Samtools: https://snakemake-wrappers.readthedocs.io/en/v3.5.2/wrappers/samtools/faidx.html +.. _Picard: https://snakemake-wrappers.readthedocs.io/en/v3.7.0/wrappers/picard/createsequencedictionary.html +.. _Samtools: https://snakemake-wrappers.readthedocs.io/en/v3.7.0/wrappers/samtools/faidx.html .. _Agat: https://agat.readthedocs.io/en/latest/index.html -.. _Pyroe: https://snakemake-wrappers.readthedocs.io/en/v3.5.2/wrappers/pyroe/idtoname.html +.. _Pyroe: https://snakemake-wrappers.readthedocs.io/en/v3.7.0/wrappers/pyroe/idtoname.html .. _Pyfaidx: https://github.com/mdshw5/pyfaidx -.. _GFFRead: https://snakemake-wrappers.readthedocs.io/en/v3.5.2/wrappers/gffread.html -.. _XSV: https://snakemake-wrappers.readthedocs.io/en/v3.5.2/wrappers/xsv.html -.. _BCFTools: https://snakemake-wrappers.readthedocs.io/en/v3.5.2/wrappers/bcftools/filter.html -.. _Tabix: https://snakemake-wrappers.readthedocs.io/en/v3.5.2/wrappers/tabix/index.html +.. _GFFRead: https://snakemake-wrappers.readthedocs.io/en/v3.7.0/wrappers/gffread.html +.. _XSV: https://snakemake-wrappers.readthedocs.io/en/v3.7.0/wrappers/xsv.html +.. _BCFTools: https://snakemake-wrappers.readthedocs.io/en/v3.7.0/wrappers/bcftools/filter.html +.. _Tabix: https://snakemake-wrappers.readthedocs.io/en/v3.7.0/wrappers/tabix/index.html .. _`Boyle-Lab's Github`: https://github.com/Boyle-Lab/Blacklist -.. _BEDTools: https://snakemake-wrappers.readthedocs.io/en/v3.5.2/wrappers/bedtools/merge.html +.. _BEDTools: https://snakemake-wrappers.readthedocs.io/en/v3.7.0/wrappers/bedtools/merge.html .. 
_UCSC: https://genome.ucsc.edu/FAQ/FAQformat.html :Authors: diff --git a/workflow/rules/agat.smk b/workflow/rules/agat.smk index 918390a..9f45c93 100644 --- a/workflow/rules/agat.smk +++ b/workflow/rules/agat.smk @@ -11,9 +11,8 @@ rule fair_genome_indexer_agat_config: benchmark: "benchmark/fair_genome_indexer/agat_config.tsv" params: - config=dlookup( + config=lookup_config( dpath="params/fair_genome_indexer/agat/config", - within=config, default={ "output_format": "GTF", "gff_output_version": 3, @@ -68,9 +67,7 @@ rule fair_genome_indexer_agat_convert_sp_gff2gtf: benchmark: "benchmark/fair_genome_indexer/agat_convert_sp_gff2gtf/{species}.{build}.{release}.tsv" params: - extra=dlookup( - dpath="params/fair_genome_indexer/agat/gff2gtf", within=config, default="" - ), + extra=lookup_config(dpath="params/fair_genome_indexer/agat/gff2gtf", default=""), conda: "../envs/agat.yaml" script: @@ -103,9 +100,8 @@ rule fair_genome_indexer_agat_sp_filter_feature_by_attribute_value: benchmark: "benchmark/fair_genome_indexer/agat_sp_filter_feature_by_attribute_value/{species}.{build}.{release}.tsv" params: - extra=dlookup( + extra=lookup_config( dpath="params/fair_genome_indexer/agat/select_feature_by_attribute_value", - within=config, default="--attribute 'transcript_support_level' --value '\"NA\"' --test '='", ), conda: @@ -117,25 +113,14 @@ rule fair_genome_indexer_agat_sp_filter_feature_by_attribute_value: rule fair_genome_indexer_agat_sq_filter_feature_from_fasta: input: gtf=branch( - dlookup( + lookup_config( dpath="params/fair_genome_indexer/agat/select_feature_by_attribute_value", - within=config, ), then="tmp/fair_genome_indexer/agat_sp_filter_feature_by_attribute_value/{species}.{build}.{release}.filtered.gtf", otherwise="tmp/fair_genome_indexer/agat_convert_sp_gff2gtf/{species}.{build}.{release}.format.gtf", ), - fasta=dlookup( - default="reference/sequences/{species}.{build}.{release}.dna.fasta", - query="species == '{species}' & build == '{build} & release == 
'{release}'", - key="dna_fasta", - within=genomes, - ), - fasta_index=dlookup( - query="species == '{species}' & build == '{build} & release == '{release}'", - key="dna_fai", - within=genomes, - default="reference/sequences/{species}.{build}.{release}.dna.fasta.fai", - ), + fasta=lambda wildcards: get_dna_fasta(wildcards), + fasta_index=lambda wildcards: get_dna_fai(wildcards), config="tmp/fair_genome_indexer/agat_config/config.yaml", output: gtf="reference/annotation/{species}.{build}.{release}.gtf", @@ -151,9 +136,8 @@ rule fair_genome_indexer_agat_sq_filter_feature_from_fasta: benchmark: "benchmark/fair_genome_indexer/agat_sq_filter_feature_from_fasta/{species}.{build}.{release}.tsv" params: - extra=dlookup( + extra=lookup_config( dpath="params/fair_genome_indexer/agat/filter_features", - within=config, default="", ), conda: @@ -164,12 +148,7 @@ rule fair_genome_indexer_agat_sq_filter_feature_from_fasta: use rule fair_genome_indexer_agat_sp_filter_feature_by_attribute_value as fair_genome_indexer_agat_sp_filter_feature_by_attribute_value_cdna with: input: - gtf=dlookup( - query="species == '{species} & release == '{release}' & build == '{build}'", - within=genomes, - key="gtf", - default="reference/annotation/{species}.{build}.{release}.gtf", - ), + gtf=lambda wildcards: get_gtf(wildcards), config="tmp/fair_genome_indexer/agat_config/config.yaml", output: gtf=temp( @@ -186,8 +165,7 @@ use rule fair_genome_indexer_agat_sp_filter_feature_by_attribute_value as fair_g benchmark: "benchmark/fair_genome_indexer/agat_sp_filter_feature_by_attribute_value_cdna/{species}.{build}.{release}.tsv" params: - extra=dlookup( + extra=lookup_config( dpath="params/fair_genome_indexer/agat/filter_feature_by_attribute_value", - within=config, default="--attribute transcript_biotype --value '\"protein_coding\"' --test '='", ), diff --git a/workflow/rules/bcftools_filter_dbsnp.smk b/workflow/rules/bcftools_filter_dbsnp.smk index d4ded15..cf45f7f 100644 --- 
a/workflow/rules/bcftools_filter_dbsnp.smk +++ b/workflow/rules/bcftools_filter_dbsnp.smk @@ -1,20 +1,10 @@ rule fair_genome_indexer_pyfaidx_fasta_dict_to_bed: input: - fasta=dlookup( - query="species == '{species}' & build == '{build}' & release == '{release}'", - within=genomes, - key="dna_fasta", - default="reference/sequences/{species}.{build}.{release}.dna.fasta", - ), - fai=dlookup( - query="species == '{species}' & build == '{build}' & release == '{release}'", - within=genomes, - key="dna_fai", - default="reference/sequences/{species}.{build}.{release}.dna.fasta.fai", - ), + fasta=lambda wildcards: select_fasta(wildcards), + fai=lambda wildcards: select_fai(wildcards), output: temp( - "tmp/fair_genome_indexer/pyfaidx_fasta_dict_to_bed/{species}.{build}.{release}.dna.bed" + "tmp/fair_genome_indexer/pyfaidx_fasta_dict_to_bed/{species}.{build}.{release}.{datatype}.bed" ), threads: 1 resources: @@ -22,13 +12,12 @@ rule fair_genome_indexer_pyfaidx_fasta_dict_to_bed: runtime=lambda wildcards, attempt: 5 * attempt, tmpdir=tmp, log: - "logs/fair_genome_indexer/pyfaidx_fasta_dict_to_bed/{species}.{build}.{release}.dna.log", + "logs/fair_genome_indexer/pyfaidx_fasta_dict_to_bed/{species}.{build}.{release}.{datatype}.log", benchmark: - "benchmark/fair_genome_indexer/pyfaidx_fasta_dict_to_bed/{species}.{build}.{release}.dna.tsv" + "benchmark/fair_genome_indexer/pyfaidx_fasta_dict_to_bed/{species}.{build}.{release}.{datatype}.tsv" params: - extra=dlookup( + extra=lookup_config( dpath="params/fair_genome_indexer/pyfaidx/fasta_dict_to_bed", - within=config, default="", ), conda: @@ -58,9 +47,8 @@ rule fair_genome_indexer_bcftools_filter_non_canonical_chrom: benchmark: "benchmark/fair_genome_indexer/bcftools_filter_non_canonical_chrom/{species}.{build}.{release}.all.tsv" params: - extra=dlookup( + extra=lookup_config( dpath="params/fair_genome_indexer/bedtools/filter_non_canonical_chrom", - within=config, default="", ), wrapper: diff --git 
a/workflow/rules/bedtools_merge_blacklist.smk b/workflow/rules/bedtools_merge_blacklist.smk index f36109a..0db919c 100644 --- a/workflow/rules/bedtools_merge_blacklist.smk +++ b/workflow/rules/bedtools_merge_blacklist.smk @@ -13,9 +13,8 @@ rule fair_genome_indexer_bedtools_merge_blacklist: benchmark: "benchmark/fair_genome_indexer/bedtools_merge_blacklist/{species}.{build}.{release}.tsv" params: - extra=dlookup( + extra=lookup_config( dpath="params/fair_genome_indexer/bedtools/merge", - within=config, default="-d 5", ), wrapper: diff --git a/workflow/rules/blacklist.smk b/workflow/rules/blacklist.smk index def9387..0d5cb7e 100644 --- a/workflow/rules/blacklist.smk +++ b/workflow/rules/blacklist.smk @@ -13,8 +13,8 @@ rule fair_genome_indexer_blacklist_grch38: "benchmark/fair_genome_indexer/blacklist/homo_sapiens.GRCh38.{release}.tsv" params: address="https://github.com/Boyle-Lab/Blacklist/raw/master/lists/Blacklist_v1/hg38-blacklist.bed.gz", - extra=dlookup( - dpath="params/fair_genome_indexer/wget", within=config, default="--verbose" + extra=lookup_config( + dpath="params/fair_genome_indexer/wget", default="--verbose" ), conda: "../envs/bash.yaml" @@ -32,8 +32,8 @@ use rule fair_genome_indexer_blacklist_grch38 as fair_genome_indexer_blacklist_m "benchmark/fair_genome_indexer/blacklist/mus_musculus.GRCm38.{release}.tsv" params: address="https://github.com/Boyle-Lab/Blacklist/raw/master/lists/Blacklist_v1/mm10-blacklist.bed.gz", - extra=dlookup( - dpath="params/fair_genome_indexer/wget", within=config, default="--verbose" + extra=lookup_config( + dpath="params/fair_genome_indexer/wget", default="--verbose" ), @@ -47,8 +47,8 @@ use rule fair_genome_indexer_blacklist_grch38 as fair_genome_indexer_blacklist_g "logs/fair_genome_indexer/blacklist/homo_sapiens.GRCh37.{release}.log", params: address="https://github.com/Boyle-Lab/Blacklist/raw/master/lists/Blacklist_v1/hg19-blacklist.bed.gz", - extra=dlookup( - dpath="params/fair_genome_indexer/wget", within=config, 
default="--verbose" + extra=lookup_config( + dpath="params/fair_genome_indexer/wget", default="--verbose" ), @@ -62,6 +62,6 @@ use rule fair_genome_indexer_blacklist_grch38 as fair_genome_indexer_blacklist_m "benchmark/fair_genome_indexer/blacklist/mus_musculus.NCBIM37.{release}.tsv" params: address="https://github.com/Boyle-Lab/Blacklist/blob/master/lists/Blacklist_v1/mm9-blacklist.bed.gz", - extra=dlookup( - dpath="params/fair_genome_indexer/wget", within=config, default="--verbose" + extra=lookup_config( + dpath="params/fair_genome_indexer/wget", default="--verbose" ), diff --git a/workflow/rules/common.smk b/workflow/rules/common.smk index e8eaced..437e964 100644 --- a/workflow/rules/common.smk +++ b/workflow/rules/common.smk @@ -33,7 +33,7 @@ genomes: pandas.DataFrame = pandas.read_csv( ) snakemake.utils.validate(genomes, "../schemas/genomes.schema.yaml") -snakemake_wrappers_prefix: str = "v3.5.2" +snakemake_wrappers_prefix: str = "v3.7.0" report: "../reports/workflow.rst" @@ -111,6 +111,175 @@ def dlookup( return value +def lookup_config( + dpath: str, default: str | None = None, config: dict[str, Any] = config +) -> str: + """ + Run lookup function with default parameters in order to search a key in configuration and return a default value + """ + value: str | None = default + + try: + value = lookup(dpath=dpath, within=config) + except LookupError: + value = default + except WorkflowError: + value = default + + return value + + +def lookup_genomes( + wildcards: snakemake.io.Wildcards, + key: str, + default: str | list[str] | None = None, + genomes: pandas.DataFrame = genomes, +) -> str: + """ + Run lookup function with default parameters in order to search user-provided sequence/annotation files + """ + query: str = ( + "species == '{wildcards.species}' & build == '{wildcards.build}' & release == '{wildcards.release}'".format( + wildcards=wildcards + ) + ) + return getattr(lookup(query=query, within=genomes), key, default) + + +def get_dna_fasta( + 
wildcards: snakemake.io.Wildcards, genomes: pandas.DataFrame = genomes +) -> str: + """ + Return path to the final DNA fasta sequences + """ + default: str = ( + "reference/sequences/{wildcards.species}.{wildcards.build}.{wildcards.release}.dna.fasta".format( + wildcards=wildcards + ) + ) + return lookup_genomes(wildcards, key="dna_fasta", default=default, genomes=genomes) + + +def get_cdna_fasta( + wildcards: snakemake.io.Wildcards, genomes: pandas.DataFrame = genomes +) -> str: + """ + Return path to the final cDNA fasta sequences + """ + default: str = ( + "reference/sequences/{wildcards.species}.{wildcards.build}.{wildcards.release}.cdna.fasta".format( + wildcards=wildcards + ) + ) + return lookup_genomes(wildcards, key="cdna_fasta", default=default, genomes=genomes) + + +def get_transcripts_fasta( + wildcards: snakemake.io.Wildcards, genomes: pandas.DataFrame = genomes +) -> str: + """ + Return path to the final cDNA transcripts fasta sequences + """ + default: str = ( + "reference/sequences/{wildcards.species}.{wildcards.build}.{wildcards.release}.transcripts.fasta".format( + wildcards=wildcards + ) + ) + return lookup_genomes( + wildcards, key="transcripts_fasta", default=default, genomes=genomes + ) + + +def select_fasta( + wildcards: snakemake.io.Wildcards, genomes: pandas.DataFrame = genomes +) -> str: + """ + Evaluates the {datatype} wildcard, and return the right fasta file + """ + return branch( + condition=str(wildcards.datatype).lower(), + cases={ + "dna": get_dna_fasta(wildcards), + "cdna": get_cdna_fasta(wildcards), + "transcripts": get_transcripts_fasta(wildcards), + }, + ) + + +def get_dna_fai( + wildcards: snakemake.io.Wildcards, genomes: pandas.DataFrame = genomes +) -> str: + """ + Return path to the final DNA fasta sequences index + """ + default: str = ( + "reference/sequences/{wildcards.species}.{wildcards.build}.{wildcards.release}.dna.fasta.fai".format( + wildcards=wildcards + ) + ) + return lookup_genomes(wildcards, key="dna_fai", 
default=default, genomes=genomes) + + +def get_cdna_fai( + wildcards: snakemake.io.Wildcards, genomes: pandas.DataFrame = genomes +) -> str: + """ + Return path to the final cDNA fasta sequences index + """ + default: str = ( + "reference/sequences/{wildcards.species}.{wildcards.build}.{wildcards.release}.cdna.fasta.fai".format( + wildcards=wildcards + ) + ) + return lookup_genomes(wildcards, key="cdna_fai", default=default, genomes=genomes) + + +def get_transcripts_fai( + wildcards: snakemake.io.Wildcards, genomes: pandas.DataFrame = genomes +) -> str: + """ + Return path to the final cDNA transcripts fasta sequences index + """ + default: str = ( + "reference/sequences/{wildcards.species}.{wildcards.build}.{wildcards.release}.transcripts.fasta.fai".format( + wildcards=wildcards + ) + ) + return lookup_genomes( + wildcards, key="transcripts_fai", default=default, genomes=genomes + ) + + +def select_fai( + wildcards: snakemake.io.Wildcards, genomes: pandas.DataFrame = genomes +) -> str: + """ + Evaluates the {datatype} wildcard, and return the right fasta index file + """ + return branch( + condition=str(wildcards.datatype).lower(), + cases={ + "dna": get_dna_fai(wildcards), + "cdna": get_cdna_fai(wildcards), + "transcripts": get_transcripts_fai(wildcards), + }, + ) + + +def get_gtf( + wildcards: snakemake.io.Wildcards, genomes: pandas.DataFrame = genomes +) -> str: + """ + Return path to the final genome annotation + """ + default: str = ( + "reference/annotation/{wildcards.species}.{wildcards.build}.{wildcards.release}.gtf".format( + wildcards=wildcards + ) + ) + return lookup_genomes(wildcards, key="gtf", default=default, genomes=genomes) + + def get_fair_genome_indexer_target( wildcards: snakemake.io.Wildcards, genomes: pandas.DataFrame = genomes ) -> dict[str, list[str] | str]: diff --git a/workflow/rules/gffread.smk b/workflow/rules/gffread.smk index 56c3c59..6d458b2 100644 --- a/workflow/rules/gffread.smk +++ b/workflow/rules/gffread.smk @@ -1,23 +1,8 @@ 
rule fair_genome_indexer_gffread_transcripts: input: - fasta=dlookup( - query="species == '{species}' & build == '{build}' & release == '{release}'", - within=genomes, - key="dna_fasta", - default="reference/sequences/{species}.{build}.{release}.dna.fasta", - ), - fai=dlookup( - query="species == '{species}' & build == '{build}' & release == '{release}'", - within=genomes, - key="dna_fai", - default="reference/sequences/{species}.{build}.{release}.dna.fasta.fai", - ), - annotation=dlookup( - query="species == '{species} & release == '{release}' & build == '{build}'", - within=genomes, - key="gtf", - default="reference/annotation/{species}.{build}.{release}.gtf", - ), + fasta=lambda wildcards: get_dna_fasta(wildcards), + fai=lambda wildcards: get_dna_fai(wildcards), + annotation=lambda wildcards: get_gtf(wildcards), output: records="reference/sequences/{species}.{build}.{release}.transcripts.fasta", threads: 1 @@ -30,27 +15,15 @@ rule fair_genome_indexer_gffread_transcripts: benchmark: "benchmark/fair_genome_indexer/gffread_transcripts/{species}.{build}.{release}.transcripts.tsv" params: - extra=dlookup( - dpath="params/fair_genome_indexer/gffread", within=config, default="" - ), + extra=lookup_config(dpath="params/fair_genome_indexer/gffread", default=""), wrapper: f"{snakemake_wrappers_prefix}/bio/gffread" use rule fair_genome_indexer_gffread_transcripts as fair_genome_indexer_gffread_cdna with: input: - fasta=dlookup( - query="species == '{species}' & build == '{build}' & release == '{release}'", - within=genomes, - key="dna_fasta", - default="reference/sequences/{species}.{build}.{release}.dna.fasta", - ), - fai=dlookup( - query="species == '{species}' & build == '{build}' & release == '{release}'", - within=genomes, - key="dna_fai", - default="reference/sequences/{species}.{build}.{release}.dna.fasta.fai", - ), + fasta=lambda wildcards: get_dna_fasta(wildcards), + fai=lambda wildcards: get_dna_fai(wildcards), 
annotation="tmp/fair_genome_indexer/agat_sp_filter_feature_by_attribute_value_cdna/{species}.{build}.{release}.cdna.gtf", output: records="reference/sequences/{species}.{build}.{release}.cdna.fasta", diff --git a/workflow/rules/picard_dict.smk b/workflow/rules/picard_dict.smk index 4b4a491..831a7c2 100644 --- a/workflow/rules/picard_dict.smk +++ b/workflow/rules/picard_dict.smk @@ -13,9 +13,8 @@ rule fair_genome_indexer_picard_create_dict: benchmark: "benchmark/fair_genome_indexer/picard_create_dict/{species}.{build}.{release}.{datatype}.tsv" params: - extra=dlookup( + extra=lookup_config( dpath="params/fair_genome_indexer/picard/createsequencedictionary", - within=config, default="", ), wrapper: diff --git a/workflow/rules/pyfaidx.smk b/workflow/rules/pyfaidx.smk index e2f1ffc..a63e78b 100644 --- a/workflow/rules/pyfaidx.smk +++ b/workflow/rules/pyfaidx.smk @@ -19,7 +19,10 @@ rule fair_genome_indexer_pyfaidx_filter_out_noncanonical_chromosomes: benchmark: "benchmark/fair_genome_indexer/pyfaidx_filter_out_noncanonical_chromosomes/{species}.{build}.{release}.{datatype}.tsv" params: - extra=lambda w: config.get("params", {}).get("pyfaidx", {}).get(w.datatype, ""), + extra=lambda wildcards: lookup_config( + dpath=f"params/fair_genome_indexer/pyfaidx/{wildcards.datatype}", + default="", + ), conda: "../envs/pyfaidx.yaml" script: @@ -45,9 +48,8 @@ rule fair_genome_indexer_rsync_make_fasta_available: benchmark: "benchmark/fair_genome_indexer/rsync_make_fasta_available/{species}.{build}.{release}.dna.fasta.tsv" params: - extra=dlookup( + extra=lookup_config( dpath="params/fair_genome_indexer/rsync", - within=config, default="--verbose --checksum --force --human-readable --progress", ), conda: diff --git a/workflow/rules/pyroe.smk b/workflow/rules/pyroe.smk index d0027ce..70087a9 100644 --- a/workflow/rules/pyroe.smk +++ b/workflow/rules/pyroe.smk @@ -1,11 +1,6 @@ rule fair_genome_indexer_pyroe_id_to_name: input: - dlookup( - query="species == '{species} & release == 
'{release}' & build == '{build}'", - within=genomes, - key="gtf", - default="reference/annotation/{species}.{build}.{release}.gtf", - ), + lambda wildcards: get_gtf(wildcards), output: "reference/annotation/{species}.{build}.{release}.id_to_gene.tsv", threads: 1 @@ -18,9 +13,8 @@ rule fair_genome_indexer_pyroe_id_to_name: benchmark: "benchmark/fair_genome_indexer/pyroe_id_to_name/{species}.{build}.{release}.tsv" params: - extra=dlookup( + extra=lookup_config( dpath="params/fair_genome_indexer/pyroe/idtoname", - within=config, default="", ), wrapper: diff --git a/workflow/rules/samtools_faidx.smk b/workflow/rules/samtools_faidx.smk index 4168103..8e8de50 100644 --- a/workflow/rules/samtools_faidx.smk +++ b/workflow/rules/samtools_faidx.smk @@ -13,9 +13,8 @@ rule fair_genome_indexer_samtools_index: benchmark: "benchmark/fair_genome_indexer/samtools_index/{species}.{build}.{release}.{datatype}.tsv" params: - extra=dlookup( + extra=lookup_config( dpath="params/fair_genome_indexer/samtools/faidx", - within=config, default="", ), wrapper: diff --git a/workflow/rules/tabix.smk b/workflow/rules/tabix.smk index 13d5677..7a1c325 100644 --- a/workflow/rules/tabix.smk +++ b/workflow/rules/tabix.smk @@ -13,9 +13,7 @@ rule fair_genome_indexer_tabix_index_dbsnp: benchmark: "benchmark/fair_genome_indexer/tabix/index/{species}.{build}.{release}.all.tsv" params: - extra=dlookup( - dpath="params/fair_genome_indexer/tabix", within=config, default="-p vcf" - ), + extra=lookup_config(dpath="params/fair_genome_indexer/tabix", default="-p vcf"), wrapper: f"{snakemake_wrappers_prefix}/bio/tabix/index" diff --git a/workflow/rules/transcript_to_gene.smk b/workflow/rules/transcript_to_gene.smk index e99d9d2..ba9232e 100644 --- a/workflow/rules/transcript_to_gene.smk +++ b/workflow/rules/transcript_to_gene.smk @@ -1,11 +1,6 @@ rule fair_genome_indexer_agat_convert_sp_gff2tsv: input: - gtf=dlookup( - query="species == '{species} & release == '{release}' & build == '{build}'", - within=genomes, 
- key="gtf", - default="reference/annotation/{species}.{build}.{release}.gtf", - ), + gtf=lambda wildcards: get_gtf(wildcards), config="tmp/fair_genome_indexer/agat_config/config.yaml", output: tsv=temp("tmp/fair_genome_indexer/agat/{species}.{build}.{release}.t2g.tsv"), @@ -21,9 +16,8 @@ rule fair_genome_indexer_agat_convert_sp_gff2tsv: benchmark: "benchmark/fair_genome_indexer/agat/agat_convert_sp_gff2tsv/{species}.{build}.{release}.tsv" params: - extra=dlookup( + extra=lookup_config( dpath="params/fair_genome_indexer/agat/agat_convert_sp_gff2tsv", - within=config, default="", ), conda: @@ -48,7 +42,7 @@ rule fair_genome_indexer_xsv_select_t2g_columns: "benchmark/fair_genome_indexer/xsv/select_columns/{species}.{build}.{release}.tsv" params: subcommand="select", - extra=dlookup( + extra=lookup_config( dpath="params/fair_genome_indexer/xsv/select_t2g_columns", default="transcript_id,gene_id,gene_name", ), @@ -67,8 +61,7 @@ use rule fair_genome_indexer_xsv_select_t2g_columns as fair_genome_indexer_xsv_f "benchmark/fair_genome_indexer/xsv/fmt/{species}.{build}.{release}.tsv" params: subcommand="fmt", - extra=dlookup( + extra=lookup_config( dpath="params/fair_genome_indexer/xsv/fmt_t2g", - within=config, default="--out-delimiter $'\t'", ), diff --git a/workflow/rules/ucsc.smk b/workflow/rules/ucsc.smk index f237c18..8716e8b 100644 --- a/workflow/rules/ucsc.smk +++ b/workflow/rules/ucsc.smk @@ -1,11 +1,6 @@ rule fair_genome_indexer_ucsc_gtf_to_genepred: input: - dlookup( - query="species == '{species} & release == '{release}' & build == '{build}'", - within=genomes, - key="gtf", - default="reference/annotation/{species}.{build}.{release}.gtf", - ), + lambda wildcards: get_gtf(wildcards), output: "reference/annotation/{species}.{build}.{release}.genePred", threads: 1 @@ -18,9 +13,8 @@ rule fair_genome_indexer_ucsc_gtf_to_genepred: benchmark: "benchmark/fair_genome_indexer/ucsc_gtf_to_genepred/{species}.{build}.{release}.tsv" params: - extra=dlookup( + 
extra=lookup_config( dpath="params/fair_genome_indexer/ucsc/gtf2genepred", - within=config, default="", ), wrapper: