Skip to content

Commit

Permalink
3.8.0
Browse files Browse the repository at this point in the history
  • Loading branch information
tdayris committed Jul 9, 2024
1 parent 7f89506 commit b743f27
Show file tree
Hide file tree
Showing 12 changed files with 255 additions and 25 deletions.
2 changes: 2 additions & 0 deletions .test/makefile
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,9 @@ snakefiles := \
"../workflow/rules/picard_dict.smk" \
"../workflow/rules/pyfaidx.smk" \
"../workflow/rules/pyroe.smk" \
"../workflow/rules/salmon_index.smk" \
"../workflow/rules/samtools_faidx.smk" \
"../workflow/rules/star_index.smk" \
"../workflow/rules/tabix.smk" \
"../workflow/rules/transcript_to_gene.smk" \
"../workflow/rules/ucsc.smk" \
Expand Down
9 changes: 9 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,12 @@
# 3.8.0

## Features:

* STAR indexes
* Salmon index
* Genepred file formats
* Re-use of indexed files

# 3.7.0

## Fix:
Expand Down
2 changes: 2 additions & 0 deletions workflow/Snakefile
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,9 @@ include: "rules/gffread.smk"
include: "rules/picard_dict.smk"
include: "rules/pyfaidx.smk"
include: "rules/pyroe.smk"
include: "rules/salmon_index.smk"
include: "rules/samtools_faidx.smk"
include: "rules/star_index.smk"
include: "rules/tabix.smk"
include: "rules/transcript_to_gene.smk"
include: "rules/ucsc.smk"
Expand Down
40 changes: 40 additions & 0 deletions workflow/rules/common.smk
Original file line number Diff line number Diff line change
Expand Up @@ -288,6 +288,20 @@ def get_gff(
return lookup_genomes(wildcards, key="gff3", default=default, genomes=genomes)


def get_genepred(
    wildcards: snakemake.io.Wildcards, genomes: pandas.DataFrame = genomes
) -> str:
    """
    Return correct path to genepred file

    Parameters:
    wildcards (snakemake.io.Wildcards): Must expose `species`, `build` and `release`
    genomes   (pandas.DataFrame)      : Genome table forwarded to `lookup_genomes`

    Return (str):
    The value given by `lookup_genomes` for the key "genepred". The default
    handed to it is the in-workflow genePred path built from the wildcards
    (presumably used when the genome table holds no such entry -- see
    `lookup_genomes` for the exact fallback rule).
    """
    genome_id: str = f"{wildcards.species}.{wildcards.build}.{wildcards.release}"
    # The default must be a plain string: wrapping the expression in
    # parentheses *with a trailing comma* would silently build a 1-tuple.
    default: str = f"reference/annotation/{genome_id}/{genome_id}.genePred"
    return lookup_genomes(wildcards, key="genepred", default=default, genomes=genomes)


def used_genomes(
genomes: pandas.DataFrame = genomes, samples: pandas.DataFrame | None = None
) -> tuple[str]:
Expand Down Expand Up @@ -370,6 +384,10 @@ def get_fair_genome_indexer_target(
"reference/annotation/{genomes_property}/{genomes_property}.genePred",
genomes_property=genomes_properties,
),
"genepred_bed": expand(
"reference/annotation/{genomes_property}/{genomes_property}.genePred.bed",
genomes_property=genomes_properties,
),
"bowtie2_index": expand(
"reference/bowtie2_index/{genomes_property}.{datatype}/{genomes_property}.{datatype}{bt2_ext}",
genomes_property=genomes_properties,
Expand All @@ -383,6 +401,28 @@ def get_fair_genome_indexer_target(
".rev.2.bt2",
),
),
"star_index": expand(
"reference/star_index/{genomes_property}.{datatype}",
genomes_property=genomes_properties,
datatype=("dna", "cdna", "transcripts"),
),
"salmon_index": expand(
"reference/salmon_index/{genomes_property}/{genomes_property}/{salmon_ext}",
genomes_property=genomes_properties,
salmon_ext=(
"complete_ref_lens.bin",
"ctable.bin",
"ctg_offsets.bin",
"duplicate_clusters.tsv",
"info.json",
"mphf.bin",
"pos.bin",
"pre_indexing.log",
"rank.bin",
"refAccumLengths.bin",
"ref_indexing.log",
),
),
}

# Public blacklist are not available for all genomes
Expand Down
2 changes: 1 addition & 1 deletion workflow/rules/picard_dict.smk
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ for grch38

rule fair_genome_indexer_picard_create_dict:
input:
"reference/sequences/{species}.{build}.{release}/{species}.{build}.{release}.{datatype}.fasta",
lambda wildcards: select_fasta(wildcards),
output:
"reference/sequences/{species}.{build}.{release}/{species}.{build}.{release}.{datatype}.dict",
threads: 1
Expand Down
60 changes: 60 additions & 0 deletions workflow/rules/salmon_index.smk
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
rule fair_genome_indexer_salmon_decoy_sequences:
    # Hands the transcripts and DNA fasta files to the `bio/salmon/decoys`
    # wrapper, which is expected to write a gentrome fasta and a decoys list.
    # Both outputs are flagged temp(): they are deleted once no remaining
    # job needs them.
    input:
        transcriptome=lambda wildcards: get_transcripts_fasta(wildcards),
        genome=lambda wildcards: get_dna_fasta(wildcards),
    output:
        gentrome=temp("reference/sequences/{species}.{build}.{release}.gentrome.fasta"),
        decoys=temp("reference/sequences/{species}.{build}.{release}.decoys.txt"),
    log:
        "logs/fair_genome_indexer_salmon_decoy_sequences/{species}.{build}.{release}.log",
    benchmark:
        "benchmark/fair_genome_indexer_salmon_decoy_sequences/{species}.{build}.{release}.tsv"
    threads: 2
    resources:
        # Both reservations grow linearly with the retry number.
        runtime=lambda wildcards, attempt: attempt * 25,
        mem_mb=lambda wildcards, attempt: attempt * 512,
        tmpdir=tmp,
    wrapper:
        f"{snakemake_wrappers_prefix}/bio/salmon/decoys"


rule fair_genome_indexer_salmon_index_gentrome:
    # Runs the `bio/salmon/index` wrapper on the gentrome fasta and its decoys
    # list. Every file listed in `output` is flagged temp().
    input:
        sequences="reference/sequences/{species}.{build}.{release}.gentrome.fasta",
        decoys="reference/sequences/{species}.{build}.{release}.decoys.txt",
    output:
        temp(
            multiext(
                # The prefix ends with "/" so each "extension" below is a
                # file name inside the index directory.
                "reference/salmon_index/{species}.{build}.{release}/{species}.{build}.{release}/",
                "complete_ref_lens.bin",
                "ctable.bin",
                "ctg_offsets.bin",
                "duplicate_clusters.tsv",
                "info.json",
                "mphf.bin",
                "pos.bin",
                "pre_indexing.log",
                "rank.bin",
                "refAccumLengths.bin",
                "ref_indexing.log",
                "reflengths.bin",
                "refseq.bin",
                "seq.bin",
                "versionInfo.json",
            )
        ),
    threads: 20
    resources:
        # Reservations scale linearly with the retry number.
        mem_mb=lambda wildcards, attempt: 48 * 1024 * attempt,
        runtime=lambda wildcards, attempt: 50 * attempt,
        tmpdir=tmp,
    log:
        "logs/fair_genome_indexer_salmon_index_gentrome/{species}.{build}.{release}.log",
    benchmark:
        "benchmark/fair_genome_indexer_salmon_index_gentrome/{species}.{build}.{release}.tsv"
    params:
        # The configuration key must match the rule name exactly, otherwise
        # user-provided parameters are ignored and `default` is always used.
        extra=lookup_config(
            dpath="params/fair_genome_indexer_salmon_index_gentrome", default=""
        ),
    wrapper:
        f"{snakemake_wrappers_prefix}/bio/salmon/index"
2 changes: 1 addition & 1 deletion workflow/rules/samtools_faidx.smk
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ for grch38

rule fair_genome_indexer_samtools_index:
input:
"reference/sequences/{species}.{build}.{release}/{species}.{build}.{release}.{datatype}.fasta",
lambda wildcards: select_fasta(wildcards),
output:
"reference/sequences/{species}.{build}.{release}/{species}.{build}.{release}.{datatype}.fasta.fai",
threads: 1
Expand Down
22 changes: 22 additions & 0 deletions workflow/rules/star_index.smk
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
rule fair_genome_indexer_star_index:
    # Runs the `bio/star/index` wrapper on the fasta file selected for the
    # requested {datatype}. The output is declared as a whole directory.
    input:
        fasta=lambda wildcards: select_fasta(wildcards),
        fai=lambda wildcards: select_fai(wildcards),
    output:
        directory("reference/star_index/{species}.{build}.{release}.{datatype}"),
    log:
        "logs/fair_genome_indexer_star_index/{species}.{build}.{release}.{datatype}/index.log",
    benchmark:
        "benchmark/fair_genome_indexer_star_index/{species}.{build}.{release}.{datatype}/index.tsv"
    params:
        # Optional extra arguments read from the workflow configuration;
        # falls back to an empty string.
        extra=lookup_config(
            dpath="params/fair_genome_indexer_star_index",
            default="",
        ),
    threads: 20
    resources:
        # Fixed base reservation plus a per-attempt increment.
        runtime=lambda wildcards, attempt: 60 + 30 * attempt,
        mem_mb=lambda wildcards, attempt: 45_000 + 5_000 * attempt,
        tmpdir=tmp,
    wrapper:
        f"{snakemake_wrappers_prefix}/bio/star/index"
23 changes: 23 additions & 0 deletions workflow/rules/ucsc.smk
Original file line number Diff line number Diff line change
Expand Up @@ -31,3 +31,26 @@ rule fair_genome_indexer_ucsc_gtf_to_genepred:
),
wrapper:
f"{snakemake_wrappers_prefix}/bio/ucsc/gtfToGenePred"


rule fair_genome_indexer_ucsc_genepred_to_bed:
    # Runs the `bio/ucsc/genePredToBed` wrapper on the genePred file returned
    # by `get_genepred` and writes the result next to it with a `.bed` suffix.
    input:
        lambda wildcards: get_genepred(wildcards),
    output:
        "reference/annotation/{species}.{build}.{release}/{species}.{build}.{release}.genePred.bed",
    threads: 1
    resources:
        # 1 000 MB base plus 500 MB per attempt. Multiplying the two terms
        # instead would reserve 500 000 MB on the very first attempt, which
        # most schedulers cannot satisfy for a single-threaded converter.
        mem_mb=lambda wildcards, attempt: 1_000 + (500 * attempt),
        runtime=lambda wildcards, attempt: attempt * 10,
        tmpdir=tmp,
    log:
        "logs/fair_genome_indexer_ucsc_genepred_to_bed/{species}.{build}.{release}.log",
    benchmark:
        "benchmark/fair_genome_indexer_ucsc_genepred_to_bed/{species}.{build}.{release}.tsv"
    params:
        # Optional extra arguments read from the workflow configuration;
        # falls back to an empty string.
        extra=lookup_config(
            dpath="params/fair_genome_indexer_ucsc_genepred_to_bed",
            default="",
        ),
    wrapper:
        f"{snakemake_wrappers_prefix}/bio/ucsc/genePredToBed"
58 changes: 35 additions & 23 deletions workflow/schemas/config.schema.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -13,98 +13,110 @@ params:
type: object
description: Optional parameters

fair_genome_indexer_star_index:
type: string
description: Optional parameters for STAR index

fair_genome_indexer_bowtie2_index:
type: string
description: Optional parameters for bowtie2 build

fair_genome_indexer_ucsc_genepred_to_bed:
type: string
description: Optional parameters for ucsc genePredToBed

fair_genome_indexer_ucsc_gtf_to_genepred:
type: str
type: string
description: Optional parameters for ucsc gtfToGenePred

fair_genome_indexer_agat_convert_sp_gff2tsv:
type: str
type: string
description: Optional parameters for agat_convert_sp_gff2tsv.pl

fair_genome_indexer_xsv_select_t2g_columns:
type: str
type: string
description: Optional parameters for xsv select

fair_genome_indexer_xsv_fmt_t2g:
type: str
type: string
description: Optional parameters for xsv fmt

fair_genome_indexer_tabix_index_dbsnp:
type: str
type: string
description: Optional parameters for tabix index

fair_genome_indexer_tabix_index_dbsnp_index_raw_dbsnp:
type: str
type: string
description: Optional parameters for tabix index

fair_genome_indexer_samtools_index:
type: str
type: string
description: Optional parameters for samtools faidx

fair_genome_indexer_pyroe_id_to_name:
type: str
type: string
description: Optional parameters for pyroe idtoname

fair_genome_indexer_rsync_make_fasta_available:
type: str
type: string
description: Optional parameters for rsync

fair_genome_indexer_pyfaidx_filter_out_noncanonical_chromosomes:
type: object
description: pyfaidx optional parameters for each subtype of sequences

dna:
type: str
type: string
description: pyfaidx optional parameters for DNA sequences

cdna:
type: str
type: string
description: pyfaidx optional parameters for cDNA sequences

transcripts:
type: str
type: string
description: pyfaidx optional parameters for transcripts DNA sequences

fair_genome_indexer_picard_create_dict:
type: str
type: string
description: Picard CreateSequenceDictionary optional parameters

fair_genome_indexer_gffread_transcripts:
type: str
type: string
description: GffRead optional parameters for transcripts extraction

fair_genome_indexer_gffread_cdna:
type: str
type: string
description: GffRead optional parameters for cDNA extraction

fair_genome_indexer_wget:
type: str
type: string
description: Optional parameters for wget

fair_genome_indexer_bedtools_merge_blacklist:
type: str
type: string
description: Optional parameters for bedtools merge

fair_genome_indexer_bcftools_filter_non_canonical_chrom:
type: str
type: string
description: Optional parameters for bcftools filter

fair_genome_indexer_pyfaidx_fasta_dict_to_bed:
type: str
type: string
description: Optional parameters for pyfaidx

fair_genome_indexer_agat_sq_filter_feature_from_fasta:
type: str
type: string
description: Optional parameters for agat_sq_filter_feature_from_fasta.pl

fair_genome_indexer_agat_sp_filter_feature_by_attribute_value:
type: str
type: string
description: Optional parameters for agat_sp_filter_feature_by_attribute_value.pl

fair_genome_indexer_agat_convert_sp_gff2gtf:
type: str
type: string
description: Optional parameters for agat_convert_sp_gff2gtf.pl

fair_genome_indexer_agat_config:
type: object
description: Mapping of all agat parameters
description: Mapping of all agat parameters
32 changes: 32 additions & 0 deletions workflow/schemas/genomes.schema.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,38 @@ genepred:
type: string
description: Gene model in genepred format (bed6-like)

genepred_bed:
type: string
description: Gene model in bed format (bed6-like)

bowtie2_dna_index:
type: string
description: Bowtie2 index for DNA sequences

bowtie2_cdna_index:
type: string
description: Bowtie2 index for cDNA sequences

bowtie2_transcripts_index:
type: string
description: Bowtie2 index for cDNA sequences including non-coding

star_dna_index:
type: string
description: STAR index for DNA sequences

star_cdna_index:
type: string
description: STAR index for cDNA sequences

star_transcript_index:
type: string
description: STAR index for cDNA sequences including non-coding

salmon_index:
type: string
description: Decoy aware gentrome index

required:
- species
- build
Expand Down
Loading

0 comments on commit b743f27

Please sign in to comment.