Skip to content

Commit

Permalink
Merge pull request #66 from bioinfo-chru-strasbourg/add_genome_download
Browse files Browse the repository at this point in the history
add genomes download #3, docs #4 fixes and cleaning
  • Loading branch information
antonylebechec authored Jun 22, 2023
2 parents c8c7cb4 + dc47e92 commit be919a7
Show file tree
Hide file tree
Showing 15 changed files with 462 additions and 38 deletions.
7 changes: 6 additions & 1 deletion .env
Original file line number Diff line number Diff line change
Expand Up @@ -84,6 +84,7 @@ DOCKER_HOWARD_SETUP_IMAGE=${DOCKER_HOWARD_IMAGE}
DOCKER_HOWARD_SETUP_CONTAINER_NAME=HOWARD-setup

### Databases folders
DOCKER_HOWARD_SETUP_GENOMES=${DOCKER_HOWARD_CLI_FOLDER_INNER_DATABASES}/genomes/current
DOCKER_HOWARD_SETUP_ANNOVAR_DATABASES=${DOCKER_HOWARD_CLI_FOLDER_INNER_DATABASES}/annovar/current
DOCKER_HOWARD_SETUP_SNPEFF_DATABASES=${DOCKER_HOWARD_CLI_FOLDER_INNER_DATABASES}/snpeff/current

Expand All @@ -92,12 +93,16 @@ DOCKER_HOWARD_SETUP_SNPEFF_DATABASES=${DOCKER_HOWARD_CLI_FOLDER_INNER_DATABASES}
# Assembly
DOCKER_HOWARD_SETUP_ASSEMBLY=hg19,hg38

# Genomes
DOCKER_HOWARD_SETUP_GENOMES_PROVIDER="UCSC"
DOCKER_HOWARD_SETUP_GENOMES_CONTIG_REGEX="'^>chr[0-9XYM]*$'"

# Annovar Annotation databases
DOCKER_HOWARD_SETUP_ANNOTATION="refGene,gnomad_exome,dbnsfp42a,cosmic70,clinvar_202*,nci60"

### setup command
# Download genomes, Annovar and snpEff databases
DOCKER_HOWARD_SETUP_CONTAINER_COMMAND='-c "howard --config=/tool/config/config.json download --assembly=${DOCKER_HOWARD_SETUP_ASSEMBLY} --download-annovar=${DOCKER_HOWARD_SETUP_ANNOVAR_DATABASES} --download-annovar-files=${DOCKER_HOWARD_SETUP_ANNOTATION} --download-snpeff=${DOCKER_HOWARD_SETUP_SNPEFF_DATABASES} "'
DOCKER_HOWARD_SETUP_CONTAINER_COMMAND='-c "howard --config=/tool/config/config.json databases --assembly=${DOCKER_HOWARD_SETUP_ASSEMBLY} --download-genomes=${DOCKER_HOWARD_SETUP_GENOMES} --download-genomes-provider=${DOCKER_HOWARD_SETUP_GENOMES_PROVIDER} --download-genomes-contig-regex=${DOCKER_HOWARD_SETUP_GENOMES_CONTIG_REGEX} --download-annovar=${DOCKER_HOWARD_SETUP_ANNOVAR_DATABASES} --download-annovar-files=${DOCKER_HOWARD_SETUP_ANNOTATION} --download-snpeff=${DOCKER_HOWARD_SETUP_SNPEFF_DATABASES} "'


#########
Expand Down
13 changes: 11 additions & 2 deletions howard/commons.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@
import zipfile
import gzip
import requests
import genomepy


file_folder = os.path.dirname(__file__)
Expand Down Expand Up @@ -67,12 +68,20 @@
"\t".join(vcf_required_columns)
]

default_snpeff_bin = "/tools/snpeff/5.1d/bin/snpEff.jar"
# Tools
DEFAULT_SNPEFF_BIN = "/tools/snpeff/5.1d/bin/snpEff.jar"

default_annovar_url = "http://www.openbioinformatics.org/annovar/download/"
# URL
DEFAULT_ANNOVAR_URL = "http://www.openbioinformatics.org/annovar/download/"

# Databases default folder
DEFAULT_DATABASE_FOLDER = "/databases"
DEFAULT_GENOME_FOLDER = f"{DEFAULT_DATABASE_FOLDER}/genomes/current"

# DuckDB extension
DUCKDB_EXTENSION = f"{file_folder}/duckdb_extension"

# Variables
MACHIN_LIST = {
"amd64": "amd64",
"arm64": "arm64"
Expand Down
11 changes: 10 additions & 1 deletion howard/tools/annotation.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,16 @@



def annotation(args) -> None:
def annotation(args:argparse) -> None:
"""
The `annotation` function performs annotation on a VCF file based on specified parameters and
exports the annotated data.
:param args: The `args` parameter is likely an object or dictionary containing various arguments
passed to the `annotation` function. It is not clear from the code snippet what specific arguments
are expected or required
:type args: argparse
"""

log.info("Start")

Expand Down
9 changes: 8 additions & 1 deletion howard/tools/calculation.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,14 @@



def calculation(args) -> None:
def calculation(args:argparse) -> None:
"""
This function performs calculations on VCF data based on user input and exports the results.
:param args: The `args` parameter is a command line argument parser object that contains the
arguments passed to the script when it was executed
:type args: argparse
"""

log.info("Start")

Expand Down
11 changes: 10 additions & 1 deletion howard/tools/convert.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,16 @@



def convert(args) -> None:
def convert(args:argparse) -> None:
"""
The `convert` function converts a VCF file to a different format and can optionally explode info
fields.
:param args: `args` is a parameter passed to the `convert` function, likely an object or dictionary
containing various arguments needed for the function to perform its task. These arguments could
include things like input and output file paths, configuration settings, and other parameters
:type args: argparse
"""

log.info("Start")

Expand Down
118 changes: 112 additions & 6 deletions howard/tools/databases.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,16 @@
from howard.commons import *


def databases(args) -> None:
def databases(args:argparse) -> None:
"""
The function downloads databases and logs the start and end of the process.
:param args: The "args" parameter holds the parsed command-line options of the "databases" command,
such as which genomes, Annovar and snpEff databases to download and the folders to store them in
:type args: argparse
"""

log.info("Start")

Expand All @@ -39,13 +48,34 @@ def databases(args) -> None:
log.info("End")


def databases_download(args) -> None:
def databases_download(args:argparse) -> None:
"""
The `databases_download` function downloads genome, Annovar, and snpEff databases based on
user-specified arguments.
:param args: The `args` parameter is an object of the `argparse` module that contains the input
arguments for the `databases_download` function. These arguments are used to determine which genome,
Annovar, and snpEff databases to download
:type args: argparse
"""

log.debug(f"Args {args}")

# Assembly
assemblies = [value for value in args.assembly.split(',')]

# Genomes
if args.download_genomes:
log.debug(f"Download Genomes")
if assemblies:
databases_download_genomes(
assemblies=assemblies,
genome_folder=args.download_genomes,
provider=args.download_genomes_provider,
contig_regex=args.download_genomes_contig_regex
)


# Annovar
if args.download_annovar:
log.debug(f"Download Annovar databases")
Expand All @@ -71,6 +101,22 @@ def databases_download(args) -> None:


def databases_download_annovar(folder:str = None, files:list = None, assemblies:list = ["hg19"], annovar_url:str = "http://www.openbioinformatics.org/annovar/download/") -> None:
"""
This function downloads and extracts Annovar databases for specified assemblies and files.
:param folder: The folder where the Annovar databases will be downloaded to
:type folder: str
:param files: The `files` parameter is a list of specific Annovar database files to download. If not
provided, only the mandatory files will be downloaded. If set to "ALL", all available files will be
downloaded
:type files: list
:param assemblies: A list of genome assemblies for which Annovar databases will be downloaded.
Default is ["hg19"]
:type assemblies: list
:param annovar_url: The URL where Annovar databases can be downloaded from, defaults to
http://www.openbioinformatics.org/annovar/download/
:type annovar_url: str (optional)
"""

log.info(f"Download Annovar databases {assemblies}")

Expand Down Expand Up @@ -188,10 +234,23 @@ def databases_download_annovar(folder:str = None, files:list = None, assemblies:
log.debug(f"Extract file {file} to {folder}...")
extract_file(file_path)
else:
log.info(f"Download Annovar databases {[assembly]} already exists")
log.info(f"Download Annovar databases {[assembly]} - already exists")


def databases_download_snpeff(folder:str = None, assemblies:list = ["hg19"], config:dict = {}) -> None:
"""
This function downloads and extracts snpEff databases for specified genome assemblies.
:param folder: The folder where the snpEff databases will be downloaded and stored
:type folder: str
:param assemblies: The assemblies parameter is a list of genome assemblies for which the snpEff
databases need to be downloaded
:type assemblies: list
:param config: The `config` parameter is a dictionary that contains information about the tools and
their configurations. It is used to retrieve the path to the Java binary and the path to the snpEff
binary
:type config: dict
"""

log.info(f"Download snpEff databases {assemblies}")

Expand All @@ -202,7 +261,7 @@ def databases_download_snpeff(folder:str = None, assemblies:list = ["hg19"], con
# database list
snpeff_databases_list = "snpeff_databases.list"
snpeff_databases_list_path = os.path.join(folder,snpeff_databases_list)

# create folder if not exists
if folder:
if not os.path.exists(folder):
Expand Down Expand Up @@ -236,7 +295,7 @@ def databases_download_snpeff(folder:str = None, assemblies:list = ["hg19"], con

# Start download
log.info(f"Download snpEff databases for assembly '{assembly}'...")

#print(snpeff_list_databases.keys())
# Try to download files
file_path = None
for file_url in snpeff_list_databases[assembly]:
Expand Down Expand Up @@ -265,5 +324,52 @@ def databases_download_snpeff(folder:str = None, assemblies:list = ["hg19"], con

else:

log.info(f"Database snpEff databases {[assembly]} already exists")
log.info(f"Download snpEff databases {[assembly]} - already exists")


def databases_download_genomes(assemblies: list, genome_folder: str = None, provider:str = "UCSC", contig_regex:str = None, threads:int = 1) -> None:
    """
    Install, through the genomepy package, every requested genome assembly that is not already
    listed as installed in the genome folder.
    :param assemblies: names of the genome assemblies to install (e.g. "hg19")
    :type assemblies: list
    :param genome_folder: folder in which genomes are looked up and installed. When empty or None,
    DEFAULT_GENOME_FOLDER is used instead
    :type genome_folder: str
    :param provider: name of the genome provider handed to genomepy, defaults to UCSC
    :type provider: str (optional)
    :param contig_regex: regular expression forwarded as genomepy's `regex` option to select the
    contigs to keep. When None, genomepy's own default applies (presumably no filtering - to be
    confirmed against the genomepy documentation)
    :type contig_regex: str
    :param threads: number of threads forwarded to genomepy for the installation, defaults to 1
    :type threads: int (optional)
    :return: None
    """

    log.info(f"Download Genomes {assemblies}")

    # Fall back to the default genome folder when none is provided
    target_folder = genome_folder or DEFAULT_GENOME_FOLDER

    # Genomes already present locally; a missing folder means nothing is installed yet
    already_installed = (
        genomepy.list_installed_genomes(genomes_dir=target_folder)
        if os.path.exists(target_folder)
        else []
    )

    # Options shared by every installation call
    install_options = {
        "annotation": False,
        "provider": provider,
        "genomes_dir": target_folder,
        "threads": threads,
        "regex": contig_regex,
    }

    for name in assemblies:
        if name not in already_installed:
            log.info(f"Download Genomes '{[name]}' downloading...")
            genomepy.install_genome(name, **install_options)
            continue
        log.info(f"Download Genomes '{[name]}' - already exists")

9 changes: 8 additions & 1 deletion howard/tools/from_annovar.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,14 @@



def from_annovar(args) -> None:
def from_annovar(args:argparse) -> None:
"""
This function converts an Annovar database to a VCF and Parquet format.
:param args: `args` is an object whose attributes represent the input parameters for the
function
:type args: argparse
"""

log.info("Start")

Expand Down
11 changes: 10 additions & 1 deletion howard/tools/prioritization.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,16 @@



def prioritization(args) -> None:
def prioritization(args:argparse) -> None:
"""
The function performs prioritization on a VCF file based on user-specified configurations and
exports the results.
:param args: args is an object that contains the command line arguments passed to the script. It is
used to configure the behavior of the script and to provide input and output file paths, as well as
other parameters needed for the execution of the script
:type args: argparse
"""

log.info("Start")

Expand Down
12 changes: 11 additions & 1 deletion howard/tools/process.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,17 @@



def process(args) -> None:
def process(args:argparse) -> None:
"""
The "process" function processes input arguments, loads parameters in JSON format, creates a VCF
object, performs quick annotations, calculations, prioritizations, and queries, exports output, and
closes the connection.
:param args: args is a variable that contains the arguments passed to the function "process". It is
assumed to be an object with several attributes, including "config", "param", "input", "output",
"annotations", "calculations", "prioritizations", and "query", which drive the steps described above
:type args: argparse
"""

log.info("Start")

Expand Down
10 changes: 9 additions & 1 deletion howard/tools/query.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,15 @@



def query(args) -> None:
def query(args:argparse) -> None:
"""
This Python function loads and queries data from a VCF file based on user input and exports the
results.
:param args: args is an object that contains the arguments passed to the function. It is likely a
Namespace object created by parsing command line arguments using argparse
:type args: argparse
"""

log.info("Start")

Expand Down
12 changes: 11 additions & 1 deletion howard/tools/stats.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,17 @@



def stats(args) -> None:
def stats(args:argparse) -> None:
"""
The stats() function takes in arguments, loads data from an input file, gets statistics on the data,
and closes the connection.
:param args: args is a parameter that is passed to the function stats(). It is likely an object or a
dictionary that contains various arguments or parameters that are needed by the function to perform
its tasks. Some of the arguments that may be included in args are input file path, configuration
settings, and other parameters
:type args: argparse
"""

log.info("Start")

Expand Down
Loading

0 comments on commit be919a7

Please sign in to comment.