Merge pull request #66 from bioinfo-chru-strasbourg/add_genome_download

add genomes download #3, docs #4 fixes and cleaning
bioinfo-chru-strasbourg · Jun 22, 2023 · be919a7 · be919a7
2 parents c8c7cb4 + dc47e92
commit be919a7
Show file tree

Hide file tree

Showing 15 changed files with 462 additions and 38 deletions.
diff --git a/.env b/.env
@@ -84,6 +84,7 @@ DOCKER_HOWARD_SETUP_IMAGE=${DOCKER_HOWARD_IMAGE}
 DOCKER_HOWARD_SETUP_CONTAINER_NAME=HOWARD-setup
 
 ### Databases folders
+DOCKER_HOWARD_SETUP_GENOMES=${DOCKER_HOWARD_CLI_FOLDER_INNER_DATABASES}/genomes/current
 DOCKER_HOWARD_SETUP_ANNOVAR_DATABASES=${DOCKER_HOWARD_CLI_FOLDER_INNER_DATABASES}/annovar/current
 DOCKER_HOWARD_SETUP_SNPEFF_DATABASES=${DOCKER_HOWARD_CLI_FOLDER_INNER_DATABASES}/snpeff/current
 
@@ -92,12 +93,16 @@ DOCKER_HOWARD_SETUP_SNPEFF_DATABASES=${DOCKER_HOWARD_CLI_FOLDER_INNER_DATABASES}
 # Assembly
 DOCKER_HOWARD_SETUP_ASSEMBLY=hg19,hg38
 
+# Genomes
+DOCKER_HOWARD_SETUP_GENOMES_PROVIDER="UCSC"
+DOCKER_HOWARD_SETUP_GENOMES_CONTIG_REGEX="'^>chr[0-9XYM]*$'"
+
 # Annovar Annotation databases
 DOCKER_HOWARD_SETUP_ANNOTATION="refGene,gnomad_exome,dbnsfp42a,cosmic70,clinvar_202*,nci60"
 
 ### setup command
 # Download Annovar and snpEff databases
-DOCKER_HOWARD_SETUP_CONTAINER_COMMAND='-c "howard --config=/tool/config/config.json download --assembly=${DOCKER_HOWARD_SETUP_ASSEMBLY} --download-annovar=${DOCKER_HOWARD_SETUP_ANNOVAR_DATABASES} --download-annovar-files=${DOCKER_HOWARD_SETUP_ANNOTATION} --download-snpeff=${DOCKER_HOWARD_SETUP_SNPEFF_DATABASES} "'
+DOCKER_HOWARD_SETUP_CONTAINER_COMMAND='-c "howard --config=/tool/config/config.json databases --assembly=${DOCKER_HOWARD_SETUP_ASSEMBLY} --download-genomes=${DOCKER_HOWARD_SETUP_GENOMES} --download-genomes-provider=${DOCKER_HOWARD_SETUP_GENOMES_PROVIDER} --download-genomes-contig-regex=${DOCKER_HOWARD_SETUP_GENOMES_CONTIG_REGEX} --download-annovar=${DOCKER_HOWARD_SETUP_ANNOVAR_DATABASES} --download-annovar-files=${DOCKER_HOWARD_SETUP_ANNOTATION} --download-snpeff=${DOCKER_HOWARD_SETUP_SNPEFF_DATABASES} "'
 
 
 #########

diff --git a/howard/commons.py b/howard/commons.py
@@ -21,6 +21,7 @@
 import zipfile
 import gzip
 import requests
+import genomepy
 
 
 file_folder = os.path.dirname(__file__)
@@ -67,12 +68,20 @@
             "\t".join(vcf_required_columns)
             ]
 
-default_snpeff_bin = "/tools/snpeff/5.1d/bin/snpEff.jar"
+# Tools
+DEFAULT_SNPEFF_BIN = "/tools/snpeff/5.1d/bin/snpEff.jar"
 
-default_annovar_url = "http://www.openbioinformatics.org/annovar/download/"
+# URL
+DEFAULT_ANNOVAR_URL = "http://www.openbioinformatics.org/annovar/download/"
 
+# Databases default folder
+DEFAULT_DATABASE_FOLDER = "/databases"
+DEFAULT_GENOME_FOLDER = f"{DEFAULT_DATABASE_FOLDER}/genomes/current"
+
+# DuckDB extension
 DUCKDB_EXTENSION = f"{file_folder}/duckdb_extension"
 
+# Variables
 MACHIN_LIST = {
     "amd64": "amd64",
     "arm64": "arm64"

diff --git a/howard/tools/annotation.py b/howard/tools/annotation.py
@@ -23,7 +23,16 @@
 
 
 
-def annotation(args) -> None:
+def annotation(args:argparse) -> None:
+    """
+    The `annotation` function performs annotation on a VCF file based on specified parameters and
+    exports the annotated data.
+    
+    :param args: The `args` parameter is likely an object or dictionary containing various arguments
+    passed to the `annotation` function. It is not clear from the code snippet what specific arguments
+    are expected or required
+    :type args: argparse
+    """
 
     log.info("Start")
 

diff --git a/howard/tools/calculation.py b/howard/tools/calculation.py
@@ -23,7 +23,14 @@
 
 
 
-def calculation(args) -> None:
+def calculation(args:argparse) -> None:
+    """
+    This function performs calculations on VCF data based on user input and exports the results.
+    
+    :param args: The `args` parameter is a command line argument parser object that contains the
+    arguments passed to the script when it was executed
+    :type args: argparse
+    """
 
     log.info("Start")
 

diff --git a/howard/tools/convert.py b/howard/tools/convert.py
@@ -23,7 +23,16 @@
 
 
 
-def convert(args) -> None:
+def convert(args:argparse) -> None:
+    """
+    The `convert` function converts a VCF file to a different format and can optionally explode info
+    fields.
+    
+    :param args: `args` is a parameter passed to the `convert` function, likely an object or dictionary
+    containing various arguments needed for the function to perform its task. These arguments could
+    include things like input and output file paths, configuration settings, and other parameters
+    :type args: argparse
+    """
 
     log.info("Start")
 

diff --git a/howard/tools/databases.py b/howard/tools/databases.py
@@ -30,7 +30,16 @@
 from howard.commons import *
 
 
-def databases(args) -> None:
+def databases(args:argparse) -> None:
+    """
+    The function downloads databases and logs the start and end of the process.
+    
+    :param args: The "args" parameter holds the parsed command-line arguments of the "databases"
+    command. Judging from `databases_download`, which reads them, they include the list of
+    assemblies and the download options for genomes and Annovar (`assembly`,
+    `download_genomes`, `download_annovar`)
+    :type args: argparse
+    """
 
     log.info("Start")
 
@@ -39,13 +48,34 @@ def databases(args) -> None:
     log.info("End")
 
 
-def databases_download(args) -> None:
+def databases_download(args:argparse) -> None:
+    """
+    The `databases_download` function downloads genome, Annovar, and snpEff databases based on
+    user-specified arguments.
+    
+    :param args: The `args` parameter is an object of the `argparse` module that contains the input
+    arguments for the `databases_download` function. These arguments are used to determine which genome,
+    Annovar, and snpEff databases to download
+    :type args: argparse
+    """
 
     log.debug(f"Args {args}")
 
     # Assembly
     assemblies = [value for value in args.assembly.split(',')]
 
+    # Genomes
+    if args.download_genomes:
+        log.debug(f"Download Genomes")
+        if assemblies:
+            databases_download_genomes(
+                assemblies=assemblies, 
+                genome_folder=args.download_genomes,
+                provider=args.download_genomes_provider,
+                contig_regex=args.download_genomes_contig_regex
+                )
+
+
     # Annovar
     if args.download_annovar:
         log.debug(f"Download Annovar databases")
@@ -71,6 +101,22 @@ def databases_download(args) -> None:
 
 
 def databases_download_annovar(folder:str = None, files:list = None, assemblies:list = ["hg19"], annovar_url:str = "http://www.openbioinformatics.org/annovar/download/") -> None:
+    """
+    This function downloads and extracts Annovar databases for specified assemblies and files.
+    
+    :param folder: The folder where the Annovar databases will be downloaded to
+    :type folder: str
+    :param files: The `files` parameter is a list of specific Annovar database files to download. If not
+    provided, only the mandatory files will be downloaded. If set to "ALL", all available files will be
+    downloaded
+    :type files: list
+    :param assemblies: A list of genome assemblies for which Annovar databases will be downloaded.
+    Default is ["hg19"]
+    :type assemblies: list
+    :param annovar_url: The URL where Annovar databases can be downloaded from, defaults to
+    http://www.openbioinformatics.org/annovar/download/
+    :type annovar_url: str (optional)
+    """
 
     log.info(f"Download Annovar databases {assemblies}")
 
@@ -188,10 +234,23 @@ def databases_download_annovar(folder:str = None, files:list = None, assemblies:
                     log.debug(f"Extract file {file} to {folder}...")
                     extract_file(file_path)
             else:
-                log.info(f"Download Annovar databases {[assembly]} already exists")
+                log.info(f"Download Annovar databases {[assembly]} - already exists")
 
 
 def databases_download_snpeff(folder:str = None, assemblies:list = ["hg19"], config:dict = {}) -> None:
+    """
+    This function downloads and extracts snpEff databases for specified genome assemblies.
+    
+    :param folder: The folder where the snpEff databases will be downloaded and stored
+    :type folder: str
+    :param assemblies: The assemblies parameter is a list of genome assemblies for which the snpEff
+    databases need to be downloaded
+    :type assemblies: list
+    :param config: The `config` parameter is a dictionary that contains information about the tools and
+    their configurations. It is used to retrieve the path to the Java binary and the path to the snpEff
+    binary
+    :type config: dict
+    """
 
     log.info(f"Download snpEff databases {assemblies}")
 
@@ -202,7 +261,7 @@ def databases_download_snpeff(folder:str = None, assemblies:list = ["hg19"], con
     # database list
     snpeff_databases_list = "snpeff_databases.list"
     snpeff_databases_list_path = os.path.join(folder,snpeff_databases_list)
-
+    
     # create folder if not exists
     if folder:
         if not os.path.exists(folder):
@@ -236,7 +295,7 @@ def databases_download_snpeff(folder:str = None, assemblies:list = ["hg19"], con
 
             # Strat download
             log.info(f"Download snpEff databases for assembly '{assembly}'...")
-
+            #print(snpeff_list_databases.keys())
             # Try to download files
             file_path = None
             for file_url in snpeff_list_databases[assembly]:
@@ -265,5 +324,52 @@ def databases_download_snpeff(folder:str = None, assemblies:list = ["hg19"], con
 
         else:
 
-            log.info(f"Database snpEff databases {[assembly]} already exists")
+            log.info(f"Download snpEff databases {[assembly]} - already exists")
+
+
+def databases_download_genomes(assemblies: list, genome_folder: str = None, provider:str = "UCSC", contig_regex:str = None, threads:int = 1) -> None:
+    """
+    Download genome assemblies with the genomepy package, skipping every assembly that genomepy
+    already lists as installed in the genome folder.
+    
+    :param assemblies: a list of genome assembly names to download
+    :type assemblies: list
+    :param genome_folder: The folder where the downloaded genome files will be saved. If no folder is
+    specified, `DEFAULT_GENOME_FOLDER` is used
+    :type genome_folder: str
+    :param provider: The provider parameter specifies the source of the genome data and is passed
+    to `genomepy.install_genome` as `provider`. Other possible providers could include NCBI or
+    Ensembl, defaults to UCSC
+    :type provider: str (optional)
+    :param contig_regex: Regular expression passed to `genomepy.install_genome` as `regex`,
+    presumably to keep only the contigs (chromosomes or scaffolds) whose names match it. If
+    contig_regex is not specified, `None` is passed and no pattern is given to genomepy
+    :type contig_regex: str
+    :param threads: The "threads" parameter is passed to `genomepy.install_genome` as `threads`,
+    defaults to 1
+    :type threads: int (optional)
+    :return: None is being returned.
+    """
+
+    log.info(f"Download Genomes {assemblies}")
+
+    # Fall back to the default genome folder when none (or an empty value) is given
+    if not genome_folder:
+        genome_folder = DEFAULT_GENOME_FOLDER
+
+    # List genomes already installed; a missing folder means nothing is installed yet
+    if os.path.exists(genome_folder):
+        installed_genomes = genomepy.list_installed_genomes(genomes_dir=genome_folder)
+    else:
+        installed_genomes = []
+
+    # Install only the assemblies that are not installed yet (annotation download disabled)
+    for assembly in assemblies:
+        if assembly in installed_genomes:
+            log.info(f"Download Genomes '{[assembly]}' - already exists")
+        else:
+            log.info(f"Download Genomes '{[assembly]}' downloading...")
+            genomepy.install_genome(assembly, annotation=False, provider=provider, genomes_dir=genome_folder, threads=threads, regex=contig_regex)
+
+    return None
 
diff --git a/howard/tools/from_annovar.py b/howard/tools/from_annovar.py
@@ -56,7 +56,14 @@
 
 
 
-def from_annovar(args) -> None:
+def from_annovar(args:argparse) -> None:
+    """
+    This function converts an Annovar database to a VCF and Parquet format.
+    
+    :param args: `args` is an object with several attributes representing the input parameters for the
+    function, as parsed from the command line
+    :type args: argparse
+    """
 
     log.info("Start")
 

diff --git a/howard/tools/prioritization.py b/howard/tools/prioritization.py
@@ -23,7 +23,16 @@
 
 
 
-def prioritization(args) -> None:
+def prioritization(args:argparse) -> None:
+    """
+    The function performs prioritization on a VCF file based on user-specified configurations and
+    exports the results.
+    
+    :param args: args is an object that contains the command line arguments passed to the script. It is
+    used to configure the behavior of the script and to provide input and output file paths, as well as
+    other parameters needed for the execution of the script
+    :type args: argparse
+    """
 
     log.info("Start")
 

diff --git a/howard/tools/process.py b/howard/tools/process.py
@@ -23,7 +23,17 @@
 
 
 
-def process(args) -> None:
+def process(args:argparse) -> None:
+    """
+    The "process" function processes input arguments, loads parameters in JSON format, creates a VCF
+    object, performs quick annotations, calculations, prioritizations, and queries, exports output, and
+    closes the connection.
+    
+    :param args: args is a variable that contains the arguments passed to the function "process". It is
+    assumed to be an object with several attributes, including "config", "param", "input", "output",
+    "annotations", "calculations", "prioritizations", and "query", which drive the steps above
+    :type args: argparse
+    """
 
     log.info("Start")
 

diff --git a/howard/tools/query.py b/howard/tools/query.py
@@ -23,7 +23,15 @@
 
 
 
-def query(args) -> None:
+def query(args:argparse) -> None:
+    """
+    This Python function loads and queries data from a VCF file based on user input and exports the
+    results.
+    
+    :param args: args is an object that contains the arguments passed to the function. It is likely a
+    Namespace object created by parsing command line arguments using argparse
+    :type args: argparse
+    """
 
     log.info("Start")
 

diff --git a/howard/tools/stats.py b/howard/tools/stats.py
@@ -23,7 +23,17 @@
 
 
 
-def stats(args) -> None:
+def stats(args:argparse) -> None:
+    """
+    The stats() function takes in arguments, loads data from an input file, gets statistics on the data,
+    and closes the connection.
+    
+    :param args: args is a parameter that is passed to the function stats(). It is likely an object or a
+    dictionary that contains various arguments or parameters that are needed by the function to perform
+    its tasks. Some of the arguments that may be included in args are input file path, configuration
+    settings, and other parameters needed to load the data and compute its statistics
+    :type args: argparse
+    """
 
     log.info("Start")