From f5a6aa04b4219819e0673c84a60dd6a0846a6342 Mon Sep 17 00:00:00 2001 From: antonylebechec Date: Thu, 8 Feb 2024 00:39:47 +0100 Subject: [PATCH] Add docs and help file generation #4 --- README.md | 2 + config/config.json | 1 - docs/help.html | 737 +++++++++++++++++++++++ docs/help.md | 1310 +++++++++++++++++++++++++++++++++++++++++ howard/main.py | 79 ++- howard/tools/tools.py | 108 +++- 6 files changed, 2222 insertions(+), 15 deletions(-) create mode 100644 docs/help.html create mode 100644 docs/help.md diff --git a/README.md b/README.md index 32cbcda..cd3f47b 100644 --- a/README.md +++ b/README.md @@ -329,3 +329,5 @@ howard --help ## Documentation More documentation in [docs/howard.md](docs/howard.md) + +See help documentation for all available commands and options documentation in [docs/help.md](docs/help.md) diff --git a/config/config.json b/config/config.json index a194ca8..983570e 100644 --- a/config/config.json +++ b/config/config.json @@ -8,7 +8,6 @@ "annotations": [ "/databases/annotations/current/", "/databases/dbnsfp/current/", - "/databases/alphamissense/current/", "/databases/dbsnp/current/" ], "parquet": ["/databases/annotations/current/"], diff --git a/docs/help.html b/docs/help.html new file mode 100644 index 0000000..58f323c --- /dev/null +++ b/docs/help.html @@ -0,0 +1,737 @@ +

HOWARD Help - Commands and Options

+

HOWARD Commands and Options

+

PROCESS

+

The howard process tool manages genetic variations to:
- annotate genetic variants with multiple annotation databases/files and tools
- calculate and normalize annotations
- prioritize variants with profiles (lists of criteria) to calculate scores and flags
- translate into various formats
- query genetic variants and annotations
- generate variant statistics

Usage examples:
   howard process --input=tests/data/example.vcf.gz --output=/tmp/example.annotated.vcf.gz --param=config/param.json
   howard process --input=tests/data/example.vcf.gz --annotations='snpeff' --calculations='snpeff_hgvs' --prioritizations=config/prioritization_profiles.json --explode_infos --output=/tmp/example.annotated.tsv --query='SELECT "#CHROM", POS, ALT, REF, snpeff_hgvs FROM variants'

Main options

+
--input=<input> | required
+Input file path
+Format: BCF, VCF, TSV, CSV, PSV, Parquet or duckDB
+Files can be compressed (e.g. vcf.gz, tsv.gz)
+
+
--output=<output> | required
+Output file path
+Format: BCF, VCF, TSV, CSV, PSV, Parquet or duckDB
+Files can be compressed (e.g. vcf.gz, tsv.gz)
+
+
--param=<param> ({})
+Parameters file or JSON
+Format: JSON
+Default: {}
+
+

Quick Processes

+
--annotations=<annotations>
+Annotation with database files, or with tools
+Format: list of files in Parquet, VCF, BED, or keywords
+For a Parquet/VCF/BED file, use the file path (e.g. '/path/to/file.parquet')
+To add all available databases, use the 'ALL' keyword:
+   - Use 'ALL:<types>:<releases>'
+   - e.g. 'ALL', 'ALL:parquet:current', 'ALL:parquet,vcf:devel'
+For snpEff annotation, use the keyword 'snpeff'
+For Annovar annotation, use the keyword 'annovar' with an Annovar code (e.g. 'annovar:refGene', 'annovar:cosmic70')
+
+
--calculations=<operations>
+Calculations on genetic variants information and genotype information
+Example: 'VARTYPE,barcode'
+List of available calculations (case-insensitive, see doc for more information):
+ VARTYPE  snpeff_hgvs  FINDBYPIPELINE  GENOTYPECONCORDANCE  BARCODE  TRIO  VAF  VAF_STATS  DP_STATS 
+
+
--prioritizations=<prioritisations>
+Prioritization file in JSON format (defines profiles, see doc).
+
+
--query=<query>
+Query in SQL format
+Format: SQL
+Example: 'SELECT * FROM variants LIMIT 5'
+
+
--explode_infos
+Explode VCF INFO/Tag into 'variants' table columns.
+default: False
+
+
--explode_infos_prefix=<explode infos prefix>
+Explode VCF INFO/Tag with a specific prefix.
+default: ''
+
+
--explode_infos_fields=<explode infos list> (*)
+Explode VCF INFO/Tag specific fields/tags.
+Keyword '*' specifies all available fields, except those already specified.
+Pattern (regex) can be used: '.*_score' for fields named with '_score' at the end.
+Examples:
+   - 'HGVS,SIFT,Clinvar' (list of fields)
+   - 'HGVS,*,Clinvar' (list of fields with all other fields at the end)
+   - 'HGVS,.*_score,Clinvar' (list of 2 fields with all scores in the middle)
+   - 'HGVS,.*_score,*' (1 field, scores, all other fields)
+   - 'HGVS,*,.*_score' (1 field and all other fields,
+                        scores included in other fields)
+default: '*'
+
+
--include_header
+Include header (in VCF format) in output file.
+Only for compatible formats (tab-delimited formats such as TSV or BED).
+default: False
+
+
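Additional example (a hedged sketch; the output path and the '.*_score' field pattern are illustrative, using only options documented above):
   howard process --input=tests/data/example.vcf.gz --output=/tmp/example.scores.tsv --explode_infos --explode_infos_fields='.*_score' --include_header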

ANNOTATION

+

Annotation is mainly based on a built-in Parquet annotation method, and on tools such as BCFTools, Annovar and snpEff. It uses available databases (see Annovar and snpEff) and homemade databases. Supported database formats are: Parquet, duckDB, VCF, BED, Annovar and snpEff (Annovar and snpEff databases are automatically downloaded; see the howard databases tool).

Usage examples:
   howard annotation --input=tests/data/example.vcf.gz --output=/tmp/example.howard.vcf.gz --annotations='tests/databases/annotations/hg19/avsnp150.parquet,tests/databases/annotations/hg19/dbnsfp42a.parquet,tests/databases/annotations/hg19/gnomad211_genome.parquet'
   howard annotation --input=tests/data/example.vcf.gz --output=/tmp/example.howard.tsv --assembly=hg19 --annotations='annovar:refGene,annovar:cosmic70,snpeff,tests/databases/annotations/hg19/clinvar_20210123.parquet'
   howard annotation --input=tests/data/example.vcf.gz --output=/tmp/example.howard.tsv --assembly=hg19 --annotations='ALL:parquet'

Main options

+
--input=<input> | required
+Input file path
+Format: BCF, VCF, TSV, CSV, PSV, Parquet or duckDB
+Files can be compressed (e.g. vcf.gz, tsv.gz)
+
+
--output=<output> | required
+Output file path
+Format: BCF, VCF, TSV, CSV, PSV, Parquet or duckDB
+Files can be compressed (e.g. vcf.gz, tsv.gz)
+
+
--annotations=<annotations> | required
+Annotation with database files, or with tools
+Format: list of files in Parquet, VCF, BED, or keywords
+For a Parquet/VCF/BED file, use the file path (e.g. '/path/to/file.parquet')
+To add all available databases, use the 'ALL' keyword:
+   - Use 'ALL:<types>:<releases>'
+   - e.g. 'ALL', 'ALL:parquet:current', 'ALL:parquet,vcf:devel'
+For snpEff annotation, use the keyword 'snpeff'
+For Annovar annotation, use the keyword 'annovar' with an Annovar code (e.g. 'annovar:refGene', 'annovar:cosmic70')
+
+
--assembly=<assembly> (hg19)
+Default assembly
+Default: 'hg19'
+
+
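Additional example (a hedged sketch; the output path is illustrative, and 'ALL:parquet,vcf:current' follows the documented 'ALL:<types>:<releases>' syntax):
   howard annotation --input=tests/data/example.vcf.gz --output=/tmp/example.all_annotations.vcf.gz --assembly=hg19 --annotations='ALL:parquet,vcf:current'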

CALCULATION

+

Calculation processes variant information to generate new information, such as: identify the variation type (VarType), harmonize allele frequency (VAF) and calculate statistics (VAF_stats), extract Nomen (transcript, cNomen, pNomen...) from an HGVS field (e.g. snpEff, Annovar) with an optional list of personalized transcripts, generate a VaRank format barcode, and identify trio inheritance.

Usage examples:
   howard calculation --input=tests/data/example.full.vcf --output=/tmp/example.calculation.tsv --calculations='vartype'
   howard calculation --input=tests/data/example.ann.vcf.gz --output=/tmp/example.calculated.tsv --calculations='snpeff_hgvs,NOMEN' --hgvs_field=snpeff_hgvs --transcripts=tests/data/transcripts.tsv
   howard calculation --show_calculations

Main options

+
--input=<input>
+Input file path
+Format: BCF, VCF, TSV, CSV, PSV, Parquet or duckDB
+Files can be compressed (e.g. vcf.gz, tsv.gz)
+
+
--output=<output>
+Output file path
+Format: BCF, VCF, TSV, CSV, PSV, Parquet or duckDB
+Files can be compressed (e.g. vcf.gz, tsv.gz)
+
+
--calculations=<operations>
+Calculations on genetic variants information and genotype information
+Example: 'VARTYPE,barcode'
+List of available calculations (case-insensitive, see doc for more information):
+ VARTYPE  snpeff_hgvs  FINDBYPIPELINE  GENOTYPECONCORDANCE  BARCODE  TRIO  VAF  VAF_STATS  DP_STATS 
+
+
--calculation_config=<calculation config>
+Calculation config file
+Format: JSON
+
+
--show_calculations
+Show available calculation operations
+
+

NOMEN calculation

+
--hgvs_field=<HGVS field> (hgvs)
+HGVS INFO/tag containing a list of HGVS annotations
+default: 'hgvs'
+
+
--transcripts=<transcripts>
+Transcripts file in TSV format
+Format: Transcript in first column, optional Gene in second column 
+default: None
+
+
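Example of a transcripts file (a hedged sketch; the transcript/gene pairs are illustrative, and the file follows the documented two-column TSV format):
   printf 'NM_152232\tTAS1R2\nNM_005228\tEGFR\n' > /tmp/transcripts.tsv
   howard calculation --input=tests/data/example.ann.vcf.gz --output=/tmp/example.nomen.tsv --calculations='snpeff_hgvs,NOMEN' --hgvs_field=snpeff_hgvs --transcripts=/tmp/transcripts.tsv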

TRIO calculation

+
--trio_pedigree=<trio pedigree>
+Pedigree Trio for trio inheritance calculation
+Format: JSON file or dict (e.g. 'trio.ped.json', '{"father":"sample1", "mother":"sample2", "child":"sample3"}') 
+default: None
+
+
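Example (a hedged sketch reusing the documented inline pedigree format; sample names are illustrative):
   howard calculation --input=tests/data/example.vcf.gz --output=/tmp/example.trio.tsv --calculations='TRIO' --trio_pedigree='{"father":"sample1", "mother":"sample2", "child":"sample3"}'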

HGVS

+

HGVS annotation using the HUGO HGVS international Sequence Variant Nomenclature (http://varnomen.hgvs.org/). Annotation refers to refGene and the genome to generate HGVS nomenclature for all available transcripts. This annotation adds an 'hgvs' field into the VCF INFO column of a VCF file.

Usage examples:
   howard hgvs --input=tests/data/example.full.vcf --output=/tmp/example.hgvs.vcf

Main options

+
--input=<input> | required
+Input file path
+Format: BCF, VCF, TSV, CSV, PSV, Parquet or duckDB
+Files can be compressed (e.g. vcf.gz, tsv.gz)
+
+
--output=<output>
+Output file path
+Format: BCF, VCF, TSV, CSV, PSV, Parquet or duckDB
+Files can be compressed (e.g. vcf.gz, tsv.gz)
+
+
--assembly=<assembly> (hg19)
+Default assembly
+Default: 'hg19'
+
+

HGVS

+
--use_gene
+Use Gene information to generate HGVS annotation
+Example: 'NM_152232(TAS1R2):c.231T>C'
+
+
--use_exon
+Use Exon information to generate HGVS annotation
+Only if 'use_gene' is not enabled
+Example: 'NM_152232(exon2):c.231T>C'
+
+
--use_protein
+Use Protein level to generate HGVS annotation
+Can be used with 'use_exon' or 'use_gene'
+Example: 'NP_689418:p.Cys77Arg'
+
+
--add_protein
+Add Protein level to DNA HGVS annotation
+Example: 'NM_152232:c.231T>C,NP_689418:p.Cys77Arg'
+
+
--full_format
+Generates HGVS annotation in a full format (non-standard)
+Full format uses all information to generate an exhaustive annotation.
+Use 'use_exon' specifically to add exon information.
+Example: 'TAS1R2:NM_152232:NP_689418:c.231T>C:p.Cys77Arg'
+         'TAS1R2:NM_152232:NP_689418:exon2:c.231T>C:p.Cys77Arg'
+
+
--codon_type=<Codon type> ['1', '3', 'FULL'] (3)
+Amino acid codon format type to use to generate HGVS annotation
+Available (default '3'):
+   '1': codon in 1 character (e.g. 'C', 'R')
+   '3': codon in 3 characters (e.g. 'Cys', 'Arg')
+   'FULL': codon in full name (e.g. 'Cysteine', 'Arginine')
+
+
+
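Example combining these options (a hedged sketch; the output path is illustrative, using only options documented above):
   howard hgvs --input=tests/data/example.full.vcf --output=/tmp/example.hgvs_full.vcf --use_gene --add_protein --codon_type='1'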

Databases

+
--refgene=<refGene>
+refGene annotation file
+
+
--refseqlink=<refSeqLink>
+refSeqLink annotation file
+
+
--genomes-folder=<genomes> (/databases/genomes/current)
+Folder containing genomes
+Default: /databases/genomes/current
+
+

PRIORITIZATION

+

Prioritization algorithm uses profiles to flag variants (as passed or filtered), calculate a prioritization score, and automatically generate a comment for each variant (example: 'polymorphism identified in dbSNP. associated with Lung Cancer. Found in ClinVar database'). Prioritization profiles are defined in a configuration file in JSON format. A profile is defined as a list of annotation/value pairs, using wildcards and comparison options (contains, lower than, greater than, equal...). Annotation fields may be quality values (usually from callers, such as 'DP') or other annotation fields provided by annotation tools, such as HOWARD itself (example: COSMIC, Clinvar, 1000genomes, PolyPhen, SIFT). Multiple profiles can be used simultaneously, which is useful to define multiple validation/prioritization levels (example: 'standard', 'stringent', 'rare variants', 'low allele frequency').

Usage examples:
   howard prioritization --input=tests/data/example.vcf.gz --output=/tmp/example.prioritized.vcf.gz --prioritizations=config/prioritization_profiles.json --profiles='default,GERMLINE'

Main options

+
--input=<input> | required
+Input file path
+Format: BCF, VCF, TSV, CSV, PSV, Parquet or duckDB
+Files can be compressed (e.g. vcf.gz, tsv.gz)
+
+
--output=<output> | required
+Output file path
+Format: BCF, VCF, TSV, CSV, PSV, Parquet or duckDB
+Files can be compressed (e.g. vcf.gz, tsv.gz)
+
+
--prioritizations=<prioritisations> | required
+Prioritization file in JSON format (defines profiles, see doc).
+
+

Prioritization

+
--profiles=<profiles>
+Prioritization profiles to use (based on file in JSON).
+default: all profiles available
+
+
--default_profile=<default profile>
+Prioritization profile by default (see doc)
+default: First profile in JSON file
+
+
--pzfields=<pzfields> (PZScore,PZFlag)
+Prioritization fields to provide (see doc).
+available: PZScore, PZFlag, PZTags, PZComment, PZInfos
+default: PZScore,PZFlag
+
+
--prioritization_score_mode=<prioritization score mode> ['HOWARD', 'VaRank'] (HOWARD)
+Prioritization Score mode (see doc).
+available: HOWARD (increment score), VaRank (max score)
+default: HOWARD
+
+
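Example combining these options (a hedged sketch; the output path is illustrative, and flags/values come from the documentation above):
   howard prioritization --input=tests/data/example.vcf.gz --output=/tmp/example.prioritized.tsv --prioritizations=config/prioritization_profiles.json --profiles='GERMLINE' --default_profile='GERMLINE' --pzfields='PZScore,PZFlag,PZComment' --prioritization_score_mode='VaRank'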

QUERY

+

Query genetic variations in SQL format. Data can be loaded into the 'variants' table from various formats (e.g. VCF, TSV, Parquet...). Using --explode_infos allows querying INFO/tag annotations. SQL queries can also use external data within the request, such as Parquet file(s).

Usage examples:
   howard query --input=tests/data/example.vcf.gz --query="SELECT * FROM variants WHERE REF = 'A' AND POS < 100000"
   howard query --input=tests/data/example.vcf.gz --explode_infos --query='SELECT "#CHROM", POS, REF, ALT, DP, CLNSIG, sample2, sample3 FROM variants WHERE DP >= 50 OR CLNSIG NOT NULL ORDER BY DP DESC'
   howard query --query="SELECT \"#CHROM\", POS, REF, ALT, \"INFO/Interpro_domain\" FROM 'tests/databases/annotations/hg19/dbnsfp42a.parquet' WHERE \"INFO/Interpro_domain\" NOT NULL ORDER BY \"INFO/SiPhy_29way_logOdds_rankscore\" DESC LIMIT 10"
   howard query --explode_infos --explode_infos_prefix='INFO/' --query="SELECT \"#CHROM\", POS, REF, ALT, STRING_AGG(INFO, ';') AS INFO FROM 'tests/databases/annotations/hg19/*.parquet' GROUP BY \"#CHROM\", POS, REF, ALT" --output=/tmp/full_annotation.tsv && head -n2 /tmp/full_annotation.tsv

Main options

+
--input=<input>
+Input file path
+Format: BCF, VCF, TSV, CSV, PSV, Parquet or duckDB
+Files can be compressed (e.g. vcf.gz, tsv.gz)
+
+
--output=<output>
+Output file path
+Format: BCF, VCF, TSV, CSV, PSV, Parquet or duckDB
+Files can be compressed (e.g. vcf.gz, tsv.gz)
+
+
--query=<query> | required
+Query in SQL format
+Format: SQL
+Example: 'SELECT * FROM variants LIMIT 5'
+
+

Explode infos

+
--explode_infos
+Explode VCF INFO/Tag into 'variants' table columns.
+default: False
+
+
--explode_infos_prefix=<explode infos prefix>
+Explode VCF INFO/Tag with a specific prefix.
+default: ''
+
+
--explode_infos_fields=<explode infos list> (*)
+Explode VCF INFO/Tag specific fields/tags.
+Keyword '*' specifies all available fields, except those already specified.
+Pattern (regex) can be used: '.*_score' for fields named with '_score' at the end.
+Examples:
+   - 'HGVS,SIFT,Clinvar' (list of fields)
+   - 'HGVS,*,Clinvar' (list of fields with all other fields at the end)
+   - 'HGVS,.*_score,Clinvar' (list of 2 fields with all scores in the middle)
+   - 'HGVS,.*_score,*' (1 field, scores, all other fields)
+   - 'HGVS,*,.*_score' (1 field and all other fields,
+                        scores included in other fields)
+default: '*'
+
+

Query

+
--query_limit=<query limit> (10)
+Limit on the number of rows for a query (only for the printed result, not the output).
+default: 10
+
+
--query_print_mode=<print mode>
+Print mode of the query result (only for the printed result, not the output).
+Either None (native), 'markdown' or 'tabulate'.
+default: None
+
+
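Example (a hedged sketch; the query itself is illustrative, combining the documented print options):
   howard query --input=tests/data/example.vcf.gz --explode_infos --query='SELECT "#CHROM", POS, REF, ALT, DP FROM variants ORDER BY DP DESC' --query_limit=20 --query_print_mode='markdown'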

Output

+
--include_header
+Include header (in VCF format) in output file.
+Only for compatible formats (tab-delimited formats such as TSV or BED).
+default: False
+
+

STATS

+

Statistics on genetic variations, such as: number of variants, number of samples, statistics by chromosome, genotypes by samples...

Usage examples:
   howard stats --input=tests/data/example.vcf.gz

Main options

+
--input=<input> | required
+Input file path
+Format: BCF, VCF, TSV, CSV, PSV, Parquet or duckDB
+Files can be compressed (e.g. vcf.gz, tsv.gz)
+
+
--stats_md=<stats markdown>
+Stats Output file in MarkDown format
+
+
+
--stats_json=<stats json>
+Stats Output file in JSON format
+
+
+

CONVERT

+

Convert a genetic variations file to another format. Multiple formats are available, such as the usual and official VCF and BCF formats, but also other formats such as TSV, CSV, PSV and Parquet/duckDB. These formats need a header '.hdr' file to take advantage of the power of howard (especially through INFO/tag definitions), and using the howard convert tool automatically generates the header file for further use.

Usage examples:
   howard convert --input=tests/data/example.vcf.gz --output=/tmp/example.tsv
   howard convert --input=tests/data/example.vcf.gz --output=/tmp/example.parquet
   howard convert --input=tests/data/example.vcf.gz --output=/tmp/example.tsv --explode_infos --explode_infos_fields='CLNSIG,SIFT,DP' --order_by='CLNSIG DESC, DP DESC'
   howard convert --input=tests/data/example.vcf.gz --output=/tmp/example.tsv --explode_infos --explode_infos_prefix='INFO/' --explode_infos_fields='CLNSIG,SIFT,DP,*' --order_by='"INFO/CLNSIG" DESC, "INFO/DP" DESC' --include_header

Main options

+
--input=<input> | required
+Input file path
+Format: BCF, VCF, TSV, CSV, PSV, Parquet or duckDB
+Files can be compressed (e.g. vcf.gz, tsv.gz)
+
+
--output=<output> | required
+Output file path
+Format: BCF, VCF, TSV, CSV, PSV, Parquet or duckDB
+Files can be compressed (e.g. vcf.gz, tsv.gz)
+
+
--explode_infos
+Explode VCF INFO/Tag into 'variants' table columns.
+default: False
+
+
--explode_infos_prefix=<explode infos prefix>
+Explode VCF INFO/Tag with a specific prefix.
+default: ''
+
+
--explode_infos_fields=<explode infos list> (*)
+Explode VCF INFO/Tag specific fields/tags.
+Keyword '*' specifies all available fields, except those already specified.
+Pattern (regex) can be used: '.*_score' for fields named with '_score' at the end.
+Examples:
+   - 'HGVS,SIFT,Clinvar' (list of fields)
+   - 'HGVS,*,Clinvar' (list of fields with all other fields at the end)
+   - 'HGVS,.*_score,Clinvar' (list of 2 fields with all scores in the middle)
+   - 'HGVS,.*_score,*' (1 field, scores, all other fields)
+   - 'HGVS,*,.*_score' (1 field and all other fields,
+                        scores included in other fields)
+default: '*'
+
+
--order_by=<order by>
+List of columns to sort the result-set in ascending or descending order.
+Use SQL format, and keywords ASC (ascending) and DESC (descending).
+If a column is not available, the order will not be considered.
+Order is enabled only for compatible formats (e.g. TSV, CSV, JSON).
+Examples:
+   - 'ACMG_score DESC'
+   - 'PZFlag DESC, PZScore DESC'
+default: ''
+
+
--include_header
+Include header (in VCF format) in output file.
+Only for compatible formats (tab-delimited formats such as TSV or BED).
+default: False
+
+
--parquet_partitions=<parquet partitions>
+Parquet partitioning using hive (only for the Parquet export format).
+This option enables faster parallel writing, but is memory consuming.
+Use 'None' (string) for NO partitioning, but still split Parquet files into a folder
+examples: '#CHROM', '#CHROM,REF', 'None'
+default: None
+
+
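Example (a hedged sketch; the output path is illustrative, using the documented partitioning option):
   howard convert --input=tests/data/example.vcf.gz --output=/tmp/example.partitioned.parquet --parquet_partitions='#CHROM'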

DATABASES

+

Download databases and needed files for howard and associated tools

Usage examples:
   howard databases --assembly=hg19 --download-genomes=/databases/genomes/current --download-genomes-provider=UCSC --download-genomes-contig-regex='chr[0-9XYM]+$' --download-annovar=/databases/annovar/current --download-annovar-files='refGene,cosmic70,nci60' --download-snpeff=/databases/snpeff/current --download-refseq=/databases/refseq/current --download-refseq-format-file='ncbiRefSeq.txt' --download-dbnsfp=/databases/dbnsfp/current --download-dbnsfp-release='4.4a' --download-dbnsfp-subdatabases --download-alphamissense=/databases/alphamissense/current --download-exomiser=/databases/exomiser/current --download-dbsnp=/databases/dbsnp/current --download-dbsnp-vcf --threads=8
   howard databases --generate-param=/tmp/param.json --generate-param-description=/tmp/test.description.json --generate-param-formats=parquet
Notes:
   - Downloading databases can take a while, depending on network, threads and memory
   - Proxy: Beware of network and proxy configuration
   - dbNSFP download: More threads, more memory usage (8 threads ~ 16Gb, 24 threads ~ 32Gb)

Main options

+
--assembly=<assembly> (hg19)
+Default assembly
+Default: 'hg19'
+
+
--genomes-folder=<genomes> (/databases/genomes/current)
+Folder containing genomes
+Default: /databases/genomes/current
+
+

Genomes

+
--download-genomes=<genomes>
+Download Genomes within folder
+
+
--download-genomes-provider=<genomes provider> (UCSC)
+Download Genome from an external provider
+Available: GENCODE, Ensembl, UCSC, NCBI
+Default: UCSC
+
+
+
--download-genomes-contig-regex=<genomes contig regex>
+Regular expression to select specific chromosomes
+Default: None
+Example: 'chr[0-9XYM]+$'
+
+
+

snpEff

+
--download-snpeff=<snpEff>
+Download snpEff databases within snpEff folder
+
+

Annovar

+
--download-annovar=<Annovar>
+Download Annovar databases within Annovar folder
+
+
--download-annovar-files=<Annovar code>
+Download Annovar databases for a list of Annovar file codes (see Annovar Doc)
+Default: All available files
+Example: refGene,gnomad211_exome,cosmic70,clinvar_202*,nci60
+Note: refGene will be downloaded in any case
+Note2: Only files that do not exist or that have a different size will be downloaded
+
+
--download-annovar-url=<Annovar url> (http://www.openbioinformatics.org/annovar/download)
+Download Annovar databases URL (see Annovar Doc)
+Default: 'http://www.openbioinformatics.org/annovar/download'
+
+

refSeq

+
--download-refseq=<refSeq>
+Download refSeq databases within refSeq folder
+
+
--download-refseq-url=<refSeq url> (http://hgdownload.soe.ucsc.edu/goldenPath)
+Download refSeq databases URL (see refSeq WebSite)
+Default: 'http://hgdownload.soe.ucsc.edu/goldenPath'
+
+
--download-refseq-prefix=<refSeq prefix> (ncbiRefSeq)
+Check existing refSeq files in refSeq folder
+Default: 'ncbiRefSeq'
+
+
--download-refseq-files=<refSeq files> (ncbiRefSeq.txt,ncbiRefSeqLink.txt)
+List of refSeq files to download
+Default: 'ncbiRefSeq.txt,ncbiRefSeqLink.txt'
+
+
--download-refseq-format-file=<refSeq format file>
+Name of refSeq file to format in BED format
+Example: 'ncbiRefSeq.txt'
+Default: None
+
+
--download-refseq-include-utr5
+Format BED refSeq file including 5'UTR
+
+
--download-refseq-include-utr3
+Format BED refSeq file including 3'UTR
+
+
--download-refseq-include-chrM
+Format BED refSeq file including Mitochondrial chromosome 'chrM' or 'chrMT'
+
+
--download-refseq-include-non-canonical-chr
+Format BED refSeq file including non-canonical chromosomes
+
+
--download-refseq-include-non-coding-transcripts
+Format BED refSeq file including non-coding transcripts
+
+
--download-refseq-include-transcript-version
+Format BED refSeq file including transcript version
+
+
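Example combining refSeq download and BED formatting options (a hedged sketch based on the documented usage; folder paths follow the defaults above):
   howard databases --assembly=hg19 --download-refseq=/databases/refseq/current --download-refseq-format-file='ncbiRefSeq.txt' --download-refseq-include-utr5 --download-refseq-include-utr3 --download-refseq-include-chrM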

dbNSFP

+
--download-dbnsfp=<dbNSFP>
+Download dbNSFP databases within dbNSFP folder
+
+
--download-dbnsfp-url=<dbNSFP url> (https://dbnsfp.s3.amazonaws.com)
+Download dbNSFP databases URL (see dbNSFP website)
+Default: 'https://dbnsfp.s3.amazonaws.com'
+
+
--download-dbnsfp-release=<dbNSFP release> (4.4a)
+Release of dbNSFP to download (see dbNSFP website)
+Default: '4.4a'
+
+
--download-dbnsfp-parquet-size=<dbNSFP parquet size> (100)
+Maximum size (Mb) of data files in Parquet folder.
+Parquet folders are partitioned (hive) by chromosome (sub-folders),
+each containing N data files.
+Default: 100
+
+
--download-dbnsfp-subdatabases
+Generate dbNSFP sub-databases
+dbNSFP provides multiple databases which are split across multiple columns.
+This option creates a Parquet folder for each sub-database (based on column names).
+
+
--download-dbnsfp-parquet
+Generate a Parquet file for each Parquet folder.
+
+
--download-dbnsfp-vcf
+Generate a VCF file for each Parquet folder.
+Note: Need genome (see --download-genome)
+
+
--download-dbnsfp-no-files-all
+Do not generate a database Parquet/VCF file for the entire database ('ALL').
+Only sub-database files will be generated.
+(see '--download-dbnsfp-subdatabases')
+
+
--download-dbnsfp-add-info
+Add an INFO column (VCF format) in the Parquet folder and file.
+Useful to speed up full annotation (all available columns).
+Increases memory and space usage during generation of files.
+
+
--download-dbnsfp-row-group-size=<dbNSFP row group size> (100000)
+Minimum number of rows in a Parquet row group (see duckDB doc).
+Lower values can reduce memory usage and slightly increase space during generation,
+speed up highly selective queries, and slow down whole-file queries (e.g. aggregations)
+Default: 100000
+
+
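Example (a hedged sketch based on the documented usage; the thread count is illustrative, and more threads increase memory usage as noted above):
   howard databases --assembly=hg19 --download-dbnsfp=/databases/dbnsfp/current --download-dbnsfp-release='4.4a' --download-dbnsfp-subdatabases --download-dbnsfp-parquet --threads=8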

AlphaMissense

+
--download-alphamissense=<AlphaMissense>
+Download AlphaMissense databases within Annotations folder
+
+
--download-alphamissense-url=<AlphaMissense url> (https://storage.googleapis.com/dm_alphamissense)
+Download AlphaMissense databases URL (see AlphaMissense website)
+Default: 'https://storage.googleapis.com/dm_alphamissense'
+
+

Exomiser

+
--download-exomiser=<Exomiser>
+Download Exomiser databases
+Folder where the Exomiser databases will be downloaded and stored.
+If the folder does not exist, it will be created.
+
+
--download-exomiser-application-properties=<Exomiser application properties>
+Exomiser Application Properties configuration file (see Exomiser website)
+This file contains configuration settings for the Exomiser tool.
+If this parameter is not provided, the function will attempt to locate
+the application properties file automatically based on the Exomiser folder.
+Configuration information will be used to download the expected releases (if no other parameters are provided).
+CADD and REMM will be downloaded only if a 'path' is provided.
+
+
+
--download-exomiser-url=<Exomiser url> (http://data.monarchinitiative.org/exomiser)
+URL where Exomiser database files can be downloaded from.
+Default: 'http://data.monarchinitiative.org/exomiser'
+
+
--download-exomiser-release=<Exomiser release>
+Release of Exomiser data to download.
+If "default", "auto", or "config", retrieve from Application Properties file.
+Default: None
+
+
--download-exomiser-phenotype-release=<Exomiser phenotype release>
+Release of Exomiser phenotype to download.
+If not provided, retrieve from Application Properties file or Exomiser data release
+Default: None
+
+
--download-exomiser-remm-release=<Exomiser remm release>
+Release of ReMM (Regulatory Mendelian Mutation) database to download.
+If "default", "auto", or "config", retrieve from Application Properties file.
+Default: None
+
+
--download-exomiser-remm-url=<Exomiser remm url> (https://kircherlab.bihealth.org/download/ReMM)
+URL where ReMM (Regulatory Mendelian Mutation) database files can be downloaded from.
+Default: 'https://kircherlab.bihealth.org/download/ReMM'
+
+
--download-exomiser-cadd-release=<Exomiser cadd release>
+Release of CADD (Combined Annotation Dependent Depletion) database to download.
+If "default", "auto", or "config", retrieve from Application Properties file.
+Default: None
+
+
--download-exomiser-cadd-url=<Exomiser cadd url> (https://kircherlab.bihealth.org/download/CADD)
+URL where CADD (Combined Annotation Dependent Depletion) database files can be downloaded from.
+Default: 'https://kircherlab.bihealth.org/download/CADD'
+
+
--download-exomiser-cadd-url-snv-file=<Exomiser url snv> (whole_genome_SNVs.tsv.gz)
+Name of the file containing the SNV (Single Nucleotide Variant) data
+for the CADD (Combined Annotation Dependent Depletion) database.
+Default: 'whole_genome_SNVs.tsv.gz'
+
+
--download-exomiser-cadd-url-indel-file=<Exomiser cadd url indel> (InDels.tsv.gz)
+Name of the file containing the INDEL (Insertion-Deletion) data
+for the CADD (Combined Annotation Dependent Depletion) database.
+Default: 'InDels.tsv.gz'
+
+

dbSNP

+
--download-dbsnp=<dbSNP>
+Download dbSNP databases
+Folder where the dbSNP databases will be downloaded and stored.
+If the folder does not exist, it will be created.
+
+
--download-dbsnp-releases=<dbSNP releases> (b156)
+Release of dbSNP to download
+Example: 'b152,b156'
+Default: 'b156'
+
+
--download-dbsnp-release-default=<dbSNP release default>
+Default Release of dbSNP ('default' symlink)
+If None, the first release to download will be assigned as default,
+only if a default does not already exist
+Example: 'b156'
+Default: None (first releases by default)
+
+
--download-dbsnp-url=<dbSNP url> (https://ftp.ncbi.nih.gov/snp/archive)
+URL where dbSNP database files can be downloaded from.
+Default: 'https://ftp.ncbi.nih.gov/snp/archive'
+
+
--download-dbsnp-url-files=<dbSNP url files>
+Dictionary that maps assembly names to specific dbSNP URL files.
+It allows you to provide custom dbSNP URL files for specific assemblies
+instead of using the default file naming convention
+Default: None
+
+
--download-dbsnp-url-files-prefix=<dbSNP url files prefix> (GCF_000001405)
+String that represents the prefix of the dbSNP file name for a specific assembly.
+It is used to construct the full URL of the dbSNP file to be downloaded.
+Default: 'GCF_000001405'
+
+
--download-dbsnp-assemblies-map=<dbSNP assemblies map> ({'hg19': '25', 'hg38': '40'})
+Dictionary that maps assembly names to their corresponding dbSNP versions.
+It is used to construct the dbSNP file name based on the assembly name.
+Default: {"hg19": "25", "hg38": "40"}
+
+
--download-dbsnp-vcf
+Generate a well-formatted VCF from the downloaded file:
+- Add and filter contigs associated with the assembly
+- Normalize by splitting multiallelics
+Note: Need genome (see --download-genomes)
+
+
--download-dbsnp-parquet
+Generate Parquet file from VCF
+
+
+
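Example (a hedged sketch; releases follow the documented example values, and the genome download is included because VCF generation needs a genome):
   howard databases --assembly=hg19 --download-genomes=/databases/genomes/current --download-dbsnp=/databases/dbsnp/current --download-dbsnp-releases='b152,b156' --download-dbsnp-release-default='b156' --download-dbsnp-vcf --download-dbsnp-parquet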

HGMD

+
--convert-hgmd=<HGMD>
+Convert HGMD databases
+Folder where the HGMD databases will be stored.
+Fields in VCF, Parquet and TSV will be generated.
+If the folder does not exist, it will be created.
+
+
--convert-hgmd-file=<HGMD file>
+File from HGMD
+Name format 'HGMD_Pro_<release>_<assembly>.vcf.gz'.
+
+
--convert-hgmd-basename=<HGMD basename>
+File output basename
+Generated files will be prefixed by basename.
+Example: 'HGMD_Pro_MY_RELEASE'
+Default: Use input file name without '.vcf.gz'
+
+
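Example (a hedged sketch; the HGMD release, folder and file name are illustrative, following the documented 'HGMD_Pro_<release>_<assembly>.vcf.gz' pattern):
   howard databases --convert-hgmd=/databases/hgmd/current --convert-hgmd-file=HGMD_Pro_2023.1_hg19.vcf.gz --convert-hgmd-basename='HGMD_Pro_2023.1'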

Parameters file

+
--generate-param=<param>
+Parameter file (JSON) with all databases found.
+Database folders scanned are defined in the config file.
+Databases follow this structure (see doc):
+   .../<database>/<release>/<assembly>/*.[parquet|vcf.gz|...]
+
+
--generate-param-description=<param description>
+Description file (JSON) with all databases found.
+Contains all databases with description of format, assembly, fields...
+
+
--generate-param-releases=<param release> (current)
+List of database folder releases to check
+Examples: 'current', 'latest'
+Default: 'current'
+
+
--generate-param-formats=<param formats> (parquet)
+List of database formats to check (e.g. parquet, vcf, bed, tsv...)
+Examples: 'parquet', 'parquet,vcf,bed,tsv'
+Default: 'parquet'
+
+
--generate-param-bcftools
+Generate parameter file with BCFTools annotation for allowed formats
+Allowed formats with BCFTools: 'vcf', 'bed'
+
+
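Example generating a BCFTools-oriented parameter file (a hedged sketch; output paths are illustrative, and 'vcf,bed' matches the allowed BCFTools formats above):
   howard databases --generate-param=/tmp/param.bcftools.json --generate-param-description=/tmp/param.description.json --generate-param-formats='vcf,bed' --generate-param-bcftools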

FROM_ANNOVAR

+

(beta) Format an Annovar database file to other formats (VCF and Parquet). The exported Parquet file includes INFO/tag columns, as if the VCF INFO column had been exploded.

Usage examples:
   howard from_annovar --input=tests/databases/others/hg19_nci60.txt --output=/tmp/nci60.from_annovar.vcf.gz --to_parquet=/tmp/nci60.from_annovar.parquet --annovar-code=nci60 --genome=/databases/genomes/current/hg19.fa --config=/tool/config/config.json --threads=8

Main options

+
--input=<input> | required
+Input file path
+Format: BCF, VCF, TSV, CSV, PSV, Parquet or duckDB
+Files can be compressed (e.g. vcf.gz, tsv.gz)
+
+
--output=<output> | required
+Output file path
+Format: BCF, VCF, TSV, CSV, PSV, Parquet or duckDB
+Files can be compressed (e.g. vcf.gz, tsv.gz)
+
+
--genome=<genome> (hg19.fa) | required
+Genome file in fasta format
+Default: 'hg19.fa'
+
+

Annovar

+
--annovar-code=<Annovar code>
+Annovar code, or database name. Useful to name database columns
+
+

Parquet

+
--to_parquet=<to parquet>
+Parquet file conversion
+
+
+

Modes

+
--reduce_memory=<reduce memory> (auto)
+Reduce memory option
+Values: 'auto' (auto-detection), 'enable', 'disable'
+default: 'auto'
+
+
--multi_variant=<multi variant> (auto)
+Variant with multiple annotation lines
+Values: 'auto' (auto-detection), 'enable', 'disable'
+default: 'auto'
+
+
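Example forcing the memory-reduction mode (a hedged sketch based on the documented usage; output paths are illustrative):
   howard from_annovar --input=tests/databases/others/hg19_nci60.txt --output=/tmp/nci60.from_annovar.vcf.gz --to_parquet=/tmp/nci60.from_annovar.parquet --annovar-code=nci60 --genome=/databases/genomes/current/hg19.fa --reduce_memory=enable --multi_variant=auto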

HELP

+

Help tools

Usage examples:
   howard help --help_md=/tmp/howard.help.mk --help_html=/tmp/howard.help.html

Main options

+
--help_md=<help markdown>
+Help Output file in MarkDown format
+
+
+
--help_html=<help html>
+Help Output file in HTML format
+
+
+

SHARED ARGUMENTS

+
--config=<config> ({})
+Configuration file
+Default: {}
+
+
--threads=<threads> (-1)
+Number of threads to use
+Use -1 to detect number of CPU/cores
+Default: -1
+
+
--memory=<memory>
+Memory to use (FLOAT[kMG])
+Default: None (80% of RAM)
+
+
--chunk_size=<chunk size> (1000000)
+Number of records per batch when exporting the output file.
+The lower the chunk size, the lower the memory consumption.
+For Parquet partitioning, file sizes will depend on the chunk size.
+default: 1000000
+
+
--tmp=<tmp>
+Temporary folder.
+Especially for duckDB, default '.tmp' (see doc).
+default: None
+
+
--duckdb_settings=<duckDB settings>
+DuckDB settings (see duckDB doc) as JSON (string or file).
+These settings take priority over options such as 'threads' and 'tmp'.
+Examples: '{"TimeZone": "GMT", "temp_directory": "/tmp/duckdb", "threads": 8}'
+default: None
+
+
--verbosity=<verbosity> ['CRITICAL', 'ERROR', 'WARNING', 'INFO', 'DEBUG', 'NOTSET'] (INFO)
+Verbosity level
+Available: CRITICAL, ERROR, WARNING, INFO, DEBUG or NOTSET
+Default: INFO
+
+
--log=<log>
+Logs file
+Example: 'my.log'
+Default: None
+
+
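Example combining shared arguments (a hedged sketch; values are illustrative and the duckDB settings reuse the documented example):
   howard process --input=tests/data/example.vcf.gz --output=/tmp/example.vcf.gz --param=config/param.json --threads=8 --memory=8G --tmp=/tmp/howard --duckdb_settings='{"TimeZone": "GMT", "temp_directory": "/tmp/duckdb", "threads": 8}' --verbosity=DEBUG --log=/tmp/howard.log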
--quiet
+==SUPPRESS==
+
+
--verbose
+==SUPPRESS==
+
+
--debug
+==SUPPRESS==
+
+
\ No newline at end of file diff --git a/docs/help.md b/docs/help.md new file mode 100644 index 0000000..21fda97 --- /dev/null +++ b/docs/help.md @@ -0,0 +1,1310 @@ +# HOWARD Help +HOWARD Commands and Options +## PROCESS +howard process tool manage genetic variations to: + +- annotates genetic variants with multiple annotation databases/files and tools + +- calculates and normalizes annotations + +- prioritizes variants with profiles (list of citeria) to calculate scores and flags + +- translates into various formats + +- query genetic variants and annotations + +- generates variants statistics + +Usage examples: + +> howard process --input=tests/data/example.vcf.gz --output=/tmp/example.annotated.vcf.gz --param=config/param.json + +> howard process --input=tests/data/example.vcf.gz --annotations='snpeff' --calculations='snpeff_hgvs' --prioritizations=config/prioritization_profiles.json --explode_infos --output=/tmp/example.annotated.tsv --query='SELECT "#CHROM", POS, ALT, REF, snpeff_hgvs FROM variants' + + + +### Main options +``` +--input= | required + +Input file path +Format: BCF, VCF, TSV, CSV, PSV, Parquet or duckDB +Files can be compressesd (e.g. vcf.gz, tsv.gz) +``` + +``` +--output= | required + +Output file path +Format: BCF, VCF, TSV, CSV, PSV, Parquet or duckDB +Files can be compressesd (e.g. vcf.gz, tsv.gz) +``` + +``` +--param= ({}) + +Parameters file or JSON +Format: JSON +Default: {} +``` + +### Quick Processes +``` +--annotations= + +Annotation with databases files, or with tools +Format: list of files in Parquet, VCF, BED, or keywords +For a Parquet/VCF/BED file, use file path (e.g. '/path/to/file.parquet') +For add all availalbe databases, use 'ALL' keyword: + - Use 'ALL::' + - e.g. 'ALL', 'ALL:parquet:current', 'ALL:parquet,vcf:devel' +For snpeff annotation, use keyword 'snpeff' +For Annovar annotation, use keyword 'annovar' with annovar code (e.g. 'annovar:refGene', 'annovar:cosmic70') +``` + +``` +--calculations= + +Calculations on genetic variants information and genotype information +Example: 'VARTYPE,barcode' +List of available calculations (unsensitive case, see doc for more information): + VARTYPE snpeff_hgvs FINDBYPIPELINE GENOTYPECONCORDANCE BARCODE TRIO VAF VAF_STATS DP_STATS +``` + +``` +--prioritizations= + +Prioritization file in JSON format (defines profiles, see doc). +``` + +``` +--query= + +Query in SQL format +Format: SQL +Example: 'SELECT * FROM variants LIMIT 5' +``` + +``` +--explode_infos + +Explode VCF INFO/Tag into 'variants' table columns. +default: False +``` + +``` +--explode_infos_prefix= + +Explode VCF INFO/Tag with a specific prefix. +default: '' +``` + +``` +--explode_infos_fields= (*) + +Explode VCF INFO/Tag specific fields/tags. +Keyword '*' specify all available fields, except those already specified. +Pattern (regex) can be used: '.*_score' for fields named with '_score' at the end. +Examples: + - 'HGVS,SIFT,Clinvar' (list of fields) + - 'HGVS,*,Clinvar' (list of fields with all other fields at the end) + - 'HGVS,.*_score,Clinvar' (list of 2 fields with all scores in the middle) + - 'HGVS,.*_score,*' (1 field, scores, all other fields) + - 'HGVS,*,.*_score' (1 field and all other fields, + scores included in other fields) +default: '*' +``` + +``` +--include_header + +Include header (in VCF format) in output file. +Only for compatible formats (tab-delimiter format as TSV or BED). 
+default: False +``` + + + +## ANNOTATION +Annotation is mainly based on a build-in Parquet annotation method, and tools such as BCFTOOLS, Annovar and snpEff. It uses available databases (see Annovar and snpEff) and homemade databases. Format of databases are: parquet, duckdb, vcf, bed, Annovar and snpEff (Annovar and snpEff databases are automatically downloaded, see howard databases tool). + +Usage examples: + +> howard annotation --input=tests/data/example.vcf.gz --output=/tmp/example.howard.vcf.gz --annotations='tests/databases/annotations/hg19/avsnp150.parquet,tests/databases/annotations/hg19/dbnsfp42a.parquet,tests/databases/annotations/hg19/gnomad211_genome.parquet' + +> howard annotation --input=tests/data/example.vcf.gz --output=/tmp/example.howard.tsv --assembly=hg19 --annotations='annovar:refGene,annovar:cosmic70,snpeff,tests/databases/annotations/hg19/clinvar_20210123.parquet' + +> howard annotation --input=tests/data/example.vcf.gz --output=/tmp/example.howard.tsv --assembly=hg19 --annotations='ALL:parquet' + + + +### Main options +``` +--input= | required + +Input file path +Format: BCF, VCF, TSV, CSV, PSV, Parquet or duckDB +Files can be compressesd (e.g. vcf.gz, tsv.gz) +``` + +``` +--output= | required + +Output file path +Format: BCF, VCF, TSV, CSV, PSV, Parquet or duckDB +Files can be compressesd (e.g. vcf.gz, tsv.gz) +``` + +``` +--annotations= | required + +Annotation with databases files, or with tools +Format: list of files in Parquet, VCF, BED, or keywords +For a Parquet/VCF/BED file, use file path (e.g. '/path/to/file.parquet') +For add all availalbe databases, use 'ALL' keyword: + - Use 'ALL::' + - e.g. 'ALL', 'ALL:parquet:current', 'ALL:parquet,vcf:devel' +For snpeff annotation, use keyword 'snpeff' +For Annovar annotation, use keyword 'annovar' with annovar code (e.g. 'annovar:refGene', 'annovar:cosmic70') +``` + +``` +--assembly= (hg19) + +Default assembly +Default: 'hg19' +``` + + + +## CALCULATION +Calculation processes variants information to generate new information, such as: identify variation type (VarType), harmonizes allele frequency (VAF) and calculate sttistics (VAF_stats), extracts Nomen (transcript, cNomen, pNomen...) from an HGVS field (e.g. snpEff, Annovar) with an optional list of personalized transcripts, generates VaRank format barcode, identify trio inheritance. + +Usage examples: + +> howard calculation --input=tests/data/example.full.vcf --output=/tmp/example.calculation.tsv --calculations='vartype' + +> howard calculation --input=tests/data/example.ann.vcf.gz --output=/tmp/example.calculated.tsv --calculations='snpeff_hgvs,NOMEN' --hgvs_field=snpeff_hgvs --transcripts=tests/data/transcripts.tsv + +> howard calculation --show_calculations + + + +### Main options +``` +--input= + +Input file path +Format: BCF, VCF, TSV, CSV, PSV, Parquet or duckDB +Files can be compressesd (e.g. vcf.gz, tsv.gz) +``` + +``` +--output= + +Output file path +Format: BCF, VCF, TSV, CSV, PSV, Parquet or duckDB +Files can be compressesd (e.g. 
vcf.gz, tsv.gz) +``` + +``` +--calculations= + +Calculations on genetic variants information and genotype information +Example: 'VARTYPE,barcode' +List of available calculations (unsensitive case, see doc for more information): + VARTYPE snpeff_hgvs FINDBYPIPELINE GENOTYPECONCORDANCE BARCODE TRIO VAF VAF_STATS DP_STATS +``` + +``` +--calculation_config= + +Calculation config file +Format: JSON +``` + +``` +--show_calculations + +Show available calculation operations +``` + +### NOMEN calculation +``` +--hgvs_field= (hgvs) + +HGVS INFO/tag containing a list o HGVS annotations +default: 'hgvs' +``` + +``` +--transcripts= + +Transcripts file in TSV format +Format: Transcript in first column, optional Gene in second column +default: None +``` + +### TRIO calculation +``` +--trio_pedigree= + +Pedigree Trio for trio inheritance calculation +Format: JSON file or dict (e.g. 'trio.ped.json', '{"father":"sample1", "mother":"sample2", "child":"sample3"}') +default: None +``` + + + +## HGVS +HGVS annotation using HUGO HGVS internation Sequence Variant Nomenclature (http://varnomen.hgvs.org/). Annotation refere to refGene and genome to generate HGVS nomenclature for all available transcripts. This annotation add 'hgvs' field into VCF INFO column of a VCF file. + +Usage examples: + +> howard hgvs --input=tests/data/example.full.vcf --output=/tmp/example.hgvs.vcf + + + +### Main options +``` +--input= | required + +Input file path +Format: BCF, VCF, TSV, CSV, PSV, Parquet or duckDB +Files can be compressesd (e.g. vcf.gz, tsv.gz) +``` + +``` +--output= + +Output file path +Format: BCF, VCF, TSV, CSV, PSV, Parquet or duckDB +Files can be compressesd (e.g. vcf.gz, tsv.gz) +``` + +``` +--assembly= (hg19) + +Default assembly +Default: 'hg19' +``` + +### HGVS +``` +--use_gene + +Use Gene information to generate HGVS annotation +Example: 'NM_152232(TAS1R2):c.231T>C' +``` + +``` +--use_exon + +Use Exon information to generate HGVS annotation +Only if 'use_gene' is not enabled +Example: 'NM_152232(exon2):c.231T>C' +``` + +``` +--use_protein + +Use Protein level to generate HGVS annotation +Can be used with 'use_exon' or 'use_gene' +Example: 'NP_689418:p.Cys77Arg' +``` + +``` +--add_protein + +Add Protein level to DNA HGVS annotation +Example: 'NM_152232:c.231T>C,NP_689418:p.Cys77Arg' +``` + +``` +--full_format + +Generates HGVS annotation in a full format (non-standard) +Full format use all information to generates an exhaustive annotation. +Use specifically 'use_exon' to add exon information. +Example: 'TAS1R2:NM_152232:NP_689418:c.231T>C:p.Cys77Arg' + 'TAS1R2:NM_152232:NP_689418:exon2:c.231T>C:p.Cys77Arg' +``` + +``` +--codon_type= ['1', '3', 'FULL'] (3) + +Amino Acide Codon format type to use to generate HGVS annotation +Available (default '3'): + '1': codon in 1 caracter (e.g. 'C', 'R') + '3': codon in 3 caracter (e.g. 'Cys', 'Arg') + 'FULL': codon in full name (e.g. 'Cysteine', 'Arginine') + +``` + +### Databases +``` +--refgene= + +refGene annotation file +``` + +``` +--refseqlink= + +refSeqLink annotation file +``` + +``` +--genomes-folder= (/databases/genomes/current) + +Folder containing genomes +Default: /databases/genomes/current +``` + + + +## PRIORITIZATION +Prioritization algorithm uses profiles to flag variants (as passed or filtered), calculate a prioritization score, and automatically generate a comment for each variants (example: 'polymorphism identified in dbSNP. associated to Lung Cancer. Found in ClinVar database'). Prioritization profiles are defined in a configuration file in JSON format. 
A profile is defined as a list of annotation/value, using wildcards and comparison options (contains, lower than, greater than, equal...). Annotations fields may be quality values (usually from callers, such as 'DP') or other annotations fields provided by annotations tools, such as HOWARD itself (example: COSMIC, Clinvar, 1000genomes, PolyPhen, SIFT). Multiple profiles can be used simultaneously, which is useful to define multiple validation/prioritization levels (example: 'standard', 'stringent', 'rare variants', 'low allele frequency'). + + + +Usage examples: + +> howard prioritization --input=tests/data/example.vcf.gz --output=/tmp/example.prioritized.vcf.gz --prioritizations=config/prioritization_profiles.json --profiles='default,GERMLINE' + + + +### Main options +``` +--input= | required + +Input file path +Format: BCF, VCF, TSV, CSV, PSV, Parquet or duckDB +Files can be compressesd (e.g. vcf.gz, tsv.gz) +``` + +``` +--output= | required + +Output file path +Format: BCF, VCF, TSV, CSV, PSV, Parquet or duckDB +Files can be compressesd (e.g. vcf.gz, tsv.gz) +``` + +``` +--prioritizations= | required + +Prioritization file in JSON format (defines profiles, see doc). +``` + +### Prioritization +``` +--profiles= + +Prioritization profiles to use (based on file in JSON). +default: all profiles available +``` + +``` +--default_profile= + +Prioritization profile by default (see doc) +default: First profile in JSON file +``` + +``` +--pzfields= (PZScore,PZFlag) + +Prioritization fields to provide (see doc). +available: PZScore, PZFlag, PZTags, PZComment, PZInfos +default: PZScore,PZFlag +``` + +``` +--prioritization_score_mode= ['HOWARD', 'VaRank'] (HOWARD) + +Prioritization Score mode (see doc). +available: HOWARD (increment score), VaRank (max score) +default: HOWARD +``` + + + +## QUERY +Query genetic variations in SQL format. Data can be loaded into 'variants' table from various formats (e.g. VCF, TSV, Parquet...). Using --explode_infos allow query on INFO/tag annotations. SQL query can also use external data within the request, such as a Parquet file(s). + +Usage examples: + +> howard query --input=tests/data/example.vcf.gz --query="SELECT * FROM variants WHERE REF = 'A' AND POS < 100000" + +> howard query --input=tests/data/example.vcf.gz --explode_infos --query='SELECT "#CHROM", POS, REF, ALT, DP, CLNSIG, sample2, sample3 FROM variants WHERE DP >= 50 OR CLNSIG NOT NULL ORDER BY DP DESC' + +> howard query --query="SELECT \"#CHROM\", POS, REF, ALT, \"INFO/Interpro_domain\" FROM 'tests/databases/annotations/hg19/dbnsfp42a.parquet' WHERE \"INFO/Interpro_domain\" NOT NULL ORDER BY \"INFO/SiPhy_29way_logOdds_rankscore\" DESC LIMIT 10" + +> howard query --explode_infos --explode_infos_prefix='INFO/' --query="SELECT \"#CHROM\", POS, REF, ALT, STRING_AGG(INFO, ';') AS INFO FROM 'tests/databases/annotations/hg19/*.parquet' GROUP BY \"#CHROM\", POS, REF, ALT" --output=/tmp/full_annotation.tsv && head -n2 /tmp/full_annotation.tsv + + + +### Main options +``` +--input= + +Input file path +Format: BCF, VCF, TSV, CSV, PSV, Parquet or duckDB +Files can be compressesd (e.g. vcf.gz, tsv.gz) +``` + +``` +--output= + +Output file path +Format: BCF, VCF, TSV, CSV, PSV, Parquet or duckDB +Files can be compressesd (e.g. vcf.gz, tsv.gz) +``` + +``` +--query= | required + +Query in SQL format +Format: SQL +Example: 'SELECT * FROM variants LIMIT 5' +``` + +### Explode infos +``` +--explode_infos + +Explode VCF INFO/Tag into 'variants' table columns. 
+default: False +``` + +``` +--explode_infos_prefix= + +Explode VCF INFO/Tag with a specific prefix. +default: '' +``` + +``` +--explode_infos_fields= (*) + +Explode VCF INFO/Tag specific fields/tags. +Keyword '*' specify all available fields, except those already specified. +Pattern (regex) can be used: '.*_score' for fields named with '_score' at the end. +Examples: + - 'HGVS,SIFT,Clinvar' (list of fields) + - 'HGVS,*,Clinvar' (list of fields with all other fields at the end) + - 'HGVS,.*_score,Clinvar' (list of 2 fields with all scores in the middle) + - 'HGVS,.*_score,*' (1 field, scores, all other fields) + - 'HGVS,*,.*_score' (1 field and all other fields, + scores included in other fields) +default: '*' +``` + +### Query +``` +--query_limit= (10) + +Limit of number of row for query (only for print result, not output). +default: 10 +``` + +``` +--query_print_mode= + +Print mode of query result (only for print result, not output). +Either None (native), 'markdown' or 'tabulate'. +default: None +``` + +### Output +``` +--include_header + +Include header (in VCF format) in output file. +Only for compatible formats (tab-delimiter format as TSV or BED). +default: False +``` + + + +## STATS +Statistics on genetic variations, such as: number of variants, number of samples, statistics by chromosome, genotypes by samples... + +Usage examples: + +> howard stats --input=tests/data/example.vcf.gz + +### Main options +``` +--input= | required + +Input file path +Format: BCF, VCF, TSV, CSV, PSV, Parquet or duckDB +Files can be compressesd (e.g. vcf.gz, tsv.gz) +``` + +``` +--stats_md= + +Stats Output file in MarkDown format + +``` + +``` +--stats_json= + +Stats Output file in JSON format + +``` + + + +## CONVERT +Convert genetic variations file to another format. Multiple format are available, such as usual and official VCF and BCF format, but also other formats such as TSV, CSV, PSV and Parquet/duckDB. These formats need a header '.hdr' file to take advantage of the power of howard (especially through INFO/tag definition), and using howard convert tool automatically generate header file fo futher use. + +Usage examples: + +> howard convert --input=tests/data/example.vcf.gz --output=/tmp/example.tsv + +> howard convert --input=tests/data/example.vcf.gz --output=/tmp/example.parquet + +> howard convert --input=tests/data/example.vcf.gz --output=/tmp/example.tsv --explode_infos --explode_infos_fields='CLNSIG,SIFT,DP' --order_by='CLNSIG DESC, DP DESC' + +> howard convert --input=tests/data/example.vcf.gz --output=/tmp/example.tsv --explode_infos --explode_infos_prefix='INFO/' --explode_infos_fields='CLNSIG,SIFT,DP,*' --order_by='"INFO/CLNSIG" DESC, "INFO/DP" DESC' --include_header + +### Main options +``` +--input= | required + +Input file path +Format: BCF, VCF, TSV, CSV, PSV, Parquet or duckDB +Files can be compressesd (e.g. vcf.gz, tsv.gz) +``` + +``` +--output= | required + +Output file path +Format: BCF, VCF, TSV, CSV, PSV, Parquet or duckDB +Files can be compressesd (e.g. vcf.gz, tsv.gz) +``` + +``` +--explode_infos + +Explode VCF INFO/Tag into 'variants' table columns. +default: False +``` + +``` +--explode_infos_prefix= + +Explode VCF INFO/Tag with a specific prefix. +default: '' +``` + +``` +--explode_infos_fields= (*) + +Explode VCF INFO/Tag specific fields/tags. +Keyword '*' specify all available fields, except those already specified. +Pattern (regex) can be used: '.*_score' for fields named with '_score' at the end. 
+Examples: + - 'HGVS,SIFT,Clinvar' (list of fields) + - 'HGVS,*,Clinvar' (list of fields with all other fields at the end) + - 'HGVS,.*_score,Clinvar' (list of 2 fields with all scores in the middle) + - 'HGVS,.*_score,*' (1 field, scores, all other fields) + - 'HGVS,*,.*_score' (1 field and all other fields, + scores included in other fields) +default: '*' +``` + +``` +--order_by= + +List of columns to sort the result-set in ascending or descending order. +Use SQL format, and keywords ASC (ascending) and DESC (descending). +If a column is not available, order will not be considered. +Order is enable only for compatible format (e.g. TSV, CSV, JSON). +Examples: + - 'ACMG_score DESC' + - 'PZFlag DESC, PZScore DESC' +default: '' +``` + +``` +--include_header + +Include header (in VCF format) in output file. +Only for compatible formats (tab-delimiter format as TSV or BED). +default: False +``` + +``` +--parquet_partitions= + +Parquet partitioning using huve (only for Parquet export format). +This option is is faster parallel writing, but memory consuming. +Use 'None' (string) for NO partition but split parquet files into a folder +examples: '#CHROM', '#CHROM,REF', 'None' +default: None +``` + + + +## DATABASES +Download databases and needed files for howard and associated tools + +Usage examples: + +> howard databases --assembly=hg19 --download-genomes=/databases/genomes/current --download-genomes-provider=UCSC --download-genomes-contig-regex='chr[0-9XYM]+$' --download-annovar=/databases/annovar/current --download-annovar-files='refGene,cosmic70,nci60' --download-snpeff=/databases/snpeff/current --download-refseq=/databases/refseq/current --download-refseq-format-file='ncbiRefSeq.txt' --download-dbnsfp=/databases/dbnsfp/current --download-dbnsfp-release='4.4a' --download-dbnsfp-subdatabases --download-alphamissense=/databases/alphamissense/current --download-exomiser=/databases/exomiser/current --download-dbsnp=/databases/dbsnp/current --download-dbsnp-vcf --threads=8 + +> howard databases --generate-param=/tmp/param.json --generate-param-description=/tmp/test.description.json --generate-param-formats=parquet + +Notes: + +> - Downloading databases can take a while, depending on network, threads and memory + +> - Proxy: Beware of network and proxy configuration + +> - dbNSFP download: More threads, more memory usage (8 threads ~ 16Gb, 24 threads ~ 32Gb) + + + +### Main options +``` +--assembly= (hg19) + +Default assembly +Default: 'hg19' +``` + +``` +--genomes-folder= (/databases/genomes/current) + +Folder containing genomes +Default: /databases/genomes/current +``` + +### Genomes +``` +--download-genomes= + +Download Genomes within folder +``` + +``` +--download-genomes-provider= (UCSC) + +Download Genome from an external provider +Available: GENCODE, Ensembl, UCSC, NCBI +Default: UCSC + +``` + +``` +--download-genomes-contig-regex= + +Regular expression to select specific chromosome +Default: None +Example: 'chr[0-9XYM]+$' + +``` + +### snpEff +``` +--download-snpeff= + +Download snpEff databases within snpEff folder +``` + +### Annovar +``` +--download-annovar= + +Download Annovar databases within Annovar folder +``` + +``` +--download-annovar-files= + +Download Annovar databases for a list of Annovar file code (see Annovar Doc) +Default: All available files +Example: refGene,gnomad211_exome,cosmic70,clinvar_202*,nci60 +Note: refGene will be at leaset downloaded +Note2: Only file that not exists or with a different size will be downloaded +``` + +``` +--download-annovar-url= 
(http://www.openbioinformatics.org/annovar/download) + +Download Annovar databases URL (see Annovar Doc) +Default: 'http://www.openbioinformatics.org/annovar/download' +``` + +### refSeq +``` +--download-refseq= + +Download refSeq databases within refSeq folder +``` + +``` +--download-refseq-url= (http://hgdownload.soe.ucsc.edu/goldenPath) + +Download refSeq databases URL (see refSeq WebSite) +Default: 'http://hgdownload.soe.ucsc.edu/goldenPath' +``` + +``` +--download-refseq-prefix= (ncbiRefSeq) + +Check existing refSeq files in refSeq folder +Default: 'ncbiRefSeq' +``` + +``` +--download-refseq-files= (ncbiRefSeq.txt,ncbiRefSeqLink.txt) + +List of refSeq files to download +Default: 'ncbiRefSeq.txt,ncbiRefSeqLink.txt' +``` + +``` +--download-refseq-format-file= + +Name of refSeq file to format in BED format +Exemple: 'ncbiRefSeq.txt' +Default: None +``` + +``` +--download-refseq-include-utr5 + +Formating BED refSeq file including 5'UTR +``` + +``` +--download-refseq-include-utr3 + +Formating BED refSeq file including 3'UTR +``` + +``` +--download-refseq-include-chrM + +Formating BED refSeq file including Mitochondiral chromosome 'chrM' or 'chrMT' +``` + +``` +--download-refseq-include-non-canonical-chr + +Formating BED refSeq file including non canonical chromosomes +``` + +``` +--download-refseq-include-non-coding-transcripts + +Formating BED refSeq file including non coding transcripts +``` + +``` +--download-refseq-include-transcript-version + +Formating BED refSeq file including transcript version +``` + +### dbNSFP +``` +--download-dbnsfp= + +Download dbNSFP databases within dbNSFP folder +``` + +``` +--download-dbnsfp-url= (https://dbnsfp.s3.amazonaws.com) + +Download dbNSFP databases URL (see dbNSFP website) +Default: 'https://dbnsfp.s3.amazonaws.com' +``` + +``` +--download-dbnsfp-release= (4.4a) + +Release of dbNSFP to download (see dbNSFP website) +Default: '4.4a' +``` + +``` +--download-dbnsfp-parquet-size= (100) + +Maximum size (Mb) of data files in Parquet folder. +Parquet folder are partitioned (hive) by chromosome (sub-folder), +which contain N data files. +Default: 100 +``` + +``` +--download-dbnsfp-subdatabases + +Generate dbNSFP sub-databases +dbNSFP provides multiple databases which are split onto multiple columns. +This option create a Parquet folder for each sub-database (based on columns names). +``` + +``` +--download-dbnsfp-parquet + +Generate a Parquet file for each Parquet folder. +``` + +``` +--download-dbnsfp-vcf + +Generate a VCF file for each Parquet folder. +Note: Need genome (see --download-genome) +``` + +``` +--download-dbnsfp-no-files-all + +Not generate database Parquet/VCF file for the entire database ('ALL'). +Only sub-databases files will be generated. +(see '--download-dbnsfp-subdatabases') +``` + +``` +--download-dbnsfp-add-info + +Add INFO column (VCF format) in Parquet folder and file. +Useful for speed up full annotation (all available columns). +Increase memory and space during generation of files. +``` + +``` +--download-dbnsfp-row-group-size= (100000) + +minimum number of rows in a parquet row group (see duckDB doc). +Lower can reduce memory usage and slightly increase space during generation, +speed up highly selective queries, slow down whole file queries (e.g. 
aggregations) +Default: 100000 +``` + +### AlphaMissense +``` +--download-alphamissense= + +Download AlphaMissense databases within Annotations folder +``` + +``` +--download-alphamissense-url= (https://storage.googleapis.com/dm_alphamissense) + +Download AlphaMissense databases URL (see AlphaMissense website) +Default: 'https://storage.googleapis.com/dm_alphamissense' +``` + +### Exomiser +``` +--download-exomiser= + +Download Exomiser databases +Folder where the Exomiser databases will be downloaded and stored. +If the folder does not exist, it will be created. +``` + +``` +--download-exomiser-application-properties= + +Exomiser Application Properties configuration file (see Exomiser website) +This file contains configuration settings for the Exomiser tool. +If this parameter is not provided, the function will attempt to locate +the application properties file automatically based on the Exomiser. +Configuration information will be used to download expected releases (if no other parameters) +CADD and REMM will be downloaded only if 'path' are provided + +``` + +``` +--download-exomiser-url= (http://data.monarchinitiative.org/exomiser) + +URL where Exomiser database files can be downloaded from. +Default: 'http://data.monarchinitiative.org/exomiser' +``` + +``` +--download-exomiser-release= + +Release of Exomiser data to download. +If "default", "auto", or "config", retrieve from Application Properties file. +Default: None +``` + +``` +--download-exomiser-phenotype-release= + +Release of Exomiser phenotype to download. +If not provided, retrieve from Application Properties file or Exomiser data release +Default: None +``` + +``` +--download-exomiser-remm-release= + +Release of ReMM (Regulatory Mendelian Mutation) database to download. +If "default", "auto", or "config", retrieve from Application Properties file. +Default: None +``` + +``` +--download-exomiser-remm-url= (https://kircherlab.bihealth.org/download/ReMM) + +URL where ReMM (Regulatory Mendelian Mutation) database files can be downloaded from. +Default: 'https://kircherlab.bihealth.org/download/ReMM' +``` + +``` +--download-exomiser-cadd-release= + +Release of CADD (Combined Annotation Dependent Depletion) database to download. +If "default", "auto", or "config", retrieve from Application Properties file. +Default: None +``` + +``` +--download-exomiser-cadd-url= (https://kircherlab.bihealth.org/download/CADD) + +URL where CADD (Combined Annotation Dependent Depletion) database files can be downloaded from. +Default: 'https://kircherlab.bihealth.org/download/CADD' +``` + +``` +--download-exomiser-cadd-url-snv-file= (whole_genome_SNVs.tsv.gz) + +Name of the file containing the SNV (Single Nucleotide Variant) data +for the CADD (Combined Annotation Dependent Depletion) database. +Default: 'whole_genome_SNVs.tsv.gz' +``` + +``` +--download-exomiser-cadd-url-indel-file= (InDels.tsv.gz) + +Name of the file containing the INDEL (Insertion-Deletion) data +for the CADD (Combined Annotation Dependent Depletion) database. +Default: 'InDels.tsv.gz' +``` + +### dbSNP +``` +--download-dbsnp= + +Download dbSNP databases +Folder where the dbSNP databases will be downloaded and stored. +If the folder does not exist, it will be created. 
+``` + +``` +--download-dbsnp-releases= (b156) + +Release of dbSNP to download +Example: 'b152,b156'Default: 'b156' +``` + +``` +--download-dbsnp-release-default= + +Default Release of dbSNP ('default' symlink) +If None, first release to download will be assigned as default +only if it does not exists +Example: 'b156' +Default: None (first releases by default) +``` + +``` +--download-dbsnp-url= (https://ftp.ncbi.nih.gov/snp/archive) + +URL where dbSNP database files can be downloaded from. +Default: 'https://ftp.ncbi.nih.gov/snp/archive' +``` + +``` +--download-dbsnp-url-files= + +Dictionary that maps assembly names to specific dbSNP URL files. +It allows you to provide custom dbSNP URL files for specific assemblies +instead of using the default file naming convention +Default: None +``` + +``` +--download-dbsnp-url-files-prefix= (GCF_000001405) + +String that represents the prefix of the dbSNP file name for a specific assembly. +It is used to construct the full URL of the dbSNP file to be downloaded. +Default: 'GCF_000001405' +``` + +``` +--download-dbsnp-assemblies-map= ({'hg19': '25', 'hg38': '40'}) + +dictionary that maps assembly names to their corresponding dbSNP versions. +It is used to construct the dbSNP file name based on the assembly name. +Default: {"hg19": "25", "hg38": "40"} +``` + +``` +--download-dbsnp-vcf + +Generate well-formatted VCF from downloaded file: +- Add and filter contigs associated to assembly +- Normalize by splitting multiallelics - Need genome (see --download-genome) +``` + +``` +--download-dbsnp-parquet + +Generate Parquet file from VCF + +``` + +### HGMD +``` +--convert-hgmd= + +Convert HGMD databases +Folder where the HGMD databases will be stored. +Fields in VCF, Parquet and TSV will be generated. +If the folder does not exist, it will be created. +``` + +``` +--convert-hgmd-file= + +File from HGMD +Name format 'HGMD_Pro__.vcf.gz'. +``` + +``` +--convert-hgmd-basename= + +File output basename +Generated files will be prefixed by basename. +Example: 'HGMD_Pro_MY_RELEASE' +Default: Use input file name without '.vcf.gz' +``` + +### Parameters file +``` +--generate-param= + +Parameter file (JSON) with all databases found. +Databases folders scanned are defined in config file. +Structure of databases follow this structure (see doc): + ...////*.[parquet|vcf.gz|...] +``` + +``` +--generate-param-description= + +Description file (JSON) with all databases found. +Contains all databases with description of format, assembly, fields... +``` + +``` +--generate-param-releases= (current) + +List of database folder releases to check +Examples: 'current', 'latest' +Default: 'current' +``` + +``` +--generate-param-formats= (parquet) + +List of database formats to check (e.g. parquet, vcf, bed, tsv...) +Examples: 'parquet', 'parquet,vcf,bed,tsv' +Default: 'parquet' +``` + +``` +--generate-param-bcftools + +Generate parameter file with BCFTools annotation for allowed formats +Allowed formats with BCFTools: 'vcf', 'bed' +``` + + + +## FROM_ANNOVAR +(beta) Formatting Annovar database file to other format (VCF and Parquet). 
Exported Parquet file includes INFO/tags columns as VCF INFO columns had been exploded + +Usage examples: + +> howard from_annovar --input=tests/databases/others/hg19_nci60.txt --output=/tmp/nci60.from_annovar.vcf.gz --to_parquet=/tmp/nci60.from_annovar.parquet --annovar-code=nci60 --genome=/databases/genomes/current/hg19.fa --config=/tool/config/config.json --threads=8 + +### Main options +``` +--input= | required + +Input file path +Format: BCF, VCF, TSV, CSV, PSV, Parquet or duckDB +Files can be compressesd (e.g. vcf.gz, tsv.gz) +``` + +``` +--output= | required + +Output file path +Format: BCF, VCF, TSV, CSV, PSV, Parquet or duckDB +Files can be compressesd (e.g. vcf.gz, tsv.gz) +``` + +``` +--genome= (hg19.fa) | required + +Genome file in fasta format +Default: 'hg19.fa' +``` + +### Annovar +``` +--annovar-code= + +Annovar code, or database name. Usefull to name databases columns +``` + +### Parquet +``` +--to_parquet= + +Parquet file conversion + +``` + +### Modes +``` +--reduce_memory= (auto) + +Reduce memory option +Values: 'auto' (auto-detection), 'enable', 'disable' +default: 'auto' +``` + +``` +--multi_variant= (auto) + +Variant with multiple annotation lines +Values: 'auto' (auto-detection), 'enable', 'disable' +default: 'auto' +``` + + + +## HELP +Help tools + +Usage examples: + +> howard help --help_md=/tmp/howard.help.mk --help_html=/tmp/howard.help.html + +### Main options +``` +--help_md= + +Help Output file in MarkDown format + +``` + +``` +--help_html= + +Help Output file in HTML format + +``` + + + +## Shared arguments +``` +--config= ({}) + +Configuration file +Default: {} +``` + +``` +--threads= (-1) + +Number of threads to use +Use -1 to detect number of CPU/cores +Default: -1 +``` + +``` +--memory= + +Memory to use (FLOAT[kMG]) +Default: None (80%% of RAM) +``` + +``` +--chunk_size= (1000000) + +Number of records in batch to export output file. +The lower the chunk size, the less memory consumption. +For Parquet partitioning, files size will depend on the chunk size. +default: 1000000 +``` + +``` +--tmp= + +Temporary folder. +Especially for duckDB, default '.tmp' (see doc). +default: None +``` + +``` +--duckdb_settings= + +DuckDB settings (see duckDB doc) as JSON (string or file). +These settings have priority (see options 'threads', 'tmp'...). +Examples: '{"TimeZone": "GMT", "temp_directory": "/tmp/duckdb", "threads": 8}' +default: None +``` + +``` +--verbosity= ['CRITICAL', 'ERROR', 'WARNING', 'INFO', 'DEBUG', 'NOTSET'] (INFO) + +Verbosity level +Available: CRITICAL, ERROR, WARNING, INFO, DEBUG or NOTSET +Default: INFO +``` + +``` +--log= + +Logs file +Example: 'my.log' +Default: None +``` + +``` +--quiet + +==SUPPRESS== +``` + +``` +--verbose + +==SUPPRESS== +``` + +``` +--debug + +==SUPPRESS== +``` + diff --git a/howard/main.py b/howard/main.py index fcc1bbe..e45ea6e 100755 --- a/howard/main.py +++ b/howard/main.py @@ -16,6 +16,7 @@ import logging as log import sys import psutil +import markdown from howard.objects.variants import Variants from howard.objects.database import Database @@ -56,34 +57,84 @@ def main() -> None: subparsers = parser.add_subparsers(title="Tools", dest='command') + options_mk = "" + options_html = "" + + options_mk += f"# HOWARD Help\n" + options_mk += f"HOWARD Commands and Options\n" + options_html += f"

<head><title>HOWARD Help - Commands and Options</title></head>
\n" + options_html += f"

<h1>HOWARD Commands and Options</h1>
\n" + # Create commands arguments for command in commands_arguments: + + command_description = commands_arguments[command].get("description","") + command_help = commands_arguments[command].get("help","") + command_epilog = commands_arguments[command].get("epilog","") + command_parser = subparsers.add_parser( command, - description = commands_arguments[command].get("description",""), - help = commands_arguments[command].get("help",""), - epilog = commands_arguments[command].get("epilog",""), + description = command_description, + help = command_help, + epilog = command_epilog, formatter_class=argparse.RawTextHelpFormatter ) + + # Markdown + options_mk += f"## {command.upper()}\n" + options_mk += command_description.replace("\n","\n\n") + options_mk += "\n\n" + options_mk += re.sub(r'> $',"",command_epilog.replace("\n","\n\n").replace(" ","> ")) + options_mk += "\n\n" + + # HTML + options_html += f"

<h2>{command.upper()}</h2>
\n" + options_html += "

" + command_description.replace("\n","
") + "

" + options_html += command_epilog.replace("\n","
").replace(" ","   ") + + # Main args command_parser._optionals.title = "Options" if "main" in commands_arguments[command]["groups"]: - for arg in commands_arguments[command]["groups"]["main"]: - required = commands_arguments[command]["groups"]["main"][arg] - command_parser.add_argument(f"--{arg}", **get_argument(arguments=arguments, arg=arg, required=required)) + group = "main" + options_mk += f"### Main options\n" + options_html += f"

<h3>Main options</h3>
\n" + for arg in commands_arguments[command]["groups"][group]: + required = commands_arguments[command]["groups"][group][arg] + argument = get_argument(arguments=arguments.copy(), arg=arg, required=required) + command_parser.add_argument(f"--{arg}", **argument) + options_mk += get_argument_to_mk(arg, argument) + options_html += get_argument_to_mk(arg, argument, mode="html") for group in commands_arguments[command]["groups"]: if group != "main": + options_mk += f"### {group}\n" + options_html += f"

<h3>{group}</h3>
\n" command_group = command_parser.add_argument_group(f"{group} options") for arg in commands_arguments[command]["groups"][group]: required = commands_arguments[command]["groups"][group][arg] - command_group.add_argument(f"--{arg}", **get_argument(arguments=arguments, arg=arg, required=required)) + argument = get_argument(arguments=arguments.copy(), arg=arg, required=required) + command_group.add_argument(f"--{arg}", **argument) + options_mk += get_argument_to_mk(arg, argument) + options_html += get_argument_to_mk(arg, argument, mode="html") # Shared arguments shared_group = command_parser.add_argument_group('Shared options') for arg in shared_arguments: shared_group.add_argument(f"--{arg}", **get_argument(arguments=arguments, arg=arg, required=False)) + #options_mk += "> " + re.sub(r'> $',"",command_epilog.replace("\n","\n\n> ")) + options_mk += "\n\n" + + + options_mk += f"## Shared arguments\n" + options_html += f"

<h2>Shared arguments</h2>
\n".upper() + for arg in shared_arguments: + required = False + argument = get_argument(arguments=arguments.copy(), arg=arg, required=required) + options_mk += get_argument_to_mk(arg, argument) + options_html += get_argument_to_mk(arg, argument, mode="html") + # Parse args args, remaining = parser.parse_known_args() @@ -175,6 +226,20 @@ def main() -> None: if not args.command: parser.print_help() return + elif args.command == "help": + if "help_md" in args and args.help_md is not None: + help_file = args.help_md.name + f = open(help_file, "w") + f.write(options_mk) + f.close() + if "help_html" in args and args.help_html is not None: + help_file = args.help_html.name + f = open(help_file, "w") + f.write(options_html) + f.close() + else: + parser.print_help() + return else: command_function = commands_arguments[args.command]["function"] log.debug(f"Command/Tool: {command_function}") diff --git a/howard/tools/tools.py b/howard/tools/tools.py index b1f25f5..333e1c3 100644 --- a/howard/tools/tools.py +++ b/howard/tools/tools.py @@ -93,8 +93,7 @@ """ - Use 'ALL::'\n""" """ - e.g. 'ALL', 'ALL:parquet:current', 'ALL:parquet,vcf:devel'\n""" """For snpeff annotation, use keyword 'snpeff'\n""" - """For Annovar annotation, use keyword 'annovar' with annovar code (e.g. 'annovar:refGene', 'annovar:cosmic70')\n""" - , + """For Annovar annotation, use keyword 'annovar' with annovar code (e.g. 'annovar:refGene', 'annovar:cosmic70')""", "default": None, "gooey": { "widget": "MultiFileChooser" @@ -113,8 +112,7 @@ """ TRIO """ """ VAF """ """ VAF_STATS """ - """ DP_STATS """ - , + """ DP_STATS """, "default": None }, "prioritizations": { @@ -761,9 +759,9 @@ "download-dbsnp-release-default": { "metavar": "dnSNP release default", "help": """Default Release of dbSNP ('default' symlink)\n""" - """If None, first release to download will be assigned as dafault\n""" + """If None, first release to download will be assigned as default\n""" """only if it does not exists\n""" - """Example: 'b156'""" + """Example: 'b156'\n""" """Default: None (first releases by default)""", "required": False, "default": None @@ -898,6 +896,26 @@ "default": None }, + # Help + "help_md": { + "metavar": "help markdown", + "help": """Help Output file in MarkDown format\n""", + "required": False, + "type": argparse.FileType('w'), + "gooey": { + "widget": "FileSaver" + } + }, + "help_html": { + "metavar": "help html", + "help": """Help Output file in HTML format\n""", + "required": False, + "type": argparse.FileType('w'), + "gooey": { + "widget": "FileSaver" + } + }, + # Common "genomes-folder": { "metavar": "genomes", @@ -1332,6 +1350,19 @@ "multi_variant": False } } + }, + "help": { + "function" : "help_output", + "description": """Help tools""", + "help": """Help tools""", + "epilog": """Usage examples:\n""" + """ howard help --help_md=/tmp/howard.help.mk --help_html=/tmp/howard.help.html """, + "groups": { + "main": { + "help_md": False, + "help_html": False, + } + } } } @@ -1408,4 +1439,67 @@ def get_argument_gooey(arg:str): options = gooey_argument.get("options", {}) # Return - return widget, options \ No newline at end of file + return widget, options + + +# get argument +def get_argument_to_mk(arg:str, argument:dict = {}, mode:str = "mk") -> str: + """ + The function `get_argument_to_mk` generates a formatted string containing information about a + command line argument, which can be output in either Markdown or HTML format. + + :param arg: The `arg` parameter is a string that represents the name of the argument. 
It is used to + generate the header and text for the argument + :type arg: str + :param argument: The `argument` parameter is a dictionary that contains information about the + argument. It has the following keys: + :type argument: dict + :param mode: The `mode` parameter is used to specify the format of the output. It can have two + possible values: "mk" or "html". If "mk" is specified, the output will be formatted using Markdown + syntax. If "html" is specified, the output will be formatted using HTML syntax, defaults to mk + :type mode: str (optional) + :return: a formatted string that provides information about a command line argument. The format of + the string depends on the value of the `mode` parameter. If `mode` is set to "html", the string is + formatted as an HTML `
<pre>` block. Otherwise, the string is formatted as a Markdown code block. The
+    string includes the argument name, metavariable, help text, required status, choices and default value.
+    """
+
+    from html import escape
+
+    text = ""
+
+    # Option info
+    metavar = argument.get("metavar",arg)
+    help = argument.get("help",None)
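+    # NOTE: this local 'help' holds the argument's help text and shadows the Python builtin inside this function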
+    required = argument.get("required",None)
+    choices = argument.get("choices",None)
+    default = argument.get("default",None)
+    action = argument.get("action",None)
+
+    # header
+    text_header = f"--{arg}"
+    if not action:
+        text_header += f"=<{metavar}>"
+    if choices:
+        text_header += f" {choices}"
+    if default:
+        text_header += f" ({default})"
+    if required:
+        text_header += " | required"
+    
+    # text
+    if mode == "html":
+        text += f"
"
+        text += escape(text_header)
+        text += "\n"
+        text += escape(help)
+        text += "\n\n"
+        text += f"
" + else: + text += f"```\n" + text += text_header + text += "\n\n" + text += help + text += "\n```\n\n" + + return text