Update/locidex #96

Merged
merged 61 commits into dev from update/locidex on Aug 23, 2024
Changes from 16 commits
Commits (61)
c342ef4
starting locidex database integrations
mattheww95 Jul 29, 2024
4eff5ce
updated added new files
mattheww95 Jul 29, 2024
b06371b
updated locidex db identification
mattheww95 Jul 29, 2024
e711e24
updated automated allele selection
mattheww95 Jul 29, 2024
df4467f
updated todos
mattheww95 Jul 29, 2024
caba99c
added test data for locidex databases, tests should still function:
mattheww95 Jul 30, 2024
c5659dd
updated nextflow schema.json
mattheww95 Jul 30, 2024
b172dcb
Removed tests for comparing updates for old and new locidex profiles
mattheww95 Jul 31, 2024
41c1893
removed dead test file
mattheww95 Jul 31, 2024
a32dd11
updated test profile
mattheww95 Jul 31, 2024
6b4d283
restrucutured file staging for locidex select
mattheww95 Aug 1, 2024
fd6a1a6
started creation of real tests for locidex workflow
mattheww95 Aug 1, 2024
120b148
updated locidex test data
mattheww95 Aug 1, 2024
37b7a46
updated locidex workflow and tests
mattheww95 Aug 1, 2024
384fbce
updated docker container
mattheww95 Aug 1, 2024
dccd27f
addressed PR issues, still more work to be done
mattheww95 Aug 2, 2024
2fb4ac9
updated database selection functions to get an optimal match
mattheww95 Aug 2, 2024
1d43ecb
added skipped allele calling to test config
mattheww95 Aug 2, 2024
19fa250
updated test conditions
mattheww95 Aug 6, 2024
21681c2
updated test files
mattheww95 Aug 6, 2024
3cd4a28
updated locidex select paths
mattheww95 Aug 7, 2024
607c1c1
removed trailing whitespace
mattheww95 Aug 7, 2024
dd5c992
updated tests to include passing cases
mattheww95 Aug 7, 2024
0dfe87f
added locidex summary process
mattheww95 Aug 7, 2024
bae1567
updated fixe issues with integer sizing
mattheww95 Aug 7, 2024
4d2c0a4
debugging summary test
mattheww95 Aug 7, 2024
0d08952
Added field for reportable alleles
mattheww95 Aug 8, 2024
b925664
updated reportable loci test
mattheww95 Aug 8, 2024
0542162
updated locidex summary tests
mattheww95 Aug 8, 2024
ca570d4
updated missing allelels JSON type
mattheww95 Aug 8, 2024
f59d1a4
upated not database selected output
mattheww95 Aug 8, 2024
b0e8f7b
updated IRIDANEXT config for locidex values
mattheww95 Aug 8, 2024
b58c7bd
Merge branch 'dev' of github.com:phac-nml/mikrokondo into update/locidex
mattheww95 Aug 8, 2024
0e5daff
updated locidex end-to-end tests
mattheww95 Aug 8, 2024
6497afa
updated locations of locidex test
mattheww95 Aug 8, 2024
c4d45c6
Merge branch 'dev' of github.com:phac-nml/mikrokondo into update/locidex
mattheww95 Aug 8, 2024
dba1ca2
zipped and updated sample datasets
mattheww95 Aug 8, 2024
2704c5e
Merge branch 'dev' of github.com:phac-nml/mikrokondo into update/locidex
mattheww95 Aug 8, 2024
8b5d169
upated locidex end to end tests
mattheww95 Aug 8, 2024
b63cf7e
fixed typos in change log
mattheww95 Aug 8, 2024
d070156
updated locidex tests
mattheww95 Aug 9, 2024
d07257e
cleaned up datebase name parsing
mattheww95 Aug 9, 2024
8669f14
fixed code comment
mattheww95 Aug 12, 2024
4ca519b
updated code comments
mattheww95 Aug 12, 2024
4883b39
made updates on PR comments
mattheww95 Aug 12, 2024
673349a
normalized output for iridanext across allele scheme input options
mattheww95 Aug 12, 2024
e3d3c8a
updated test cases
mattheww95 Aug 13, 2024
d97e6ce
updated locidex_select tests to match updated interface
mattheww95 Aug 13, 2024
a341fc6
updated test to work in github actions
mattheww95 Aug 13, 2024
e938535
removed sneaky todo
mattheww95 Aug 13, 2024
215a70b
updating issues identified and regression in kraken2 header parsing
mattheww95 Aug 19, 2024
b9d1b04
reverted zipped file handling
mattheww95 Aug 19, 2024
9abcd03
fixed regression locidex select
mattheww95 Aug 20, 2024
2a4c9ba
removed repeated variables
mattheww95 Aug 20, 2024
0e60b8d
updated staging of report input to locidex summarize
mattheww95 Aug 20, 2024
dba0190
updated file parsing in locidex summarize
mattheww95 Aug 20, 2024
5d10389
updated changelog
mattheww95 Aug 20, 2024
32327f0
extracted inline code in function
mattheww95 Aug 20, 2024
814e45e
bumped locidex container
mattheww95 Aug 21, 2024
034d9c8
updated final todos
mattheww95 Aug 22, 2024
de0d30e
updated options to locidex_select test to terminate processes on failure
mattheww95 Aug 23, 2024
11 changes: 10 additions & 1 deletion conf/modules.config
@@ -159,6 +159,16 @@ process {
errorStrategy = "ignore"
}

withName: LOCIDEX_SELECT {
executor = 'local'
publishDir = [
mode: params.publish_dir_mode,
path: { ["${task.assembly_subtyping_directory_name}", "Locidex", "Select"].join(File.separator) },
pattern: "*.json",
saveAs: { filename -> filename.equals('versions.yml') ? null : reformat_output(filename, null, "locidex.db", meta) }
]
}

withName: REPORT_AGGREGATE {
ext.parameters = params.python3
cache = 'false' // Resume does not work on module, if enabled a warning is thrown
@@ -178,7 +188,6 @@ process {
]
}


withName: BIN_KRAKEN2 {
ext.parameters = params.python3
maxForks = 20
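For orientation, the LOCIDEX_SELECT publishDir path above is built by joining directory parts with the platform separator. A minimal sketch of how that closure resolves, with the subtyping directory name assumed purely for illustration:

    // Minimal sketch of the path closure above; the directory name is a hypothetical value.
    def assembly_subtyping_directory_name = "SubtypingReport"
    def selectDir = [assembly_subtyping_directory_name, "Locidex", "Select"].join(File.separator)
    assert selectDir == "SubtypingReport/Locidex/Select"   // on POSIX systems, where File.separator is "/"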
2 changes: 1 addition & 1 deletion conf/test.config
@@ -33,7 +33,7 @@ params {
r_contaminants.mega_mm2_idx = dehosting_idx
kraken2_db = "${projectDir}/tests/data/kraken2/test"
kraken.db = kraken2_db

locidex.allele_database = "${projectDir}/tests/data/databases/locidex_dbs"
fastp.args.illumina = "-Q"
min_reads = 100

6 changes: 3 additions & 3 deletions modules/local/locidex_report.nf
@@ -24,13 +24,13 @@ process LOCIDEX_REPORT {
fi
locidex report -i $seq_store_name -o . --name ${meta.id} \\
--mode ${params.locidex.report_mode} \\
--prop ${params.locidex.report_prop} \\
--max_ambig ${params.locidex.report_max_ambig} \\
--max_stop ${params.locidex.report_max_stop} \\
--prop ${params.locidex.report_prop} \\
--force

gzip -c profile.json > $output_name
rm profile.json
gzip -c report.json > $output_name
rm report.json

cat <<-END_VERSIONS > versions.yml
"${task.process}":
3 changes: 1 addition & 2 deletions modules/local/locidex_search.nf
@@ -8,7 +8,6 @@ process LOCIDEX_SEARCH {
label "process_medium"
container "${workflow.containerEngine == 'singularity' || workflow.containerEngine == 'apptainer' ? task.ext.parameters.get('singularity') : task.ext.parameters.get('docker')}"


input:
tuple val(meta), path(fasta), path(db)

@@ -43,7 +42,7 @@
--min_aa_match_cov ${params.locidex.min_aa_match_cov} \\
--max_target_seqs ${params.locidex.max_target_seqs}

gzip -c seq_store.json > $output_json && rm seq_store.json
test -f seq_store.json && gzip -c seq_store.json > $output_json && rm seq_store.json
test -f annotations.gbk && gzip -c annotations.gbk > $output_gbk && rm annotations.gbk

cat <<-END_VERSIONS > versions.yml
208 changes: 208 additions & 0 deletions modules/local/locidex_select.nf
@@ -0,0 +1,208 @@
/*
Locidex provides the option to select a database from a group of databases by
using a manifest file. To avoid copying all databases each time an allele database
needs to be selected, this module reads only the "manifest" file of the databases
to pick the correct allele scheme.


The locidex manifest is set up as:
{
"db_name": [
{
"path": "path/to/db/relative/to/manifest",
# DB Config data, the newest db data will be selected as versions are not standardized
"config": {
"db_name": "Locidex Database 1",
"db_version": "1.0.0",
"db_date": "yyyy-MM-dd",
"db_author": "test1",
"db_desc": "test1",
"db_num_seqs": 53,
"is_nucl": true,
"is_prot": true,
"nucleotide_db_name": "nucleotide",
"protein_db_name": "protein"
}
}
]
}
*/

import groovy.json.JsonSlurper
import groovy.json.JsonBuilder
import java.text.SimpleDateFormat


process LOCIDEX_SELECT {
tag "$meta.id"
label "process_single"

input:
tuple val(meta), val(top_hit), val(contigs)
val manifest // This is a json file to be parsed

output:
tuple val(meta), val(contigs), val(scheme), val(paired_p), emit: db_data
tuple val(meta), path(output_config), emit: config_data

exec:
if(params.allele_scheme == null && params.locidex.allele_database == null){
exit 1, "Allele calling is enabled but there is no allele scheme or locidex allele database location present."
}

// Tokenize the "top_hit" or species value to identify all relevant match parts of the string
def species_data = top_hit.split('_|\s')
species_data = species_data*.toLowerCase()

// De-serialize the manifest file from the database location
def jsonSlurper = new JsonSlurper()
String json_data = manifest.text
def allele_db_data = jsonSlurper.parseText(json_data)
def allele_DB_KEYs = allele_db_data.keySet() as String[]

// Tokenize all database keys for lookup of species top hit in the database names
def databases = []
def shortest_entry = Integer.MAX_VALUE
for(allele_db in allele_DB_KEYs){
def db_tokens = allele_db.split('_|\s')
for(token in db_tokens){
def tok_size = token.size()
if(tok_size < shortest_entry){
shortest_entry = tok_size
}
}
databases.add(new Tuple(db_tokens*.toLowerCase(), allele_db))
}

def DB_TOKES_POS = 0
def DB_KEY = 1

// Remove spurious characters from tokenized string
species_data = species_data.findAll { it.size() >= shortest_entry }

// The selected locidex database starts as null, as no default should be set here;
// a default database can be configured, but this process will then be skipped
def db_opt = null


paired_p = false // Sets predicate for db identification as false by default
scheme = null
report_name = "${meta.id}_${params.locidex.db_config_output_name}"
output_config = task.workDir.resolve(report_name)

for(db in databases){
// TODO not getting best matches, currently
def match_size = db[DB_TOKES_POS].size() // Prevent single token matches
def tokens = window_string(species_data, match_size)
def db_found = compare_lists(db[DB_TOKES_POS], tokens)
if(db_found){
def selected_db = select_locidex_db_path(allele_db_data[db[DB_KEY]], db[DB_KEY])
/// Write selected allele database info to a file for the final report
write_config_data(selected_db, output_config)
scheme = join_database_paths(selected_db)
paired_p = db_found
break
}
}

if(!paired_p){
write_config_data(["No database selected."], output_config)
}

}


def write_config_data(db_data, output_name){
/// Config data for db to use
def json_data = new JsonBuilder(db_data).toPrettyString()
def output_file = file(output_name).newWriter()
output_file.write(json_data)
output_file.close()
}

def join_database_paths(db_path){
/// Database paths are relative to the manifest; hopefully this will not cause many issues on cloud executors
def input_dir_path = [params.lx_allele_database, db_path[params.locidex.manifest_db_path]].join(File.separator)
return input_dir_path
}

def select_locidex_db_path(db_values, db_name){
/// Select the optimal locidex database by parsing the date fields for the organism
/// Database entries are labeled by date, so the most recent one is chosen
/// Each db value is an object containing the path field and the config fields
/// db_values: the list of database config information in the manifest


def database_entries = db_values.size()
def default_date = new SimpleDateFormat(params.locidex.date_format_string).parse("0001-01-01")
def max_date = default_date
def max_date_entry = null
def dates = []

// Validate all input fields
for(idx in 0..<db_values.size()){
def db_entry = db_values[idx]
if(!db_entry.containsKey(params.locidex.manifest_db_path)){
exit 1, "Missing path value in locidex config for: ${db_name}"
}
if(!db_entry.containsKey(params.locidex.manifest_config_key)){
exit 1, "Missing config data for locidex database entry: ${db_name}"
}
if(!db_entry[params.locidex.manifest_config_key].containsKey(params.locidex.database_config_value_date)){
exit 1, "Missing date created value for locidex database entry: ${db_name}"
}
def date_value = db_entry[params.locidex.manifest_config_key][params.locidex.database_config_value_date]
def date_check = new SimpleDateFormat(params.locidex.date_format_string).parse(date_value)
dates.add(date_check)
if(date_check > max_date){
max_date = date_check
max_date_entry = db_entry
}
}

def max_date_count = dates.count(max_date)
if(max_date_count > 1){
exit 1, "There are multiple versions of the most recent database for ${db_name}. Mikrokondo could not determine the best database to pick."
}else if (max_date_count == 0){
exit 1, "There are not databases created after the year ${defualt_date}. Please set the allele database parameter, or adjust the date your database was created in the 'config.json'"
}else if (max_date_entry == null){
exit 1, "Could not select a database for locidex sample. ${meta.id}"
}
return max_date_entry
}


def window_string(species, match_size){
/*
Create an array of string windows of a given match size, for comparison against another value later on.

e.g. species is an array of ["1", "2", "3", "4"] and match_size is 2; the output will be:
[
["1", "2"],
["2", "3"],
["3", "4"]
]
*/
def tiles = []
def adj_match_size = match_size - 1
for(int spot = 0; spot < species.size()-adj_match_size; spot = spot + 1){
tiles.add(species[spot..spot + adj_match_size])
}
return tiles
}

def compare_lists(db_string_windows, species_tokens){
/* Compare the various windows until the right db is found.
db_string_windows is an array like [["1", "2"], ["2", "3"], ["3", "4"]] and species_tokens would be ["2", "3"]

TODO need to add a match size
*/

for(window in db_string_windows){
if(window == species_tokens){
return true
}
}
return false
}
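As a condensed, standalone illustration of the date-based choice select_locidex_db_path makes above (the entries are hypothetical, and Groovy's max is used here for brevity rather than the module's explicit loop):

    import java.text.SimpleDateFormat

    // Pick the most recently dated entry for a single database key (illustrative data only).
    def dateFormat = new SimpleDateFormat("yyyy-MM-dd")
    def dbEntries = [
        [path: "salmonella/1.0.0", config: [db_name: "Salmonella enterica", db_date: "2023-05-01"]],
        [path: "salmonella/2.0.0", config: [db_name: "Salmonella enterica", db_date: "2024-08-01"]]
    ]
    def newest = dbEntries.max { dateFormat.parse(it.config.db_date) }
    assert newest.path == "salmonella/2.0.0"

    // Like the module, refuse to guess when two entries share the newest date.
    def newestDate = dateFormat.parse(newest.config.db_date)
    assert dbEntries.count { dateFormat.parse(it.config.db_date) == newestDate } == 1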

3 changes: 2 additions & 1 deletion modules/local/select_pointfinder.nf
@@ -19,7 +19,7 @@ process IDENTIFY_POINTDB {
def species_data = species.split('_|\s') // tokenize string
species_data = species_data*.toLowerCase()

def overly_large_number = 100000
def overly_large_number = Integer.MAX_VALUE
def databases = []
// tokenize database options
def shortest_entry = overly_large_number
@@ -57,6 +57,7 @@ process IDENTIFY_POINTDB {


def tokenize_values(species, match_size){
// Create tiled values to match on, e.g. input is Salmonella enterica enterica -> [Salmonella, Salmonella enterica, Salmonella enterica enterica]
def tokens = []
def adj_match_size = match_size - 1
for(int spot = 0; spot < species.size()-adj_match_size; spot = spot + 1){
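Both IDENTIFY_POINTDB and LOCIDEX_SELECT lean on the same sliding-window token matching. A minimal standalone sketch of that idea, with a hypothetical species string and database key:

    // Build fixed-size windows over the species tokens and look for an exact window match (illustrative only).
    def windowTokens = { List<String> tokens, int matchSize ->
        def tiles = []
        for (int spot = 0; spot < tokens.size() - (matchSize - 1); spot++) {
            tiles.add(tokens[spot..spot + matchSize - 1])
        }
        tiles
    }

    def species = "Salmonella enterica enterica".toLowerCase().split('_| ') as List
    def dbTokens = ["salmonella", "enterica"]            // hypothetical database key, already tokenized
    def windows = windowTokens(species, dbTokens.size())
    assert windows == [["salmonella", "enterica"], ["enterica", "enterica"]]
    assert windows.any { it == dbTokens }                // a window matches, so this database would be selected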
26 changes: 16 additions & 10 deletions nextflow.config
@@ -43,7 +43,7 @@ params {
show_hidden_params = false
validationS3PathCheck = true
validationShowHiddenParams = false
validationSchemaIgnoreParams = 'top_hit_method,abricate,locidex,assembly_status,bakta,bandage,checkm,chopper,contigs_too_short,coreutils,coverage_calc_fields,ectyper,fastp,fastqc,filtered_reads,flye,kat,kleborate,kraken,kraken_bin,kraken_species,lissero,mash,mash_meta,medaka,minimap2,mlst,mobsuite_recon,opt_platforms,pilon,pilon_iterative,pointfinder_db_tag,python3,QCReport,QCReport-fields,QCReportFields,quast,racon,raw_reads,report_aggregate,r_contaminants,samtools,seqkit,seqtk,seqtk_size,shigeifinder,sistr,spades,spatyper,staramr,subtyping_report,top_hit_species,unicycler'
validationSchemaIgnoreParams = 'allele_scheme_selected,top_hit_method,abricate,locidex,assembly_status,bakta,bandage,checkm,chopper,contigs_too_short,coreutils,coverage_calc_fields,ectyper,fastp,fastqc,filtered_reads,flye,kat,kleborate,kraken,kraken_bin,kraken_species,lissero,mash,mash_meta,medaka,minimap2,mlst,mobsuite_recon,opt_platforms,pilon,pilon_iterative,pointfinder_db_tag,python3,QCReport,QCReport-fields,QCReportFields,quast,racon,raw_reads,report_aggregate,r_contaminants,samtools,seqkit,seqtk,seqtk_size,shigeifinder,sistr,spades,spatyper,staramr,subtyping_report,top_hit_species,unicycler'
validationFailUnrecognisedParams = false // for the qcreport fields

// SKIP options
@@ -128,6 +128,7 @@ params {
lx_report_prop = "locus_name"
lx_report_max_ambig = 0
lx_report_max_stop = 0
lx_allele_database = null

// Override an allele calling scheme; this will be applied globally if auto selection is not opted for
allele_scheme = null
@@ -214,9 +215,8 @@ params {

locidex {
// awaiting singularity image build
//singularity = "https://depot.galaxyproject.org/singularity/locidex%3A0.1.1--pyhdfd78af_1"
singularity = "quay.io/biocontainers/locidex:0.1.1--pyhdfd78af_1"
docker = "quay.io/biocontainers/locidex:0.1.1--pyhdfd78af_1"
singularity = "docker.io/mwells14/locidex:0.2.2"
docker = "docker.io/mwells14/locidex:0.2.2"
min_evalue = params.lx_min_evalue
min_dna_len = params.lx_min_dna_len
min_aa_len = params.lx_min_aa_len
@@ -232,17 +232,23 @@
report_prop = params.lx_report_prop
report_max_ambig = params.lx_report_max_ambig
report_max_stop = params.lx_report_max_stop
allele_database = params.lx_allele_database
date_format_string = "yyyy-MM-dd"
manifest_db_path = "path"
manifest_config_key = "config"
manifest_name = "manifest.json"
database_config_value_date = "db_date"
extracted_seqs_suffix = ".extracted.seqs.fasta.gz"
seq_store_suffix = ".seq_store.json.gz"
gbk_suffix = ".gbk.gz"
extraction_dir = "extracted"
report_suffix = ".profile.mlst.json.gz"
schemes {
salmonella {
search = params.QCReport.salmonella
db = null
}
}
db_config_output_name = "SelectedLocidexConfig.json"
report_tag = "LocidexDatabaseInformation"
}

allele_scheme_selected {
report_tag = "AlleleSchemeUsed"
}

// FASTP options
7 changes: 7 additions & 0 deletions nextflow_schema.json
@@ -664,6 +664,13 @@
"default": 0,
"description": "Maximum number of internal stop codons allowed in a sequence.",
"minimum": 0
},
"lx_allele_database": {
"type": "string",
"description": "Folder of locidex databases. The folder should contain a 'manifest.json' file created by locidex",
"pattern": "^\\S+$",
"exists": true,
"format": "directory-path"
}
}
},
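Since lx_allele_database must point at a folder containing a locidex-generated manifest.json, here is a minimal sketch of reading such a manifest with the key names the pipeline configures (manifest_db_path = "path", manifest_config_key = "config", database_config_value_date = "db_date"); the manifest content itself is hypothetical:

    import groovy.json.JsonSlurper
    import java.text.SimpleDateFormat

    // Hypothetical manifest.json content; real manifests are produced by locidex itself.
    def manifestText = '''
    {
      "salmonella_enterica": [
        {
          "path": "salmonella_enterica/1.0.0",
          "config": { "db_name": "Salmonella enterica", "db_version": "1.0.0", "db_date": "2024-08-01" }
        }
      ]
    }
    '''
    def entry = new JsonSlurper().parseText(manifestText)["salmonella_enterica"][0]
    def dbDir = entry["path"]                                                    // manifest_db_path
    def dbDate = new SimpleDateFormat("yyyy-MM-dd").parse(entry["config"]["db_date"])
    assert dbDir == "salmonella_enterica/1.0.0"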