Merge pull request #135 from EBI-Metagenomics/fix/virsorter2

Fix/virsorter2
EBI-Metagenomics · Nov 8, 2024 · ac06be8 · ac06be8
2 parents 0c20229 + 43e118b
commit ac06be8
Show file tree

Hide file tree

Showing 24 changed files with 5,731 additions and 51 deletions.
diff --git a/.github/workflows/unit_tests.yml b/.github/workflows/unit_tests.yml
@@ -32,5 +32,5 @@ jobs:
     - name: Unit tests
       run: |
         # TODO, improve the pythonpath handling
-        export PYTHONPATH=$PYTHONPATH:bin 
+        export PYTHONPATH="$PYTHONPATH:bin" 
         python -m unittest discover tests
diff --git a/bin/parse_viral_pred.py b/bin/parse_viral_pred.py
@@ -186,7 +186,91 @@ def parse_virus_sorter(sorter_files):
     return high_confidence, low_confidence, prophages
 
 
-def merge_annotations(pprmeta, finder, sorter, assembly):
+def parse_virus_sorter2(sorter_files, vs_cutoff):
+    """Extract high, low and prophages confidence Records from virus sorter results.
+    High confidence are contigs with confidence score >= confidence cutoff (0.9 per default).
+    Low confidence are contigs with confidence score < confidence cutoff and > 0.5.
+    Putative prophages are contigs with partial viral sequences. According to the VirSorter2 author
+    partial viral sequences can only occur as prophages, full viral sequences can occur due to prophages or other origins though.
+    
+    VirSorter2 NOTE
+    Note that suffix ||full, ||lt2gene and ||{i}_partial ({i} can be numbers starting from 0 to max number 
+    of viral fragments found in that contig) have been added to original sequence names to differentiate 
+    sub-sequences in case of multiple viral subsequences found in one contig. 
+    Partial sequences can be treated as proviruses since they are extracted from longer host sequences. 
+    Full sequences, however, can be proviruses or free virus since it can be a short fragment sequenced from 
+    a provirus region. Moreover, "full" sequences are just sequences with strong viral signal as a whole 
+    ("nearly full" is more accurate). They might be trimmed due to partial gene overhang at ends, 
+    duplicate segments from circular genomes, and an end trimming step for all identified viral sequences 
+    to find the optimal viral segments (longest within 95% of peak score by default). Again, the "full" sequences 
+    trimmed by the end trimming step should not be interpreted as provirus, since genes that have low impact on score, 
+    such as unknown gene or genes shared by host and virus, could be trimmed. If you prefer the full sequences 
+    (ending with ||full) not to be trimmed and leave it to specialized tools such as checkV, 
+    you can use --keep-original-seq option.
+    """
+
+    boundary_columns = ['seqname', 'trim_orf_index_start', 'trim_orf_index_end', 
+                        'trim_bp_start', 'trim_bp_end', 'trim_pr', 
+                        'partial', 'pr_full', 'hallmark_cnt', 'group', 'shape']
+
+    boundary_dtypes = {'seqname': 'object', 'trim_orf_index_start': 'int64', 'trim_orf_index_end': 'int64',
+                       'trim_bp_start': 'int64', 'trim_bp_end': 'int64', 'trim_pr': 'float64',
+                       'partial': 'int64', 'pr_full': 'float64', 'hallmark_cnt': 'int64', 'group': 'object', 'shape': 'object'}
+
+    high_confidence = dict()
+    low_confidence = dict()
+    prophages = dict()
+
+    final_boundary_file, final_score_file, final_combined_fa_file = "", "", ""
+    for sorter_results_file in sorter_files:
+        if "final-viral-boundary.tsv" in sorter_results_file:
+            final_boundary_file = sorter_results_file
+        elif "final-viral-score.tsv" in sorter_results_file:
+            final_score_file = sorter_results_file
+        elif "final-viral-combined.fa" in sorter_results_file:
+            final_combined_fa_file = sorter_results_file
+        else:
+            print('ERROR: The result files of VirSorter2 are incomplete. The code expects the files final-viral-{boundary,score}.tsv and final-viral-combined.fa.', file=sys.stderr)
+            return high_confidence, low_confidence, prophages
+
+    boundary_df = pd.read_csv(final_boundary_file, sep='\t', index_col='seqname', usecols=boundary_columns, 
+                              dtype=boundary_dtypes)
+    score_df = pd.read_csv(final_score_file, sep='\t')
+    score_df['seqname'] = score_df['seqname'].str.replace(r'\|\|.*', '', regex=True)
+    score_df.set_index('seqname', inplace=True)
+
+    meta = score_df.merge(boundary_df, left_index=True, right_index=True, how='left', 
+                          suffixes=('_score', '_boundary')).dropna(subset=['shape'])
+
+    for record in SeqIO.parse(final_combined_fa_file, "fasta"):
+        clean_name = record.id.split('|')[0]
+        max_score = meta['max_score'].get(clean_name, None)
+        circular = meta['shape'].get(clean_name, None)
+        prange = [meta['trim_bp_start'].get(clean_name, None), meta['trim_bp_end'].get(clean_name, None)]
+        if not circular or not max_score:
+            continue
+        circular = circular.startswith('circular')
+
+        if 'partial' in record.id:
+            # add the prophage position within the contig
+            record.id = clean_name
+            prophages.setdefault(clean_name, []).append(
+                Record(record, "prophage", circular, prange)
+            )
+        record.id = clean_name
+        if float(max_score) >= float(vs_cutoff):
+            high_confidence[record.id] = Record(record, "high_confidence", circular)
+        else:
+            low_confidence[record.id] = Record(record, "low_confidence", circular)
+
+    print(f"Virus Sorter found {len(high_confidence)} high confidence contigs.")
+    print(f"Virus Sorter found {len(low_confidence)} low confidence contigs.")
+    print(f"Virus Sorter found {len(prophages)} putative prophages contigs.")
+
+    return high_confidence, low_confidence, prophages
+
+
+def merge_annotations(pprmeta, finder, sorter, sorter2, assembly, vs_cutoff):
     """Parse VirSorter, VirFinder and PPR-Meta outputs and merge the results.
     High confidence viral contigs:
     -  VirSorter reported as categories 1 and 2
@@ -206,26 +290,46 @@ def merge_annotations(pprmeta, finder, sorter, assembly):
 
     pprmeta_lc = parse_pprmeta(pprmeta)
     finder_lc, finder_lowestc = parse_virus_finder(finder)
-    sorter_hc, sorter_lc, sorter_prophages = parse_virus_sorter(sorter)
+    sorter_hc, sorter_lc, sorter_prophages = parse_virus_sorter(sorter) if sorter is not None else parse_virus_sorter2(
+        sorter2, vs_cutoff)
 
     for seq_record in SeqIO.parse(assembly, "fasta"):
-        # HC
-        if seq_record.id in sorter_hc:
-            hc_predictions_contigs.append(sorter_hc.get(seq_record.id).get_seq_record())
-        # Pro
-        elif seq_record.id in sorter_prophages:
-            # a contig may have several prophages
-            # for prophages write the record as it holds the
-            # sliced fasta
-            for record in sorter_prophages.get(seq_record.id):
-                prophage_predictions_contigs.append(record.get_seq_record())
-        # LC
-        elif seq_record.id in finder_lc:
-            lc_predictions_contigs.append(seq_record)
-        elif seq_record.id in sorter_lc and seq_record.id in finder_lowestc:
-            lc_predictions_contigs.append(sorter_lc.get(seq_record.id).get_seq_record())
-        elif seq_record.id in pprmeta_lc and seq_record.id in finder_lowestc:
-            lc_predictions_contigs.append(seq_record)
+        if sorter:
+            if seq_record.id in sorter_hc:
+                hc_predictions_contigs.append(sorter_hc.get(seq_record.id).get_seq_record())
+                # Pro
+            elif seq_record.id in sorter_prophages:
+                # a contig may have several prophages
+                # for prophages write the record as it holds the
+                # sliced fasta
+                for record in sorter_prophages.get(seq_record.id):
+                    prophage_predictions_contigs.append(record.get_seq_record())
+                # LC
+            elif seq_record.id in finder_lc:
+                lc_predictions_contigs.append(seq_record)
+            elif seq_record.id in sorter_lc and seq_record.id in finder_lowestc:
+                lc_predictions_contigs.append(sorter_lc.get(seq_record.id).get_seq_record())
+            elif seq_record.id in pprmeta_lc and seq_record.id in finder_lowestc:
+                lc_predictions_contigs.append(seq_record)
+        else:
+            # Pro
+            # TODO: check this decision because for VirSorter2 sequence can be in 2 categories
+            if seq_record.id in sorter_prophages:
+                # a contig may have several prophages
+                # for prophages write the record as it holds the
+                # sliced fasta
+                for record in sorter_prophages.get(seq_record.id):
+                    prophage_predictions_contigs.append(record.get_seq_record())
+            # HC
+            if seq_record.id in sorter_hc:
+                hc_predictions_contigs.append(sorter_hc.get(seq_record.id).get_seq_record())
+            # LC
+            elif seq_record.id in finder_lc:
+                lc_predictions_contigs.append(seq_record)
+            elif seq_record.id in sorter_lc and seq_record.id in finder_lowestc:
+                lc_predictions_contigs.append(sorter_lc.get(seq_record.id).get_seq_record())
+            elif seq_record.id in pprmeta_lc and seq_record.id in finder_lowestc:
+                lc_predictions_contigs.append(seq_record)
 
     return (
         hc_predictions_contigs,
@@ -237,7 +341,7 @@ def merge_annotations(pprmeta, finder, sorter, assembly):
     )
 
 
-def main(pprmeta, finder, sorter, assembly, outdir, prefix=False):
+def main(pprmeta, finder, sorter, sorter2, assembly, outdir, vs_cutoff, prefix=False):
     """Parse VirSorter, VirFinder and PPR-Meta outputs and merge the results."""
     (
         hc_contigs,
@@ -246,7 +350,7 @@ def main(pprmeta, finder, sorter, assembly, outdir, prefix=False):
         sorter_hc,
         sorter_lc,
         sorter_prophages,
-    ) = merge_annotations(pprmeta, finder, sorter, assembly)
+    ) = merge_annotations(pprmeta, finder, sorter, sorter2, assembly, vs_cutoff)
 
     at_least_one = False
     name_prefix = ""
@@ -262,18 +366,26 @@ def main(pprmeta, finder, sorter, assembly, outdir, prefix=False):
             "fasta",
         )
         at_least_one = True
+    else:
+        print('No high confidence viral contigs found, skipping output file.')
+
     if len(lc_contigs):
         SeqIO.write(
             lc_contigs,
             outdir / Path(name_prefix + "low_confidence_viral_contigs.fna"),
             "fasta",
         )
         at_least_one = True
+    else:
+        print('No low confidence viral contigs found, skipping output file.')
+
     if len(prophage_contigs):
         SeqIO.write(
             prophage_contigs, outdir / Path(name_prefix + "prophages.fna"), "fasta"
         )
         at_least_one = True
+    else:
+        print('No prophage contigs found, skipping output file.')
 
     # VirSorter provides some metadata on each annotation
     # - is circular
@@ -335,13 +447,29 @@ def main(pprmeta, finder, sorter, assembly, outdir, prefix=False):
         help="VirSorter .fasta files (i.e. VIRSorter_cat-1.fasta)" " VirSorter output",
         required=False,
     )
+    parser.add_argument(
+        "-z",
+        "--vs2files",
+        dest="sorter2",
+        nargs="+",
+        help='VirSorter2 .tsv files (i.e. final-viral-{boundary,score}.tsv, final-viral-combined.fa)' " VirSorter2 output",
+        required=False,
+    )
     parser.add_argument(
         "-p",
         "--pmout",
         dest="pprmeta",
         help="Absolute or relative path to PPR-Meta output file" " PPR-Meta output",
         required=False,
     )
+    parser.add_argument(
+        "-y",
+        "--vs_cutoff",
+        dest="vs_cutoff",
+        help="Cutoff to categorize sequences identified by VirSorter2 to high or low confidence (default: 0.9).",
+        default=0.9,
+        type=float,
+    )
     parser.add_argument(
         "-r",
         "--prefix",
@@ -363,7 +491,9 @@ def main(pprmeta, finder, sorter, assembly, outdir, prefix=False):
         args.pprmeta,
         args.finder,
         args.sorter,
+        args.sorter2,
         args.assembly,
         args.outdir,
+        args.vs_cutoff,
         prefix=args.prefix,
     )
diff --git a/bin/write_viral_gff.py b/bin/write_viral_gff.py
@@ -52,6 +52,8 @@ def aggregate_annotations(virify_annotation_files):
     cds_annotations = {}
 
     for virify_summary in virify_annotation_files:
+        if 'taxonomy' in virify_summary:
+            continue
         with open(virify_summary, "r") as table_handle:
             csv_reader = csv.DictReader(table_handle, delimiter="\t")
             for row in csv_reader:

diff --git a/configs/modules.config b/configs/modules.config
@@ -783,6 +783,32 @@ process {
         ]
     }
 
+    withName: VIRSORTER2 {
+        ext.args = " --use-conda-off "
+        // TODO remove that arg for conda env
+        //if (!params.conda.enabled) {
+        //    ext.args = "  "
+        //}
+
+        publishDir = [
+            [
+                path: "${params.output}",
+                saveAs: {
+                    filename -> {
+                        if ( filename.equals('versions.yml') ) {
+                            return null;
+                        }
+                        def output_file = new File(filename);
+                        return "${meta.id}/${params.virusdir}/${output_file.name}";
+                    }
+                },
+                mode: params.publish_dir_mode,
+                failOnError: false,
+                pattern: "virsorter2/*.{tsv,fasta}"
+            ]
+        ]
+    }
+
     withName: WRITE_GFF {
         publishDir = [
             [

diff --git a/modules/local/get_db/virsorter2.nf b/modules/local/get_db/virsorter2.nf
@@ -0,0 +1,24 @@
+process virsorter2GetDB {
+  label 'virsorter2'    
+  if (params.cloudProcess) { 
+    publishDir "${params.databases}/virsorter2/", mode: 'copy', pattern: "virsorter2-data" 
+  }
+  else { 
+    storeDir "${params.databases}/virsorter2/" 
+  }  
+
+  output:
+    path("virsorter2-data/db", type: 'dir')
+
+  script:
+    """
+    # just in case there is a failed attemp before; 
+    # remove the whole diretory specified by -d
+    rm -rf virsorter2-data
+    
+    # download virsorter2 database and extract
+    wget https://osf.io/v46sc/download
+    mkdir virsorter2-data
+    tar -xzf download -C virsorter2-data
+    """
+}
diff --git a/modules/local/help.nf b/modules/local/help.nf
@@ -29,6 +29,7 @@ def helpMSG() {
 
     ${c_yellow}Databases (automatically downloaded by default):${c_reset}
     --virsorter         a virsorter database provided as 'virsorter/virsorter-data' [default: $params.virsorter]
+    --virsorter2        a virsorter2 database provided as 'virsorter2/virsorter2-data' [default: $params.virsorter2]
     --virfinder         a virfinder model [default: $params.virfinder]
     --viphog            the ViPhOG database, hmmpress'ed [default: $params.viphog]
     --rvdb              the RVDB, hmmpress'ed [default: $params.rvdb]
@@ -47,6 +48,7 @@ def helpMSG() {
         --rvdb /path/to/your/rvdb
 
     ${c_yellow}Parameters:${c_reset}
+    --use_virsorter_v1  Use VirSorter_v1 instead of default VirSorter2
     --evalue            E-value used to filter ViPhOG hits in the ratio_evalue step [default: $params.evalue]
     --prop              Minimum proportion of proteins with ViPhOG annotations to provide a taxonomic assignment [default: $params.prop]
     --taxthres          Minimum proportion of annotated genes required for taxonomic assignment [default: $params.taxthres]

diff --git a/modules/local/parse/main.nf b/modules/local/parse/main.nf
@@ -13,6 +13,12 @@ process PARSE {
       tuple val(meta), path("*.fna"), path('virsorter_metadata.tsv'), path("${meta.id}_virus_predictions.log"), optional: true
 
     script:
+    if (!params.use_virsorter_v1)
+    """
+    touch virsorter_metadata.tsv
+    parse_viral_pred.py -a ${fasta} -f ${virfinder} -p ${pprmeta} -z ${virsorter} &> ${meta.id}_virus_predictions.log
+    """
+    else
     """
     touch virsorter_metadata.tsv
     parse_viral_pred.py -a ${fasta} -f ${virfinder} -p ${pprmeta} -s ${virsorter}/Predicted_viral_sequences/*.fasta &> ${meta.id}_virus_predictions.log
@@ -22,7 +28,7 @@ process PARSE {
 /*
   usage: parse_viral_pred.py [-h] -a ASSEMB -f FINDER -s SORTER [-o OUTDIR]
 
-  description: script generates three output_files: High_confidence.fasta, Low_confidence.fasta, Prophages.fasta
+  description: script generates three output_files: High_confidence.fna, Low_confidence.fna, Prophages.fna
 
   optional arguments:
   -h, --help            show this help message and exit
@@ -38,4 +44,6 @@ process PARSE {
   -o OUTDIR, --outdir OUTDIR
                         Absolute or relative path of directory where output
                         viral prediction files should be stored (default: cwd)
+                        
+  NOTE: only outputs .fna files if some low confidence or high confidence viral seqs were identified, otherwise outputs nothing.
 */
diff --git a/modules/local/utils.nf b/modules/local/utils.nf
@@ -0,0 +1,19 @@
+process CONCATENATE_VIRSORTER2_FILES {
+    tag "${meta.id}"
+    label "process_medium"
+
+    input:
+    tuple val(meta), path(input_files, stageAs: "inputs/*")
+    val output_name
+
+    output:
+    tuple val(meta), path("${output_name}"), emit: concatenated_result
+
+    script:
+    """
+    export first_file=\$(ls inputs | head -n 1);
+    grep 'seqname' inputs/\${first_file} > header.tsv || true
+    cat inputs/* | grep -v 'seqname' > without_header.${output_name}
+    cat header.tsv without_header.${output_name} > ${output_name}
+    """
+}