Skip to content

Commit

Permalink
Merge pull request #135 from EBI-Metagenomics/fix/virsorter2
Browse files Browse the repository at this point in the history
Fix/virsorter2
  • Loading branch information
KateSakharova authored Nov 8, 2024
2 parents 0c20229 + 43e118b commit ac06be8
Show file tree
Hide file tree
Showing 24 changed files with 5,731 additions and 51 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/unit_tests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -32,5 +32,5 @@ jobs:
- name: Unit tests
run: |
# TODO, improve the pythonpath handling
export PYTHONPATH=$PYTHONPATH:bin
export PYTHONPATH="$PYTHONPATH:bin"
python -m unittest discover tests
172 changes: 151 additions & 21 deletions bin/parse_viral_pred.py
Original file line number Diff line number Diff line change
Expand Up @@ -186,7 +186,91 @@ def parse_virus_sorter(sorter_files):
return high_confidence, low_confidence, prophages


def merge_annotations(pprmeta, finder, sorter, assembly):
def parse_virus_sorter2(sorter_files, vs_cutoff):
"""Extract high, low and prophages confidence Records from virus sorter results.
High confidence are contigs with confidence score >= confidence cutoff (0.9 per default).
Low confidence are contigs with confidence score < confidence cutoff and > 0.5.
Putative prophages are contigs with partial viral sequences. According to the VirSorter2 author
partial viral sequences can only occur as prophages, full viral sequences can occur due to prophages or other origins though.
VirSorter2 NOTE
Note that suffix ||full, ||lt2gene and ||{i}_partial ({i} can be numbers starting from 0 to max number
of viral fragments found in that contig) have been added to original sequence names to differentiate
sub-sequences in case of multiple viral subsequences found in one contig.
Partial sequences can be treated as proviruses since they are extracted from longer host sequences.
Full sequences, however, can be proviruses or free virus since it can be a short fragment sequenced from
a provirus region. Moreover, "full" sequences are just sequences with strong viral signal as a whole
("nearly full" is more accurate). They might be trimmed due to partial gene overhang at ends,
duplicate segments from circular genomes, and an end trimming step for all identified viral sequences
to find the optimal viral segments (longest within 95% of peak score by default). Again, the "full" sequences
trimmed by the end trimming step should not be interpreted as provirus, since genes that have low impact on score,
such as unknown gene or genes shared by host and virus, could be trimmed. If you prefer the full sequences
(ending with ||full) not to be trimmed and leave it to specialized tools such as checkV,
you can use --keep-original-seq option.
"""

boundary_columns = ['seqname', 'trim_orf_index_start', 'trim_orf_index_end',
'trim_bp_start', 'trim_bp_end', 'trim_pr',
'partial', 'pr_full', 'hallmark_cnt', 'group', 'shape']

boundary_dtypes = {'seqname': 'object', 'trim_orf_index_start': 'int64', 'trim_orf_index_end': 'int64',
'trim_bp_start': 'int64', 'trim_bp_end': 'int64', 'trim_pr': 'float64',
'partial': 'int64', 'pr_full': 'float64', 'hallmark_cnt': 'int64', 'group': 'object', 'shape': 'object'}

high_confidence = dict()
low_confidence = dict()
prophages = dict()

final_boundary_file, final_score_file, final_combined_fa_file = "", "", ""
for sorter_results_file in sorter_files:
if "final-viral-boundary.tsv" in sorter_results_file:
final_boundary_file = sorter_results_file
elif "final-viral-score.tsv" in sorter_results_file:
final_score_file = sorter_results_file
elif "final-viral-combined.fa" in sorter_results_file:
final_combined_fa_file = sorter_results_file
else:
print('ERROR: The result files of VirSorter2 are incomplete. The code expects the files final-viral-{boundary,score}.tsv and final-viral-combined.fa.', file=sys.stderr)
return high_confidence, low_confidence, prophages

boundary_df = pd.read_csv(final_boundary_file, sep='\t', index_col='seqname', usecols=boundary_columns,
dtype=boundary_dtypes)
score_df = pd.read_csv(final_score_file, sep='\t')
score_df['seqname'] = score_df['seqname'].str.replace(r'\|\|.*', '', regex=True)
score_df.set_index('seqname', inplace=True)

meta = score_df.merge(boundary_df, left_index=True, right_index=True, how='left',
suffixes=('_score', '_boundary')).dropna(subset=['shape'])

for record in SeqIO.parse(final_combined_fa_file, "fasta"):
clean_name = record.id.split('|')[0]
max_score = meta['max_score'].get(clean_name, None)
circular = meta['shape'].get(clean_name, None)
prange = [meta['trim_bp_start'].get(clean_name, None), meta['trim_bp_end'].get(clean_name, None)]
if not circular or not max_score:
continue
circular = circular.startswith('circular')

if 'partial' in record.id:
# add the prophage position within the contig
record.id = clean_name
prophages.setdefault(clean_name, []).append(
Record(record, "prophage", circular, prange)
)
record.id = clean_name
if float(max_score) >= float(vs_cutoff):
high_confidence[record.id] = Record(record, "high_confidence", circular)
else:
low_confidence[record.id] = Record(record, "low_confidence", circular)

print(f"Virus Sorter found {len(high_confidence)} high confidence contigs.")
print(f"Virus Sorter found {len(low_confidence)} low confidence contigs.")
print(f"Virus Sorter found {len(prophages)} putative prophages contigs.")

return high_confidence, low_confidence, prophages


def merge_annotations(pprmeta, finder, sorter, sorter2, assembly, vs_cutoff):
"""Parse VirSorter, VirFinder and PPR-Meta outputs and merge the results.
High confidence viral contigs:
- VirSorter reported as categories 1 and 2
Expand All @@ -206,26 +290,46 @@ def merge_annotations(pprmeta, finder, sorter, assembly):

pprmeta_lc = parse_pprmeta(pprmeta)
finder_lc, finder_lowestc = parse_virus_finder(finder)
sorter_hc, sorter_lc, sorter_prophages = parse_virus_sorter(sorter)
sorter_hc, sorter_lc, sorter_prophages = parse_virus_sorter(sorter) if sorter is not None else parse_virus_sorter2(
sorter2, vs_cutoff)

for seq_record in SeqIO.parse(assembly, "fasta"):
# HC
if seq_record.id in sorter_hc:
hc_predictions_contigs.append(sorter_hc.get(seq_record.id).get_seq_record())
# Pro
elif seq_record.id in sorter_prophages:
# a contig may have several prophages
# for prophages write the record as it holds the
# sliced fasta
for record in sorter_prophages.get(seq_record.id):
prophage_predictions_contigs.append(record.get_seq_record())
# LC
elif seq_record.id in finder_lc:
lc_predictions_contigs.append(seq_record)
elif seq_record.id in sorter_lc and seq_record.id in finder_lowestc:
lc_predictions_contigs.append(sorter_lc.get(seq_record.id).get_seq_record())
elif seq_record.id in pprmeta_lc and seq_record.id in finder_lowestc:
lc_predictions_contigs.append(seq_record)
if sorter:
if seq_record.id in sorter_hc:
hc_predictions_contigs.append(sorter_hc.get(seq_record.id).get_seq_record())
# Pro
elif seq_record.id in sorter_prophages:
# a contig may have several prophages
# for prophages write the record as it holds the
# sliced fasta
for record in sorter_prophages.get(seq_record.id):
prophage_predictions_contigs.append(record.get_seq_record())
# LC
elif seq_record.id in finder_lc:
lc_predictions_contigs.append(seq_record)
elif seq_record.id in sorter_lc and seq_record.id in finder_lowestc:
lc_predictions_contigs.append(sorter_lc.get(seq_record.id).get_seq_record())
elif seq_record.id in pprmeta_lc and seq_record.id in finder_lowestc:
lc_predictions_contigs.append(seq_record)
else:
# Pro
# TODO: check this decision because for VirSorter2 sequence can be in 2 categories
if seq_record.id in sorter_prophages:
# a contig may have several prophages
# for prophages write the record as it holds the
# sliced fasta
for record in sorter_prophages.get(seq_record.id):
prophage_predictions_contigs.append(record.get_seq_record())
# HC
if seq_record.id in sorter_hc:
hc_predictions_contigs.append(sorter_hc.get(seq_record.id).get_seq_record())
# LC
elif seq_record.id in finder_lc:
lc_predictions_contigs.append(seq_record)
elif seq_record.id in sorter_lc and seq_record.id in finder_lowestc:
lc_predictions_contigs.append(sorter_lc.get(seq_record.id).get_seq_record())
elif seq_record.id in pprmeta_lc and seq_record.id in finder_lowestc:
lc_predictions_contigs.append(seq_record)

return (
hc_predictions_contigs,
Expand All @@ -237,7 +341,7 @@ def merge_annotations(pprmeta, finder, sorter, assembly):
)


def main(pprmeta, finder, sorter, assembly, outdir, prefix=False):
def main(pprmeta, finder, sorter, sorter2, assembly, outdir, vs_cutoff, prefix=False):
"""Parse VirSorter, VirFinder and PPR-Meta outputs and merge the results."""
(
hc_contigs,
Expand All @@ -246,7 +350,7 @@ def main(pprmeta, finder, sorter, assembly, outdir, prefix=False):
sorter_hc,
sorter_lc,
sorter_prophages,
) = merge_annotations(pprmeta, finder, sorter, assembly)
) = merge_annotations(pprmeta, finder, sorter, sorter2, assembly, vs_cutoff)

at_least_one = False
name_prefix = ""
Expand All @@ -262,18 +366,26 @@ def main(pprmeta, finder, sorter, assembly, outdir, prefix=False):
"fasta",
)
at_least_one = True
else:
print('No high confidence viral contigs found, skipping output file.')

if len(lc_contigs):
SeqIO.write(
lc_contigs,
outdir / Path(name_prefix + "low_confidence_viral_contigs.fna"),
"fasta",
)
at_least_one = True
else:
print('No low confidence viral contigs found, skipping output file.')

if len(prophage_contigs):
SeqIO.write(
prophage_contigs, outdir / Path(name_prefix + "prophages.fna"), "fasta"
)
at_least_one = True
else:
print('No prophage contigs found, skipping output file.')

# VirSorter provides some metadata on each annotation
# - is circular
Expand Down Expand Up @@ -335,13 +447,29 @@ def main(pprmeta, finder, sorter, assembly, outdir, prefix=False):
help="VirSorter .fasta files (i.e. VIRSorter_cat-1.fasta)" " VirSorter output",
required=False,
)
parser.add_argument(
"-z",
"--vs2files",
dest="sorter2",
nargs="+",
help='VirSorter2 .tsv files (i.e. final-viral-{boundary,score}.tsv, final-viral-combined.fa)' " VirSorter2 output",
required=False,
)
parser.add_argument(
"-p",
"--pmout",
dest="pprmeta",
help="Absolute or relative path to PPR-Meta output file" " PPR-Meta output",
required=False,
)
parser.add_argument(
"-y",
"--vs_cutoff",
dest="vs_cutoff",
help="Cutoff to categorize sequences identified by VirSorter2 to high or low confidence (default: 0.9).",
default=0.9,
type=float,
)
parser.add_argument(
"-r",
"--prefix",
Expand All @@ -363,7 +491,9 @@ def main(pprmeta, finder, sorter, assembly, outdir, prefix=False):
args.pprmeta,
args.finder,
args.sorter,
args.sorter2,
args.assembly,
args.outdir,
args.vs_cutoff,
prefix=args.prefix,
)
2 changes: 2 additions & 0 deletions bin/write_viral_gff.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,8 @@ def aggregate_annotations(virify_annotation_files):
cds_annotations = {}

for virify_summary in virify_annotation_files:
if 'taxonomy' in virify_summary:
continue
with open(virify_summary, "r") as table_handle:
csv_reader = csv.DictReader(table_handle, delimiter="\t")
for row in csv_reader:
Expand Down
26 changes: 26 additions & 0 deletions configs/modules.config
Original file line number Diff line number Diff line change
Expand Up @@ -783,6 +783,32 @@ process {
]
}

withName: VIRSORTER2 {
ext.args = " --use-conda-off "
// TODO remove that arg for conda env
//if (!params.conda.enabled) {
// ext.args = " "
//}

publishDir = [
[
path: "${params.output}",
saveAs: {
filename -> {
if ( filename.equals('versions.yml') ) {
return null;
}
def output_file = new File(filename);
return "${meta.id}/${params.virusdir}/${output_file.name}";
}
},
mode: params.publish_dir_mode,
failOnError: false,
pattern: "virsorter2/*.{tsv,fasta}"
]
]
}

withName: WRITE_GFF {
publishDir = [
[
Expand Down
24 changes: 24 additions & 0 deletions modules/local/get_db/virsorter2.nf
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
process virsorter2GetDB {
label 'virsorter2'
if (params.cloudProcess) {
publishDir "${params.databases}/virsorter2/", mode: 'copy', pattern: "virsorter2-data"
}
else {
storeDir "${params.databases}/virsorter2/"
}

output:
path("virsorter2-data/db", type: 'dir')

script:
"""
# just in case there is a failed attemp before;
# remove the whole diretory specified by -d
rm -rf virsorter2-data
# download virsorter2 database and extract
wget https://osf.io/v46sc/download
mkdir virsorter2-data
tar -xzf download -C virsorter2-data
"""
}
2 changes: 2 additions & 0 deletions modules/local/help.nf
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@ def helpMSG() {
${c_yellow}Databases (automatically downloaded by default):${c_reset}
--virsorter a virsorter database provided as 'virsorter/virsorter-data' [default: $params.virsorter]
--virsorter2 a virsorter2 database provided as 'virsorter2/virsorter2-data' [default: $params.virsorter2]
--virfinder a virfinder model [default: $params.virfinder]
--viphog the ViPhOG database, hmmpress'ed [default: $params.viphog]
--rvdb the RVDB, hmmpress'ed [default: $params.rvdb]
Expand All @@ -47,6 +48,7 @@ def helpMSG() {
--rvdb /path/to/your/rvdb
${c_yellow}Parameters:${c_reset}
--use_virsorter_v1 Use VirSorter_v1 instead of default VirSorter2
--evalue E-value used to filter ViPhOG hits in the ratio_evalue step [default: $params.evalue]
--prop Minimum proportion of proteins with ViPhOG annotations to provide a taxonomic assignment [default: $params.prop]
--taxthres Minimum proportion of annotated genes required for taxonomic assignment [default: $params.taxthres]
Expand Down
10 changes: 9 additions & 1 deletion modules/local/parse/main.nf
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,12 @@ process PARSE {
tuple val(meta), path("*.fna"), path('virsorter_metadata.tsv'), path("${meta.id}_virus_predictions.log"), optional: true

script:
if (!params.use_virsorter_v1)
"""
touch virsorter_metadata.tsv
parse_viral_pred.py -a ${fasta} -f ${virfinder} -p ${pprmeta} -z ${virsorter} &> ${meta.id}_virus_predictions.log
"""
else
"""
touch virsorter_metadata.tsv
parse_viral_pred.py -a ${fasta} -f ${virfinder} -p ${pprmeta} -s ${virsorter}/Predicted_viral_sequences/*.fasta &> ${meta.id}_virus_predictions.log
Expand All @@ -22,7 +28,7 @@ process PARSE {
/*
usage: parse_viral_pred.py [-h] -a ASSEMB -f FINDER -s SORTER [-o OUTDIR]
description: script generates three output_files: High_confidence.fasta, Low_confidence.fasta, Prophages.fasta
description: script generates three output_files: High_confidence.fna, Low_confidence.fna, Prophages.fna
optional arguments:
-h, --help show this help message and exit
Expand All @@ -38,4 +44,6 @@ process PARSE {
-o OUTDIR, --outdir OUTDIR
Absolute or relative path of directory where output
viral prediction files should be stored (default: cwd)
NOTE: only outputs .fna files if some low confidence or high confidence viral seqs were identified, otherwise outputs nothing.
*/
19 changes: 19 additions & 0 deletions modules/local/utils.nf
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
process CONCATENATE_VIRSORTER2_FILES {
tag "${meta.id}"
label "process_medium"

input:
tuple val(meta), path(input_files, stageAs: "inputs/*")
val output_name

output:
tuple val(meta), path("${output_name}"), emit: concatenated_result

script:
"""
export first_file=\$(ls inputs | head -n 1);
grep 'seqname' inputs/\${first_file} > header.tsv || true
cat inputs/* | grep -v 'seqname' > without_header.${output_name}
cat header.tsv without_header.${output_name} > ${output_name}
"""
}
Loading

0 comments on commit ac06be8

Please sign in to comment.